diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs b/qdp/qdp-core/src/gpu/encodings/amplitude.rs index 1be318cfe3..a1928bc352 100644 --- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs +++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs @@ -245,21 +245,9 @@ impl QuantumEncoder for AmplitudeEncoder { buffer }; - // Validate norms on host to catch zero or NaN samples early - { - crate::profile_scope!("GPU::NormValidation"); - let host_inv_norms = device - .dtoh_sync_copy(&inv_norms_gpu) - .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?; - - if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) { - return Err(MahoutError::InvalidInput( - "One or more samples have zero or invalid norm".to_string(), - )); - } - } - - // Launch batch kernel + // Launch batch encode kernel — takes GPU norm buffer directly, no D2H needed yet. + // We defer the norm validation D2H copy until AFTER the encode kernel + sync so that + // the norm kernel → encode kernel sequence runs without an intermediate GPU-CPU roundtrip. { crate::profile_scope!("GPU::BatchKernelLaunch"); let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| { @@ -288,7 +276,7 @@ impl QuantumEncoder for AmplitudeEncoder { } } - // Synchronize + // Synchronize — all GPU work (norm + encode) complete after this point. { crate::profile_scope!("GPU::Synchronize"); device @@ -296,6 +284,22 @@ impl QuantumEncoder for AmplitudeEncoder { .map_err(|e| MahoutError::Cuda(format!("Sync failed: {:?}", e)))?; } + // Validate norms on host AFTER sync: D2H copy no longer blocks the encode kernel. + // This preserves error detection for zero/NaN samples without adding a mid-pipeline + // GPU-CPU roundtrip between the norm and encode kernels. 
+ { + crate::profile_scope!("GPU::NormValidation"); + let host_inv_norms = device + .dtoh_sync_copy(&inv_norms_gpu) + .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?; + + if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) { + return Err(MahoutError::InvalidInput( + "One or more samples have zero or invalid norm".to_string(), + )); + } + } + Ok(batch_state_vector) } @@ -412,17 +416,8 @@ impl QuantumEncoder for AmplitudeEncoder { } buffer }; - { - crate::profile_scope!("GPU::NormValidation"); - let host_inv_norms = device - .dtoh_sync_copy(&inv_norms_gpu) - .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?; - if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) { - return Err(MahoutError::InvalidInput( - "One or more samples have zero or invalid norm".to_string(), - )); - } - } + // Launch encode kernel before D2H norm validation: GPU norm buffer is passed directly, + // so the encode kernel can run immediately after the norm kernel without a CPU roundtrip. { crate::profile_scope!("GPU::BatchKernelLaunch"); use cudarc::driver::DevicePtr; @@ -450,10 +445,22 @@ impl QuantumEncoder for AmplitudeEncoder { ))); } } + // Synchronize first; then validate norms on host (D2H after all GPU work is done). 
{ crate::profile_scope!("GPU::Synchronize"); sync_cuda_stream(stream, "CUDA stream synchronize failed")?; } + { + crate::profile_scope!("GPU::NormValidation"); + let host_inv_norms = device + .dtoh_sync_copy(&inv_norms_gpu) + .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?; + if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) { + return Err(MahoutError::InvalidInput( + "One or more samples have zero or invalid norm".to_string(), + )); + } + } Ok(batch_state_vector) } diff --git a/qdp/qdp-core/src/pipeline_runner.rs b/qdp/qdp-core/src/pipeline_runner.rs index 9a41ee4bc1..d3ce2a7a13 100644 --- a/qdp/qdp-core/src/pipeline_runner.rs +++ b/qdp/qdp-core/src/pipeline_runner.rs @@ -259,6 +259,47 @@ impl PipelineIterator { }) } + /// Create a pipeline iterator from an in-memory array (e.g. from Python numpy). + /// Data is owned by the iterator; the full encode loop runs in Rust (take_batch + encode_batch). + pub fn new_from_array( + engine: QdpEngine, + data: Vec, + num_samples: usize, + sample_size: usize, + config: PipelineConfig, + batch_limit: usize, + ) -> Result { + let vector_len = vector_len(config.num_qubits, &config.encoding_method); + if sample_size != vector_len { + return Err(MahoutError::InvalidInput(format!( + "Array sample_size {} does not match vector_len {} for num_qubits={}, encoding={}", + sample_size, vector_len, config.num_qubits, config.encoding_method + ))); + } + if data.len() != num_samples * sample_size { + return Err(MahoutError::InvalidInput(format!( + "Array length {} is not num_samples ({}) * sample_size ({})", + data.len(), + num_samples, + sample_size + ))); + } + let source = DataSource::InMemory { + data, + cursor: 0, + num_samples, + sample_size, + batches_yielded: 0, + batch_limit, + }; + Ok(Self { + engine, + config, + source, + vector_len, + }) + } + /// Create a pipeline iterator from a Parquet file using streaming read (Phase 2b). 
/// Only `.parquet` is supported; reduces memory for large files by reading in chunks. /// Validates sample_size == vector_len after the first chunk. @@ -428,7 +469,61 @@ impl PipelineIterator { } /// Returns the next batch as a DLPack pointer; `Ok(None)` when exhausted. + /// For InMemory source, passes a slice reference to encode_batch (no per-batch copy). pub fn next_batch(&mut self) -> Result> { + // InMemory: update cursor, then encode from &data[start..end] to avoid to_vec(). + let in_memory_range: Option<(usize, usize, usize, usize)> = match &mut self.source { + DataSource::InMemory { + data, + cursor, + sample_size, + batches_yielded, + batch_limit, + .. + } => { + if *batches_yielded >= *batch_limit { + None + } else { + let remaining = (data.len() - *cursor) / *sample_size; + if remaining == 0 { + None + } else { + let batch_n = remaining.min(self.config.batch_size); + let start = *cursor; + let end = start + batch_n * *sample_size; + *cursor = end; + *batches_yielded += 1; + Some(( + start, + batch_n, + *sample_size, + self.config.num_qubits as usize, + )) + } + } + } + _ => None, + }; + + if let Some((start, batch_n, sample_size, num_qubits)) = in_memory_range { + let slice = match &self.source { + DataSource::InMemory { data, .. } => { + let len = batch_n * sample_size; + &data[start..start + len] + } + _ => unreachable!(), + }; + let ptr = self.engine.encode_batch( + slice, + batch_n, + sample_size, + num_qubits, + &self.config.encoding_method, + )?; + return Ok(Some(ptr)); + } + + // Synthetic / Streaming: take_batch_from_source (may copy) then encode. let Some((batch_data, batch_n, sample_size, num_qubits)) = self.take_batch_from_source()? 
else { return Ok(None); diff --git a/qdp/qdp-python/benchmark/encoding_benchmarks/README.md b/qdp/qdp-python/benchmark/encoding_benchmarks/README.md index 97c70ab369..3cb4dc64d8 100644 --- a/qdp/qdp-python/benchmark/encoding_benchmarks/README.md +++ b/qdp/qdp-python/benchmark/encoding_benchmarks/README.md @@ -75,3 +75,25 @@ To see the full list of options and defaults, append `--help`: uv run python benchmark/encoding_benchmarks/pennylane_baseline/iris_amplitude.py --help uv run python benchmark/encoding_benchmarks/qdp_pipeline/iris_amplitude.py --help ``` + +## Credit Card Fraud amplitude baseline (PennyLane) + +Minimal, reproducible steps (run from `qdp/qdp-python`): + +1. **Download dataset (once)** — Kaggle `creditcard.csv` mirror: + + ```bash + mkdir -p benchmark/encoding_benchmarks/pennylane_baseline/data + curl -L -o benchmark/encoding_benchmarks/pennylane_baseline/data/creditcard.csv \ + https://raw.githubusercontent.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/master/creditcard.csv + ``` + +2. **Run the PennyLane baseline** — StandardScaler → PCA(16) → L2 norm → 4‑qubit amplitude VQC: + + ```bash + uv run python benchmark/encoding_benchmarks/pennylane_baseline/creditcardfraud_amplitude.py \ + --data-file benchmark/encoding_benchmarks/pennylane_baseline/data/creditcard.csv \ + --max-samples 300000 --iters 200 --batch-size 512 --trials 1 + ``` + +This prints compile time, train time / throughput, and task metrics (AUPRC, F1, precision, recall) on the test set. 
diff --git a/qdp/qdp-python/benchmark/encoding_benchmarks/pennylane_baseline/creditcardfraud_amplitude.py b/qdp/qdp-python/benchmark/encoding_benchmarks/pennylane_baseline/creditcardfraud_amplitude.py new file mode 100644 index 0000000000..34e9148fd3 --- /dev/null +++ b/qdp/qdp-python/benchmark/encoding_benchmarks/pennylane_baseline/creditcardfraud_amplitude.py @@ -0,0 +1,426 @@ +#!/usr/bin/env python3 +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +PennyLane baseline: Credit Card Fraud (binary, highly imbalanced), amplitude encoding. + +Best practices (2025–2026, aligned with ENCODING_BENCHMARK_PLAN.md §2.2): +- Data: StandardScaler + PCA (here 16–30 components) → padding to 2**num_qubits → L2-normalized vector. +- Splits: Stratified train/validation/test; do not use accuracy as primary metric. +- Imbalance: Class-weighted loss (minority class up-weighted); optional oversampling. +- Task metrics: AUPRC (precision–recall AUC), F1-score, precision, recall on test set. +- System metrics: Compile time (first forward), train time, throughput (samples/sec). + +Data source: + CSV with columns V1..V28, Amount, Class (0=legit, 1=fraud). 
Example: Kaggle + "Credit Card Fraud Detection" (https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud). + Pass path via --data-file. If no file, a small synthetic imbalanced dataset is used for smoke test. + +Training always runs on GPU via lightning.gpu for fair comparison with QDP pipeline. +""" + +from __future__ import annotations + +import argparse +import time +from pathlib import Path +from typing import Any + +import numpy as np +import torch + +try: + import pennylane as qml +except ImportError as e: + raise SystemExit( + "PennyLane is required. Install with: uv sync --group benchmark" + ) from e + +try: + from sklearn.decomposition import PCA + from sklearn.metrics import ( + average_precision_score, + f1_score, + precision_score, + recall_score, + ) + from sklearn.model_selection import train_test_split + from sklearn.preprocessing import StandardScaler +except ImportError as e: + raise SystemExit( + "scikit-learn is required. Install with: uv sync --group benchmark" + ) from e + + +NUM_QUBITS = 5 +FEATURE_DIM = 2**NUM_QUBITS # amplitude embedding dimension (32 for 5 qubits) + + +def _layer(layer_weights: torch.Tensor, wires: tuple[int, ...]) -> None: + """Single variational layer: Rot on each wire + ring of CNOTs.""" + for i, w in enumerate(wires): + qml.Rot(layer_weights[i, 0], layer_weights[i, 1], layer_weights[i, 2], wires=w) + for i in range(len(wires)): + qml.CNOT(wires=[wires[i], wires[(i + 1) % len(wires)]]) + + +def load_creditcard_csv(path: str) -> tuple[np.ndarray, np.ndarray]: + """ + Load Credit Card Fraud CSV. Expects columns including V1..V28, Amount, Class. + Returns (X_raw shape (n, 30), y shape (n,) with 0/1). 
+ """ + data = np.genfromtxt(path, delimiter=",", skip_header=1, dtype=np.float64) + if data.ndim == 1: + data = data.reshape(1, -1) + # Last column = Class; rest = features (Time, V1..V28, Amount) + X = data[:, :-1] + y = data[:, -1].astype(np.int32) + # If CSV has header row with "Time", we already skipped it + if X.shape[1] >= 30: + X = X[:, -30:] # last 30 cols: V1..V28, Amount (and drop Time if 31) + elif X.shape[1] < 30: + # Pad with zeros to 30 + pad = np.zeros((X.shape[0], 30 - X.shape[1]), dtype=np.float64) + X = np.hstack([X, pad]) + return X, y + + +def make_synthetic_imbalanced( + seed: int, n_total: int = 2000, fraud_ratio: float = 0.02 +) -> tuple[np.ndarray, np.ndarray]: + """Synthetic 30-D imbalanced binary data for smoke test when no CSV is provided.""" + rng = np.random.default_rng(seed) + n_fraud = max(1, int(n_total * fraud_ratio)) + n_legit = n_total - n_fraud + X_legit = rng.standard_normal((n_legit, 30)).astype(np.float64) * 0.5 + X_fraud = rng.standard_normal((n_fraud, 30)).astype(np.float64) * 0.5 + 1.0 + X = np.vstack([X_legit, X_fraud]) + y = np.array([0] * n_legit + [1] * n_fraud, dtype=np.int32) + perm = rng.permutation(n_total) + return X[perm], y[perm] + + +def preprocess( + X: np.ndarray, + y: np.ndarray, + pca_dim: int, + seed: int, + test_size: float = 0.2, + val_size: float = 0.1, +) -> tuple[ + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + Any, + Any, + np.ndarray, +]: + """ + StandardScaler → PCA (to <= pca_dim) → pad to FEATURE_DIM → L2 normalize. + Stratified train/val/test. Returns X_train, y_train, X_val, y_val, X_test, y_test (all numpy), + scaler, pca (fitted), sample_weights_train (for weighted loss). 
+ """ + scaler = StandardScaler() + X_scaled = scaler.fit_transform(X) + + pca = PCA( + n_components=min(pca_dim, X_scaled.shape[1], X_scaled.shape[0] - 1), + random_state=seed, + ) + X_pca = pca.fit_transform(X_scaled) + # Pad PCA features up to FEATURE_DIM for amplitude embedding (remaining entries are zeros). + if X_pca.shape[1] < FEATURE_DIM: + pad = np.zeros((X_pca.shape[0], FEATURE_DIM - X_pca.shape[1]), dtype=np.float64) + X_pca = np.hstack([X_pca, pad]) + + norm = np.linalg.norm(X_pca, axis=1, keepdims=True) + norm[norm < 1e-12] = 1.0 + X_norm = (X_pca / norm).astype(np.float64) + + rng = np.random.RandomState(seed) + idx = rng.permutation(len(y)) + X_norm, y = X_norm[idx], y[idx] + + # Stratified split: first test, then val from train + X_temp, X_test, y_temp, y_test = train_test_split( + X_norm, y, test_size=test_size, stratify=y, random_state=seed + ) + val_ratio = val_size / (1 - test_size) + X_train, X_val, y_train, y_val = train_test_split( + X_temp, y_temp, test_size=val_ratio, stratify=y_temp, random_state=seed + ) + + # Class weights for weighted MSE: n / (2 * n_class) + n0 = max(1, int(np.sum(y_train == 0))) + n1 = max(1, int(np.sum(y_train == 1))) + w0 = len(y_train) / (2 * n0) + w1 = len(y_train) / (2 * n1) + sample_weights = np.where(y_train == 0, w0, w1).astype(np.float64) + + return ( + X_train, + y_train, + X_val, + y_val, + X_test, + y_test, + scaler, + pca, + sample_weights, + ) + + +def run_training( + X_train: np.ndarray, + y_train: np.ndarray, + X_test: np.ndarray, + y_test: np.ndarray, + sample_weights: np.ndarray, + *, + num_layers: int, + iterations: int, + batch_size: int, + lr: float, + seed: int, +) -> dict[str, Any]: + """Train 5-qubit amplitude VQC on GPU with class-weighted loss; report AUPRC, F1, compile/train time.""" + if not torch.cuda.is_available(): + raise RuntimeError("CUDA GPU is required for training. 
No CUDA device found.") + try: + dev = qml.device("lightning.gpu", wires=NUM_QUBITS) + except Exception as e: + raise RuntimeError( + "lightning.gpu is required for GPU training. Install with: " + "pip install pennylane-lightning[gpu]" + ) from e + + device = torch.device("cuda") + dtype = torch.float64 + wires = tuple(range(NUM_QUBITS)) + + @qml.qnode(dev, interface="torch", diff_method="adjoint") + def circuit(weights: torch.Tensor, features: torch.Tensor) -> torch.Tensor: + qml.AmplitudeEmbedding(features, wires=wires, normalize=True) + for w in weights: + _layer(w, wires) + return qml.expval(qml.PauliZ(0)) + + def model( + weights: torch.Tensor, bias: torch.Tensor, x: torch.Tensor + ) -> torch.Tensor: + return circuit(weights, x) + bias + + def cost( + weights: torch.Tensor, + bias: torch.Tensor, + X_batch: torch.Tensor, + Y_batch: torch.Tensor, + w_batch: torch.Tensor, + ) -> torch.Tensor: + # Y in {0,1} -> target in {-1, 1} + target = Y_batch * 2.0 - 1.0 + pred = model(weights, bias, X_batch) + return (w_batch * (target - pred) ** 2).sum() / (w_batch.sum() + 1e-12) + + n_train = len(y_train) + + torch.manual_seed(seed) + weights = torch.nn.Parameter( + 0.01 * torch.randn(num_layers, NUM_QUBITS, 3, device=device, dtype=dtype) + ) + bias = torch.nn.Parameter(torch.tensor(0.0, device=device, dtype=dtype)) + opt = torch.optim.Adam([weights, bias], lr=lr) + + X_train_t = torch.tensor(X_train, dtype=dtype, device=device) + # Float so autograd does not try to differentiate ints + Y_train_t = torch.tensor( + np.asarray(y_train, dtype=np.float64), dtype=dtype, device=device + ) + W_train_t = torch.tensor(sample_weights, dtype=dtype, device=device) + + X_test_t = torch.tensor(X_test, dtype=dtype, device=device) + + # Compile (first forward + cost) + t0 = time.perf_counter() + _ = circuit(weights, X_train_t[0]) + _ = cost(weights, bias, X_train_t[:1], Y_train_t[:1], W_train_t[:1]) + compile_sec = time.perf_counter() - t0 + + # Train + _batch_n = min(batch_size, n_train) 
+ t0 = time.perf_counter() + for _ in range(iterations): + opt.zero_grad() + idx = torch.randint(0, n_train, (_batch_n,), device=device) + Xb = X_train_t[idx] + Yb = Y_train_t[idx] + Wb = W_train_t[idx] + loss = cost(weights, bias, Xb, Yb, Wb) + loss.backward() + opt.step() + train_sec = time.perf_counter() - t0 + + # Test-set predictions and scores (for AUPRC we need continuous scores) + with torch.no_grad(): + pred_scores = model(weights, bias, X_test_t).cpu().numpy().flatten() + pred_binary = (np.sign(pred_scores) > 0).astype(np.int32) + # Map expval in [-1,1] to positive-class score in [0,1] for AUPRC + scores_positive = (pred_scores + 1.0) / 2.0 + + y_test_np = np.asarray(y_test) + auprc = float(average_precision_score(y_test_np, scores_positive)) + f1 = float(f1_score(y_test_np, pred_binary, zero_division=0)) + prec = float(precision_score(y_test_np, pred_binary, zero_division=0)) + rec = float(recall_score(y_test_np, pred_binary, zero_division=0)) + + return { + "compile_time_sec": compile_sec, + "train_time_sec": train_sec, + "samples_per_sec": (iterations * _batch_n) / train_sec + if train_sec > 0 + else 0.0, + "auprc": auprc, + "f1_score": f1, + "precision": prec, + "recall": rec, + "n_train": n_train, + "n_test": len(y_test), + "iterations": iterations, + } + + +def main() -> None: + parser = argparse.ArgumentParser( + description="PennyLane Credit Card Fraud baseline (amplitude, 5 qubits, AUPRC/F1, GPU training)" + ) + parser.add_argument( + "--data-file", + type=str, + default=None, + help="Path to CSV (e.g. Kaggle creditcard.csv). 
If omitted, use synthetic imbalanced data.", + ) + parser.add_argument( + "--max-samples", + type=int, + default=50_000, + help="Max samples to use from CSV (default: 50000); ignored for synthetic.", + ) + parser.add_argument( + "--pca-dim", + type=int, + default=30, + help="PCA components before padding to 2**num_qubits (default: 30, capped by feature dim).", + ) + parser.add_argument("--seed", type=int, default=42, help="Random seed") + parser.add_argument("--iters", type=int, default=5000, help="Optimizer steps") + parser.add_argument("--batch-size", type=int, default=256, help="Batch size") + parser.add_argument("--layers", type=int, default=2, help="Variational layers") + parser.add_argument("--lr", type=float, default=0.01, help="Learning rate") + parser.add_argument( + "--trials", + type=int, + default=1, + help="Number of runs (same data, different seeds); report median AUPRC/F1.", + ) + args = parser.parse_args() + + if args.data_file: + path = Path(args.data_file) + if not path.is_file(): + raise SystemExit(f"Data file not found: {path}") + X, y = load_creditcard_csv(str(path)) + if len(X) > args.max_samples: + rng = np.random.default_rng(args.seed) + idx = rng.choice(len(X), size=args.max_samples, replace=False) + X, y = X[idx], y[idx] + data_src = f"CSV {path.name} (n={len(X)})" + else: + X, y = make_synthetic_imbalanced(args.seed, n_total=2000, fraud_ratio=0.02) + data_src = f"synthetic imbalanced (n={len(X)}, fraud~2%%)" + + ( + X_train, + y_train, + X_val, + y_val, + X_test, + y_test, + _scaler, + _pca, + sample_weights, + ) = preprocess( + X, + y, + pca_dim=args.pca_dim, + seed=args.seed, + test_size=0.2, + val_size=0.1, + ) + + print("Credit Card Fraud amplitude baseline (PennyLane, GPU)") + print( + f" Data: {data_src} → StandardScaler → PCA({args.pca_dim}) → pad to {FEATURE_DIM} → L2 norm" + ) + print( + f" Train/val/test: {len(X_train)} / {len(X_val)} / {len(X_test)} (stratified)" + ) + print( + f" Iters: {args.iters}, batch: {args.batch_size}, 
layers: {args.layers}, lr: {args.lr}" + ) + + results: list[dict[str, Any]] = [] + for t in range(args.trials): + r = run_training( + X_train, + y_train, + X_test, + y_test, + sample_weights, + num_layers=args.layers, + iterations=args.iters, + batch_size=args.batch_size, + lr=args.lr, + seed=args.seed + t, + ) + results.append(r) + print(f"\n Trial {t + 1}:") + print(f" Compile: {r['compile_time_sec']:.4f} s") + print( + f" Train: {r['train_time_sec']:.4f} s ({r['samples_per_sec']:.1f} samples/s)" + ) + print(f" AUPRC: {r['auprc']:.4f}") + print( + f" F1: {r['f1_score']:.4f} (P: {r['precision']:.4f}, R: {r['recall']:.4f})" + ) + + if args.trials > 1: + auprcs = sorted(r["auprc"] for r in results) + f1s = sorted(r["f1_score"] for r in results) + mid = args.trials // 2 + print( + f"\n Median AUPRC: {auprcs[mid]:.4f} (min: {auprcs[0]:.4f}, max: {auprcs[-1]:.4f})" + ) + print( + f" Median F1: {f1s[mid]:.4f} (min: {f1s[0]:.4f}, max: {f1s[-1]:.4f})" + ) + + +if __name__ == "__main__": + main() diff --git a/qdp/qdp-python/benchmark/encoding_benchmarks/pennylane_baseline/iris_amplitude.py b/qdp/qdp-python/benchmark/encoding_benchmarks/pennylane_baseline/iris_amplitude.py index 66045f853a..3e684c25dc 100644 --- a/qdp/qdp-python/benchmark/encoding_benchmarks/pennylane_baseline/iris_amplitude.py +++ b/qdp/qdp-python/benchmark/encoding_benchmarks/pennylane_baseline/iris_amplitude.py @@ -28,6 +28,8 @@ - Total samples: 100 (2-class Iris). Full Iris has 150 (3 classes). Pipeline: state prep (Möttönen angles) → Rot layers + CNOT → expval(PauliZ(0)) + bias; square loss; Adam or Nesterov. + +Training always runs on GPU via lightning.gpu for fair comparison with QDP pipeline. 
""" from __future__ import annotations @@ -38,11 +40,10 @@ from typing import Any import numpy as np +import torch try: import pennylane as qml - from pennylane import numpy as pnp - from pennylane.optimize import AdamOptimizer, NesterovMomentumOptimizer except ImportError as e: raise SystemExit( "PennyLane is required. Install with: uv sync --group benchmark" @@ -90,7 +91,7 @@ def state_preparation(a, wires=(0, 1)) -> None: def layer(layer_weights, wires=(0, 1)) -> None: """Rot on each wire + CNOT (tutorial Iris section).""" for i, w in enumerate(wires): - qml.Rot(*layer_weights[i], wires=w) + qml.Rot(layer_weights[i, 0], layer_weights[i, 1], layer_weights[i, 2], wires=w) qml.CNOT(wires=list(wires)) @@ -131,7 +132,7 @@ def load_iris_binary(seed: int = 42) -> tuple[np.ndarray, np.ndarray]: return features, Y -# --- Training: build circuit, split data, optimize, evaluate --- +# --- Training: build circuit, split data, optimize, evaluate (GPU via lightning.gpu) --- def run_training( features: np.ndarray, Y: np.ndarray, @@ -142,14 +143,23 @@ def run_training( lr: float, seed: int, test_size: float = 0.25, - optimizer: str = "adam", early_stop_target: float | None = 0.9, ) -> dict[str, Any]: - """Train classifier: circuit + bias, square loss, batched. Optional early stop when test acc ≥ target.""" - dev = qml.device("default.qubit", wires=NUM_QUBITS) + """Train classifier on GPU: circuit + bias, square loss, batched. Optional early stop when test acc ≥ target.""" + if not torch.cuda.is_available(): + raise RuntimeError("CUDA GPU is required for training. No CUDA device found.") + try: + dev = qml.device("lightning.gpu", wires=NUM_QUBITS) + except Exception as e: + raise RuntimeError( + "lightning.gpu is required for GPU training. 
Install with: " + "pip install pennylane-lightning[gpu]" + ) from e + device = torch.device("cuda") + dtype = torch.float64 # Circuit: state_prep(angles) → layers of Rot+CNOT → expval(PauliZ(0)) - @qml.qnode(dev, interface="autograd", diff_method="backprop") + @qml.qnode(dev, interface="torch", diff_method="adjoint") def circuit(weights, angles): state_preparation(angles, wires=(0, 1)) for lw in weights: @@ -161,78 +171,75 @@ def model(weights, bias, angles): def cost(weights, bias, X_batch, Y_batch): preds = model(weights, bias, X_batch.T) - return pnp.mean((Y_batch - preds) ** 2) + return torch.mean((Y_batch - preds) ** 2) # Train/val split (seed-driven) n = len(Y) - np.random.seed(seed) - try: - pnp.random.seed(seed) - except Exception: - pass rng = np.random.default_rng(seed) idx = rng.permutation(n) n_train = int(n * (1 - test_size)) - feats_train = pnp.array(features[idx[:n_train]]) - Y_train = pnp.array(Y[idx[:n_train]]) - feats_test = features[idx[n_train:]] - Y_test = Y[idx[n_train:]] + + feats_train_t = torch.tensor(features[idx[:n_train]], dtype=dtype, device=device) + Y_train_t = torch.tensor( + np.asarray(Y[idx[:n_train]], dtype=np.float64), dtype=dtype, device=device + ) + feats_test_t = torch.tensor(features[idx[n_train:]], dtype=dtype, device=device) + Y_test_t = torch.tensor( + np.asarray(Y[idx[n_train:]], dtype=np.float64), dtype=dtype, device=device + ) # Weights and optimizer - weights_init = 0.01 * pnp.random.randn( - num_layers, NUM_QUBITS, 3, requires_grad=True + torch.manual_seed(seed) + weights = torch.nn.Parameter( + 0.01 * torch.randn(num_layers, NUM_QUBITS, 3, device=device, dtype=dtype) ) - bias_init = pnp.array(0.0, requires_grad=True) - if optimizer == "adam": - opt = AdamOptimizer(lr) - else: - opt = NesterovMomentumOptimizer(lr) + bias = torch.nn.Parameter(torch.tensor(0.0, device=device, dtype=dtype)) + opt = torch.optim.Adam([weights, bias], lr=lr) # Compile (first run) t0 = time.perf_counter() - _ = circuit(weights_init, 
feats_train[0]) - _ = cost(weights_init, bias_init, feats_train[:1], Y_train[:1]) + _ = circuit(weights, feats_train_t[0]) + _ = cost(weights, bias, feats_train_t[:1], Y_train_t[:1]) compile_sec = time.perf_counter() - t0 # Optimize (batched steps; optional early stop every 100 steps) t0 = time.perf_counter() - weights, bias = weights_init, bias_init steps_done = 0 for step in range(iterations): + opt.zero_grad() batch_idx = rng.integers(0, n_train, size=(batch_size,)) - fb = feats_train[batch_idx] - yb = Y_train[batch_idx] - out = opt.step(cost, weights, bias, fb, yb) - weights, bias = out[0], out[1] + fb = feats_train_t[batch_idx] + yb = Y_train_t[batch_idx] + loss = cost(weights, bias, fb, yb) + loss.backward() + opt.step() steps_done += 1 if early_stop_target is not None and (step + 1) % 100 == 0: - pred_test_now = np.sign( - np.array(model(weights, bias, pnp.array(feats_test).T)) - ).flatten() - test_acc_now = float( - np.mean(np.abs(pred_test_now - np.array(Y_test)) < 1e-5) - ) + with torch.no_grad(): + pred_test_now = torch.sign( + model(weights, bias, feats_test_t.T) + ).flatten() + test_acc_now = ( + (pred_test_now - Y_test_t).abs().lt(1e-5).float().mean().item() + ) if test_acc_now >= early_stop_target: break train_sec = time.perf_counter() - t0 # Metrics (train/test accuracy) - pred_train = np.sign(np.array(model(weights, bias, feats_train.T))).flatten() - pred_test = np.sign( - np.array(model(weights, bias, pnp.array(feats_test).T)) - ).flatten() - Y_train_np = np.array(Y_train) - Y_test_np = np.array(Y_test) - train_acc = float(np.mean(np.abs(pred_train - Y_train_np) < 1e-5)) - test_acc = float(np.mean(np.abs(pred_test - Y_test_np) < 1e-5)) + with torch.no_grad(): + pred_train = torch.sign(model(weights, bias, feats_train_t.T)).flatten() + pred_test = torch.sign(model(weights, bias, feats_test_t.T)).flatten() + train_acc = (pred_train - Y_train_t).abs().lt(1e-5).float().mean().item() + test_acc = (pred_test - 
Y_test_t).abs().lt(1e-5).float().mean().item() return { "compile_time_sec": compile_sec, "train_time_sec": train_sec, - "train_accuracy": train_acc, - "test_accuracy": test_acc, + "train_accuracy": float(train_acc), + "test_accuracy": float(test_acc), "n_train": n_train, - "n_test": len(Y_test), + "n_test": len(Y) - n_train, "epochs": steps_done, "samples_per_sec": (steps_done * batch_size) / train_sec if train_sec > 0 @@ -242,7 +249,7 @@ def cost(weights, bias, X_batch, Y_batch): def main() -> None: parser = argparse.ArgumentParser( - description="PennyLane Iris amplitude encoding baseline (2-class)" + description="PennyLane Iris amplitude encoding baseline (2-class, GPU training)" ) parser.add_argument( "--iters", @@ -266,13 +273,6 @@ def main() -> None: help="Test fraction (default: 0.1); ignored if --data-file set", ) parser.add_argument("--seed", type=int, default=0, help="Random seed (default: 0)") - parser.add_argument( - "--optimizer", - type=str, - default="adam", - choices=("adam", "nesterov"), - help="Optimizer (default: adam)", - ) parser.add_argument( "--trials", type=int, @@ -303,12 +303,12 @@ def main() -> None: test_size = args.test_size data_src = "sklearn load_iris, classes 0 & 1, 4 features" n = len(Y) - print("Iris amplitude baseline (PennyLane) — 2-class variational classifier") + print("Iris amplitude baseline (PennyLane, GPU) — 2-class variational classifier") print( f" Data: {data_src} → L2 norm → get_angles (n={n}; 2-class Iris = 100 samples)" ) print( - f" Iters: {args.iters}, batch_size: {args.batch_size}, layers: {args.layers}, lr: {args.lr}, optimizer: {args.optimizer}" + f" Iters: {args.iters}, batch_size: {args.batch_size}, layers: {args.layers}, lr: {args.lr}" ) results: list[dict[str, Any]] = [] @@ -322,7 +322,6 @@ def main() -> None: lr=args.lr, seed=args.seed + t, test_size=test_size, - optimizer=args.optimizer, early_stop_target=args.early_stop if args.early_stop > 0 else None, ) results.append(r) diff --git 
a/qdp/qdp-python/benchmark/encoding_benchmarks/qdp_pipeline/creditcardfraud_amplitude.py b/qdp/qdp-python/benchmark/encoding_benchmarks/qdp_pipeline/creditcardfraud_amplitude.py new file mode 100644 index 0000000000..70dc17ecb7 --- /dev/null +++ b/qdp/qdp-python/benchmark/encoding_benchmarks/qdp_pipeline/creditcardfraud_amplitude.py @@ -0,0 +1,632 @@ +#!/usr/bin/env python3 +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +QDP pipeline: Credit Card Fraud (binary, highly imbalanced), amplitude encoding. + +Goal: **same data, model, loss, and metrics as the PennyLane baseline; only the +encoding step is different**. Here we: + +- Preprocess features exactly as in the baseline: + StandardScaler → PCA (to <= pca_dim) → pad to FEATURE_DIM → L2-normalized vector. +- Use QDP (`QuantumDataLoader` with `encoding("amplitude")`) to encode these + FEATURE_DIM vectors into **amplitude state vectors** of length `2**NUM_QUBITS`. +- Feed the encoded state vectors into a PennyLane circuit via `qml.AmplitudeEmbedding`, + then apply the same variational layers, optimizer, and loss as the baseline. + +Best practices (aligned with ENCODING_BENCHMARK_PLAN.md §2.2): + +- Dataset: Kaggle "Credit Card Fraud Detection" (Time, V1..V28, Amount, Class). 
+- Metrics: AUPRC (precision–recall AUC), F1-score, precision, recall. +- Imbalance: class-weighted loss (minority class up-weighted); no accuracy. + +Training always runs on GPU via lightning.gpu. +""" + +from __future__ import annotations + +import argparse +import time +from collections.abc import Iterator +from pathlib import Path +from typing import Any + +import numpy as np +import torch + +try: + import pennylane as qml +except ImportError as e: + raise SystemExit( + "PennyLane is required. Install with: uv sync --group benchmark" + ) from e + +try: + from sklearn.decomposition import PCA + from sklearn.metrics import ( + average_precision_score, + f1_score, + precision_score, + recall_score, + ) + from sklearn.model_selection import train_test_split + from sklearn.preprocessing import StandardScaler +except ImportError as e: + raise SystemExit( + "scikit-learn is required. Install with: uv sync --group benchmark" + ) from e + +try: + from qumat_qdp import QdpEngine, QuantumDataLoader +except ImportError as e: + raise SystemExit( + "qumat_qdp (QDP Python bindings) is required. Build with: uv run maturin develop" + ) from e + + +NUM_QUBITS = 5 +STATE_DIM = 2**NUM_QUBITS # length of encoded state vector +FEATURE_DIM = STATE_DIM # pre-QDP feature dimension (padded to this) + + +def _layer(layer_weights: torch.Tensor, wires: tuple[int, ...]) -> None: + """Single variational layer: Rot on each wire + ring of CNOTs.""" + for i, w in enumerate(wires): + qml.Rot(layer_weights[i, 0], layer_weights[i, 1], layer_weights[i, 2], wires=w) + for i in range(len(wires)): + qml.CNOT(wires=[wires[i], wires[(i + 1) % len(wires)]]) + + +def load_creditcard_csv(path: str) -> tuple[np.ndarray, np.ndarray]: + """ + Load Credit Card Fraud CSV. Expects columns including V1..V28, Amount, Class. + Returns (X_raw shape (n, 30), y shape (n,) with 0/1). 
+ """ + data = np.genfromtxt(path, delimiter=",", skip_header=1, dtype=np.float64) + if data.ndim == 1: + data = data.reshape(1, -1) + # Last column = Class; rest = features (Time, V1..V28, Amount) + X = data[:, :-1] + y = data[:, -1].astype(np.int32) + # If CSV has header row with "Time", we already skipped it + if X.shape[1] >= 30: + X = X[:, -30:] # last 30 cols: V1..V28, Amount (and drop Time if 31) + elif X.shape[1] < 30: + # Pad with zeros to 30 + pad = np.zeros((X.shape[0], 30 - X.shape[1]), dtype=np.float64) + X = np.hstack([X, pad]) + return X, y + + +def make_synthetic_imbalanced( + seed: int, n_total: int = 2000, fraud_ratio: float = 0.02 +) -> tuple[np.ndarray, np.ndarray]: + """Synthetic 30-D imbalanced binary data for smoke test when no CSV is provided.""" + rng = np.random.default_rng(seed) + n_fraud = max(1, int(n_total * fraud_ratio)) + n_legit = n_total - n_fraud + X_legit = rng.standard_normal((n_legit, 30)).astype(np.float64) * 0.5 + X_fraud = rng.standard_normal((n_fraud, 30)).astype(np.float64) * 0.5 + 1.0 + X = np.vstack([X_legit, X_fraud]) + y = np.array([0] * n_legit + [1] * n_fraud, dtype=np.int32) + perm = rng.permutation(n_total) + return X[perm], y[perm] + + +def preprocess( + X: np.ndarray, + y: np.ndarray, + pca_dim: int, + seed: int, + test_size: float = 0.2, + val_size: float = 0.1, +) -> tuple[ + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + Any, + Any, + np.ndarray, +]: + """ + StandardScaler → PCA (to <= pca_dim) → pad to FEATURE_DIM → L2 normalize. + Stratified train/val/test. Returns X_train, y_train, X_val, y_val, X_test, y_test, + plus scaler, pca, and sample_weights for weighted loss. 
+ """ + scaler = StandardScaler() + X_scaled = scaler.fit_transform(X) + + pca = PCA( + n_components=min(pca_dim, X_scaled.shape[1], X_scaled.shape[0] - 1), + random_state=seed, + ) + X_pca = pca.fit_transform(X_scaled) + if X_pca.shape[1] < FEATURE_DIM: + pad = np.zeros((X_pca.shape[0], FEATURE_DIM - X_pca.shape[1]), dtype=np.float64) + X_pca = np.hstack([X_pca, pad]) + + norm = np.linalg.norm(X_pca, axis=1, keepdims=True) + norm[norm < 1e-12] = 1.0 + X_norm = (X_pca / norm).astype(np.float64) + + rng = np.random.RandomState(seed) + idx = rng.permutation(len(y)) + X_norm, y = X_norm[idx], y[idx] + + # Stratified split: first test, then val from train + X_temp, X_test, y_temp, y_test = train_test_split( + X_norm, y, test_size=test_size, stratify=y, random_state=seed + ) + val_ratio = val_size / (1 - test_size) + X_train, X_val, y_train, y_val = train_test_split( + X_temp, y_temp, test_size=val_ratio, stratify=y_temp, random_state=seed + ) + + # Class weights for weighted MSE: n / (2 * n_class) + n0 = max(1, int(np.sum(y_train == 0))) + n1 = max(1, int(np.sum(y_train == 1))) + w0 = len(y_train) / (2 * n0) + w1 = len(y_train) / (2 * n1) + sample_weights = np.where(y_train == 0, w0, w1).astype(np.float64) + + return ( + X_train, + y_train, + X_val, + y_val, + X_test, + y_test, + scaler, + pca, + sample_weights, + ) + + +def encode_via_qdp_engine( + X_norm: np.ndarray, + *, + batch_size: int, + device_id: int = 0, +) -> torch.Tensor: + """ + QDP API: amplitude-encode in memory via QdpEngine.encode() (batched). + No temp file; minimal CPU–GPU transfer by batching. + Returns GPU torch.Tensor shape (n, STATE_DIM). + """ + n, dim = X_norm.shape + if dim != FEATURE_DIM: + raise ValueError( + f"X_norm must have {FEATURE_DIM} features for {NUM_QUBITS} qubits, got {dim}" + ) + # Ensure float64 C-contiguous once before the loop (preprocess() already guarantees this, + # but guard against callers passing non-contiguous or non-float64 arrays). 
+ if not (X_norm.dtype == np.float64 and X_norm.flags["C_CONTIGUOUS"]):
+ X_norm = np.ascontiguousarray(X_norm, dtype=np.float64)
+ engine = QdpEngine(device_id=device_id)
+ batches_list: list[torch.Tensor] = []
+ for start in range(0, n, batch_size):
+ end = min(start + batch_size, n)
+ # Pass slice directly — no per-batch astype() copy needed.
+ qt = engine.encode(X_norm[start:end], NUM_QUBITS, "amplitude")
+ t = torch.from_dlpack(qt)
+ batches_list.append(t)
+ # torch.cat produces exactly n rows and a contiguous tensor.
+ encoded = torch.cat(batches_list, dim=0)
+ # DLPack exports complex128 (CuDoubleComplex) even though imaginary parts are always 0.0
+ # (amplitude encoding of real input → real state vector; CUDA kernel hardcodes imag=0.0).
+ # Taking .real gives a float64 view (zero-copy) matching the baseline's dtype; the view
+ # still shares the underlying complex storage (so memory is not actually reduced), but it
+ # avoids any complex-arithmetic paths in PennyLane.
+ if encoded.is_complex():
+ encoded = encoded.real
+ if encoded.shape[1] != STATE_DIM:
+ raise ValueError(
+ f"Encoded state dimension mismatch: expected {STATE_DIM}, got {encoded.shape[1]}"
+ )
+ return encoded
+
+
+def encoded_batches_from_loader(
+ X_norm: np.ndarray,
+ *,
+ batch_size: int,
+ device_id: int = 0,
+ data_dir: str | None = None,
+ filename: str = "creditcard_train.npy",
+) -> Iterator[tuple[torch.Tensor, int, int]]:
+ """
+ DataLoader API: stream amplitude-encoded batches from QuantumDataLoader (in-memory).
+ Uses source_array() (no temp file). Always returns GPU torch.Tensor batches.
+ Yields (batch, start_idx, end_idx). 
+ """ + n, dim = X_norm.shape + if dim != FEATURE_DIM: + raise ValueError( + f"X_norm must have {FEATURE_DIM} features for {NUM_QUBITS} qubits, got {dim}" + ) + total_batches = (n + batch_size - 1) // batch_size + loader = ( + QuantumDataLoader(device_id=device_id) + .qubits(NUM_QUBITS) + .encoding("amplitude") + .batches(total_batches, size=batch_size) + .source_array(X_norm.astype(np.float64)) + .as_torch(device="cuda") + ) + start = 0 + for batch in loader: + end = min(start + batch.shape[0], n) + actual = batch[: end - start] + if actual.shape[1] != STATE_DIM: + raise ValueError( + f"Encoded state dimension mismatch: expected {STATE_DIM}, got {actual.shape[1]}" + ) + yield actual, start, end + start = end + + +def run_training( + encoded_train: torch.Tensor, + encoded_test: torch.Tensor, + y_train: np.ndarray, + y_test: np.ndarray, + sample_weights: np.ndarray, + *, + num_layers: int, + iterations: int, + batch_size: int, + lr: float, + seed: int, +) -> dict[str, Any]: + """Train 5-qubit amplitude VQC on GPU; dispatch to lightning.gpu.""" + if not torch.cuda.is_available(): + raise RuntimeError("CUDA GPU is required for training. No CUDA device found.") + try: + dev_qml = qml.device("lightning.gpu", wires=NUM_QUBITS) + except Exception as e: + raise RuntimeError( + "lightning.gpu is required for GPU training. 
Install with: " + "pip install pennylane-lightning[gpu]" + ) from e + + # Ensure encoded data is on GPU + if not encoded_train.is_cuda: + encoded_train = encoded_train.cuda() + if not encoded_test.is_cuda: + encoded_test = encoded_test.cuda() + + device = encoded_train.device + dtype = encoded_train.dtype + n_train = len(y_train) + y_test_np = np.asarray(y_test) + + Y_train_t = torch.tensor( + np.asarray(y_train, dtype=np.float64), dtype=dtype, device=device + ) + W_train_t = torch.tensor(sample_weights, dtype=dtype, device=device) + + wires = tuple(range(NUM_QUBITS)) + + @qml.qnode(dev_qml, interface="torch", diff_method="adjoint") + def circuit(weights: torch.Tensor, state_vector: torch.Tensor) -> torch.Tensor: + qml.AmplitudeEmbedding(state_vector, wires=wires, normalize=False) + for w in weights: + _layer(w, wires) + return qml.expval(qml.PauliZ(0)) + + def model( + weights: torch.Tensor, bias: torch.Tensor, state_batch: torch.Tensor + ) -> torch.Tensor: + return circuit(weights, state_batch) + bias + + def cost( + weights: torch.Tensor, + bias: torch.Tensor, + states_batch: torch.Tensor, + Y_batch: torch.Tensor, + w_batch: torch.Tensor, + ) -> torch.Tensor: + target = Y_batch * 2.0 - 1.0 + preds = model(weights, bias, states_batch) + return (w_batch * (target - preds) ** 2).sum() / (w_batch.sum() + 1e-12) + + torch.manual_seed(seed) + weights = torch.nn.Parameter( + 0.01 * torch.randn(num_layers, NUM_QUBITS, 3, device=device, dtype=dtype) + ) + bias = torch.nn.Parameter(torch.tensor(0.0, device=device, dtype=dtype)) + opt = torch.optim.Adam([weights, bias], lr=lr) + + t0 = time.perf_counter() + _ = circuit(weights, encoded_train[:1]) + _ = cost(weights, bias, encoded_train[:1], Y_train_t[:1], W_train_t[:1]) + compile_sec = time.perf_counter() - t0 + + # Use torch.randint so indices stay on GPU — avoids implicit H2D transfer of 256 indices + # per step (NumPy rng.integers → CPU array → implicit copy to index CUDA tensor). 
+ _batch_n = min(batch_size, n_train) + t0 = time.perf_counter() + for _ in range(iterations): + opt.zero_grad() + idx = torch.randint(0, n_train, (_batch_n,), device=device) + sb = encoded_train[idx] + yb = Y_train_t[idx] + wb = W_train_t[idx] + loss = cost(weights, bias, sb, yb, wb) + loss.backward() + opt.step() + train_sec = time.perf_counter() - t0 + + with torch.no_grad(): + pred_scores = model(weights, bias, encoded_test).cpu().numpy().flatten() + pred_binary = (np.sign(pred_scores) > 0).astype(np.int32) + scores_positive = (pred_scores + 1.0) / 2.0 + auprc = float(average_precision_score(y_test_np, scores_positive)) + f1 = float(f1_score(y_test_np, pred_binary, zero_division=0)) + prec = float(precision_score(y_test_np, pred_binary, zero_division=0)) + rec = float(recall_score(y_test_np, pred_binary, zero_division=0)) + + return { + "compile_time_sec": compile_sec, + "train_time_sec": train_sec, + "samples_per_sec": (iterations * _batch_n) / train_sec + if train_sec > 0 + else 0.0, + "auprc": auprc, + "f1_score": f1, + "precision": prec, + "recall": rec, + "n_train": n_train, + "n_test": len(y_test_np), + "iterations": iterations, + } + + +def run_training_from_loader( + X_train: np.ndarray, + encoded_test: torch.Tensor, + y_train: np.ndarray, + y_test: np.ndarray, + sample_weights: np.ndarray, + *, + num_layers: int, + iterations: int, + encode_batch_size: int, + device_id: int = 0, + encode_data_dir: str | None = None, + lr: float, + seed: int, + batch_size: int = 256, +) -> dict[str, Any]: + """Train by streaming encoded batches from QuantumDataLoader on GPU. 
+ Encode once on GPU, then train with lightning.gpu.""" + + encoded_train = encode_via_qdp_engine( + X_train, + batch_size=encode_batch_size, + device_id=device_id, + ) + return run_training( + encoded_train, + encoded_test, + y_train, + y_test, + sample_weights, + num_layers=num_layers, + iterations=iterations, + batch_size=batch_size, + lr=lr, + seed=seed, + ) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="QDP Credit Card Fraud pipeline (amplitude, 5 qubits, AUPRC/F1, GPU training)" + ) + parser.add_argument( + "--data-file", + type=str, + default=None, + help="Path to CSV (e.g. Kaggle creditcard.csv). If omitted, use synthetic imbalanced data.", + ) + parser.add_argument( + "--max-samples", + type=int, + default=50_000, + help="Max samples to use from CSV (default: 50000); ignored for synthetic.", + ) + parser.add_argument( + "--pca-dim", + type=int, + default=30, + help="PCA components before padding to FEATURE_DIM (default: 30, capped by feature dim).", + ) + parser.add_argument("--seed", type=int, default=42, help="Random seed") + parser.add_argument( + "--iters", + type=int, + default=5000, + help="Optimizer steps (default: 5000; use same as baseline for apples-to-apples).", + ) + parser.add_argument( + "--batch-size", type=int, default=256, help="Training batch size" + ) + parser.add_argument("--layers", type=int, default=2, help="Variational layers") + parser.add_argument("--lr", type=float, default=0.01, help="Learning rate") + parser.add_argument( + "--trials", + type=int, + default=1, + help="Number of runs (same data, different seeds); report median AUPRC/F1.", + ) + parser.add_argument( + "--device-id", + type=int, + default=0, + help="QDP device id (default: 0)", + ) + parser.add_argument( + "--encode-batch-size", + type=int, + default=4096, + help="Batch size for QDP encoding (default: 4096).", + ) + parser.add_argument( + "--encode-data-dir", + type=str, + default=None, + help="Directory for temporary .npy files used by 
QDP loader (default: system temp).", + ) + parser.add_argument( + "--use-loader", + action="store_true", + help="Stream encoded batches via QuantumDataLoader.source_array() (DataLoader API).", + ) + args = parser.parse_args() + + if args.data_file: + path = Path(args.data_file) + if not path.is_file(): + raise SystemExit(f"Data file not found: {path}") + X, y = load_creditcard_csv(str(path)) + if len(X) > args.max_samples: + rng = np.random.default_rng(args.seed) + idx = rng.choice(len(X), size=args.max_samples, replace=False) + X, y = X[idx], y[idx] + data_src = f"CSV {path.name} (n={len(X)})" + else: + X, y = make_synthetic_imbalanced(args.seed, n_total=2000, fraud_ratio=0.02) + data_src = f"synthetic imbalanced (n={len(X)}, fraud~2%)" + + ( + X_train, + y_train, + X_val, + y_val, + X_test, + y_test, + _scaler, + _pca, + sample_weights, + ) = preprocess( + X, + y, + pca_dim=args.pca_dim, + seed=args.seed, + test_size=0.2, + val_size=0.1, + ) + + print("QDP Credit Card Fraud amplitude pipeline (GPU)") + print( + f" Data: {data_src} → StandardScaler → PCA({args.pca_dim}) " + f"→ pad to {FEATURE_DIM} → QDP amplitude → L2 norm (implicit)" + ) + print( + f" Train/val/test (features pre-QDP): " + f"{len(X_train)} / {len(X_val)} / {len(X_test)} (stratified)" + ) + print( + f" Iters: {args.iters}, train batch: {args.batch_size}, " + f"encode batch: {args.encode_batch_size}, layers: {args.layers}, lr: {args.lr}" + ) + print(" Encode + Train: GPU (QDP encode + lightning.gpu circuit).") + + # Encode test set via QDP (keep on GPU) + t_enc0 = time.perf_counter() + encoded_test = encode_via_qdp_engine( + X_test, + batch_size=args.encode_batch_size, + device_id=args.device_id, + ) + enc_test_sec = time.perf_counter() - t_enc0 + print(f" Encode test ({len(X_test)} samples): {enc_test_sec:.4f} s") + + results: list[dict[str, Any]] = [] + for t in range(args.trials): + if args.use_loader: + r = run_training_from_loader( + X_train, + encoded_test, + y_train, + y_test, + 
sample_weights, + num_layers=args.layers, + iterations=args.iters, + encode_batch_size=args.encode_batch_size, + device_id=args.device_id, + encode_data_dir=args.encode_data_dir, + lr=args.lr, + seed=args.seed + t, + batch_size=args.batch_size, + ) + r["encode_train_sec"] = 0.0 # encoded lazily inside loader + else: + t_enc1 = time.perf_counter() + encoded_train = encode_via_qdp_engine( + X_train, + batch_size=args.encode_batch_size, + device_id=args.device_id, + ) + enc_train_sec = time.perf_counter() - t_enc1 + r = run_training( + encoded_train, + encoded_test, + y_train, + y_test, + sample_weights, + num_layers=args.layers, + iterations=args.iters, + batch_size=args.batch_size, + lr=args.lr, + seed=args.seed + t, + ) + r["encode_train_sec"] = enc_train_sec + results.append(r) + print(f"\n Trial {t + 1}:") + print( + f" Encode train ({len(X_train)} samples): {r.get('encode_train_sec', 0.0):.4f} s" + ) + print(f" Compile: {r['compile_time_sec']:.4f} s") + print( + f" Train: {r['train_time_sec']:.4f} s " + f"({r['samples_per_sec']:.1f} samples/s)" + ) + print(f" AUPRC: {r['auprc']:.4f}") + print( + f" F1: {r['f1_score']:.4f} " + f"(P: {r['precision']:.4f}, R: {r['recall']:.4f})" + ) + + if args.trials > 1: + auprcs = sorted(r["auprc"] for r in results) + f1s = sorted(r["f1_score"] for r in results) + mid = args.trials // 2 + print( + f"\n Median AUPRC: {auprcs[mid]:.4f} " + f"(min: {auprcs[0]:.4f}, max: {auprcs[-1]:.4f})" + ) + print( + f" Median F1: {f1s[mid]:.4f} (min: {f1s[0]:.4f}, max: {f1s[-1]:.4f})" + ) + + +if __name__ == "__main__": + main() diff --git a/qdp/qdp-python/benchmark/encoding_benchmarks/qdp_pipeline/iris_amplitude.py b/qdp/qdp-python/benchmark/encoding_benchmarks/qdp_pipeline/iris_amplitude.py index 5ab902b74a..7540e980e0 100644 --- a/qdp/qdp-python/benchmark/encoding_benchmarks/qdp_pipeline/iris_amplitude.py +++ b/qdp/qdp-python/benchmark/encoding_benchmarks/qdp_pipeline/iris_amplitude.py @@ -29,6 +29,8 @@ Only difference from baseline: 
encoding. Here we use QDP (QdpEngine.encode + amplitude) → StatePrep(encoded); baseline uses get_angles → state_preparation(angles). Rest: same circuit (Rot + CNOT), loss, optimizer, CLI. + +Training always runs on GPU via lightning.gpu. """ from __future__ import annotations @@ -39,11 +41,10 @@ from typing import Any import numpy as np +import torch try: import pennylane as qml - from pennylane import numpy as pnp - from pennylane.optimize import AdamOptimizer, NesterovMomentumOptimizer except ImportError as e: raise SystemExit( "PennyLane is required. Install with: uv sync --group benchmark" @@ -57,18 +58,17 @@ "scikit-learn is required. Install with: uv sync --group benchmark" ) from e -import torch from qumat_qdp import QdpEngine NUM_QUBITS = 2 STATE_DIM = 2**NUM_QUBITS # 4 -# --- Circuit: variational layer (Rot + CNOT); state prep is StatePrep(encoded) in training --- +# --- Circuit: variational layer (Rot + CNOT); state prep is AmplitudeEmbedding(encoded) in training --- def layer(layer_weights, wires=(0, 1)) -> None: """Rot on each wire + CNOT (tutorial Iris section).""" for i, w in enumerate(wires): - qml.Rot(*layer_weights[i], wires=w) + qml.Rot(layer_weights[i, 0], layer_weights[i, 1], layer_weights[i, 2], wires=w) qml.CNOT(wires=list(wires)) @@ -120,7 +120,7 @@ def encode_via_qdp( """QDP: use QdpEngine.encode on 4-D vectors (amplitude), return encoded (n, 4) on GPU. Uses in-memory encoding via QdpEngine instead of writing/reading .npy files. The returned - tensor stays on the selected CUDA device and can be fed directly to qml.StatePrep. + tensor stays on the selected CUDA device and can be fed directly to qml.AmplitudeEmbedding. """ n, dim = X_norm.shape if dim != STATE_DIM: @@ -134,13 +134,17 @@ def encode_via_qdp( encoding_method="amplitude", ) encoded = torch.from_dlpack(qt) - return encoded[:n] + # DLPack exports complex dtype even though imaginary parts are always 0.0 + # (CUDA kernel hardcodes imag=0.0). 
Taking .real gives a real-valued zero-copy view. + if encoded.is_complex(): + encoded = encoded.real + return encoded[:n].clone() -# --- Training: StatePrep(encoded) + Rot layers, square loss, optional early stop --- +# --- Training: AmplitudeEmbedding(encoded) + Rot layers, square loss, GPU only --- def run_training( - encoded_train: torch.Tensor | np.ndarray, - encoded_test: torch.Tensor | np.ndarray, + encoded_train: torch.Tensor, + encoded_test: torch.Tensor, Y_train: np.ndarray, Y_test: np.ndarray, *, @@ -150,176 +154,28 @@ def run_training( lr: float, seed: int, early_stop_target: float | None = None, - optimizer: str = "nesterov", ) -> dict[str, Any]: - """Train variational classifier: StatePrep(encoded) + Rot layers + bias, square loss, batched. - If encoded_* are on GPU and lightning.gpu is available, training runs on GPU; otherwise on CPU. + """Train variational classifier on GPU: AmplitudeEmbedding(encoded) + Rot layers + bias, square loss, batched. When early_stop_target is set, evaluate test acc every 100 steps and stop when >= target.""" - n_train = len(Y_train) - np.random.seed(seed) - rng = np.random.default_rng(seed) - - # Prefer Lightning GPU when encoded data is on GPU - use_gpu = isinstance(encoded_train, torch.Tensor) and encoded_train.is_cuda - dev_qml = None - if use_gpu: - try: - dev_qml = qml.device("lightning.gpu", wires=NUM_QUBITS) - except Exception: - use_gpu = False - if not use_gpu or dev_qml is None: - dev_qml = qml.device("default.qubit", wires=NUM_QUBITS) - use_gpu = False - if isinstance(encoded_train, torch.Tensor): - encoded_train = encoded_train.cpu().numpy() - if isinstance(encoded_test, torch.Tensor): - encoded_test = encoded_test.cpu().numpy() - - if use_gpu: - return _run_training_gpu( - encoded_train, - encoded_test, - Y_train, - Y_test, - dev_qml=dev_qml, - num_layers=num_layers, - iterations=iterations, - batch_size=batch_size, - lr=lr, - seed=seed, - n_train=n_train, - rng=rng, - early_stop_target=early_stop_target, - ) 
- return _run_training_cpu( - encoded_train, - encoded_test, - Y_train, - Y_test, - dev_qml=dev_qml, - num_layers=num_layers, - iterations=iterations, - batch_size=batch_size, - lr=lr, - seed=seed, - n_train=n_train, - rng=rng, - qml_device="cpu", - early_stop_target=early_stop_target, - optimizer=optimizer, - ) - - -def _run_training_cpu( - encoded_train: np.ndarray, - encoded_test: np.ndarray, - Y_train: np.ndarray, - Y_test: np.ndarray, - *, - dev_qml: Any, # noqa: ANN401 - num_layers: int, - iterations: int, - batch_size: int, - lr: float, - seed: int, - n_train: int, - rng: np.random.Generator, - qml_device: str = "cpu", - early_stop_target: float | None = None, - optimizer: str = "nesterov", -) -> dict[str, Any]: - """CPU path: default.qubit + autograd + Nesterov or Adam. Optional early stop every 100 steps.""" + if not torch.cuda.is_available(): + raise RuntimeError("CUDA GPU is required for training. No CUDA device found.") try: - pnp.random.seed(seed) - except Exception: - pass - feats_train = pnp.array(encoded_train) - feats_test = encoded_test - Y_train_pnp = pnp.array(Y_train) - Y_test_np = np.asarray(Y_test) - - @qml.qnode(dev_qml, interface="autograd", diff_method="backprop") - def circuit(weights, state_vector): - qml.StatePrep(state_vector, wires=(0, 1)) - for lw in weights: - layer(lw, wires=(0, 1)) - return qml.expval(qml.PauliZ(0)) + dev_qml = qml.device("lightning.gpu", wires=NUM_QUBITS) + except Exception as e: + raise RuntimeError( + "lightning.gpu is required for GPU training. 
Install with: " + "pip install pennylane-lightning[gpu]" + ) from e - def model(weights, bias, state_batch): - return circuit(weights, state_batch) + bias - - def cost(weights, bias, X_batch, Y_batch): - preds = model(weights, bias, X_batch) - return pnp.mean((Y_batch - preds) ** 2) - - weights_init = 0.01 * pnp.random.randn( - num_layers, NUM_QUBITS, 3, requires_grad=True - ) - bias_init = pnp.array(0.0, requires_grad=True) - opt = AdamOptimizer(lr) if optimizer == "adam" else NesterovMomentumOptimizer(lr) - - t0 = time.perf_counter() - _ = circuit(weights_init, feats_train[0]) - _ = cost(weights_init, bias_init, feats_train[:1], Y_train_pnp[:1]) - compile_sec = time.perf_counter() - t0 - - t0 = time.perf_counter() - weights, bias = weights_init, bias_init - steps_done = 0 - for step in range(iterations): - batch_idx = rng.integers(0, n_train, size=(batch_size,)) - fb = feats_train[batch_idx] - yb = Y_train_pnp[batch_idx] - out = opt.step(cost, weights, bias, fb, yb) - weights, bias = out[0], out[1] - steps_done += 1 - if early_stop_target is not None and (step + 1) % 100 == 0: - pred_test_now = np.sign( - np.array(model(weights, bias, pnp.array(feats_test))) - ).flatten() - test_acc_now = float(np.mean(np.abs(pred_test_now - Y_test_np) < 1e-5)) - if test_acc_now >= early_stop_target: - break - train_sec = time.perf_counter() - t0 - - pred_train = np.sign(np.array(model(weights, bias, feats_train))).flatten() - pred_test = np.sign(np.array(model(weights, bias, pnp.array(feats_test)))).flatten() - Y_train_np = np.array(Y_train_pnp) - train_acc = float(np.mean(np.abs(pred_train - Y_train_np) < 1e-5)) - test_acc = float(np.mean(np.abs(pred_test - Y_test_np) < 1e-5)) - - return { - "compile_time_sec": compile_sec, - "train_time_sec": train_sec, - "train_accuracy": train_acc, - "test_accuracy": test_acc, - "n_train": n_train, - "n_test": len(Y_test), - "epochs": steps_done, - "samples_per_sec": (steps_done * batch_size) / train_sec - if train_sec > 0 - else 0.0, - 
"qml_device": qml_device, - } + n_train = len(Y_train) + rng = np.random.default_rng(seed) + # Ensure encoded data is on GPU + if not encoded_train.is_cuda: + encoded_train = encoded_train.cuda() + if not encoded_test.is_cuda: + encoded_test = encoded_test.cuda() -def _run_training_gpu( - encoded_train: torch.Tensor, - encoded_test: torch.Tensor, - Y_train: np.ndarray, - Y_test: np.ndarray, - *, - dev_qml: Any, # noqa: ANN401 - num_layers: int, - iterations: int, - batch_size: int, - lr: float, - seed: int, - n_train: int, - rng: np.random.Generator, - early_stop_target: float | None = None, -) -> dict[str, Any]: - """GPU path: lightning.gpu + PyTorch interface, data stays on GPU. Optional early stop every 100 steps.""" device = encoded_train.device dtype = encoded_train.dtype Y_train_t = torch.tensor(Y_train, dtype=dtype, device=device) @@ -327,7 +183,8 @@ def _run_training_gpu( @qml.qnode(dev_qml, interface="torch", diff_method="adjoint") def circuit(weights, state_vector): - qml.StatePrep(state_vector, wires=(0, 1)) + # normalize=False: QDP pre-normalizes to unit norm, skipping PennyLane's re-normalization. 
+ qml.AmplitudeEmbedding(state_vector, wires=(0, 1), normalize=False) for lw in weights: layer(lw, wires=(0, 1)) return qml.expval(qml.PauliZ(0)) @@ -340,10 +197,10 @@ def cost(weights, bias, X_batch, Y_batch): return torch.mean((Y_batch - preds) ** 2) torch.manual_seed(seed) - weights = 0.01 * torch.randn( - num_layers, NUM_QUBITS, 3, device=device, dtype=dtype, requires_grad=True + weights = torch.nn.Parameter( + 0.01 * torch.randn(num_layers, NUM_QUBITS, 3, device=device, dtype=dtype) ) - bias = torch.tensor(0.0, device=device, dtype=dtype, requires_grad=True) + bias = torch.nn.Parameter(torch.tensor(0.0, device=device, dtype=dtype)) opt = torch.optim.SGD([weights, bias], lr=lr, momentum=0.9, nesterov=True) t0 = time.perf_counter() @@ -389,13 +246,12 @@ def cost(weights, bias, X_batch, Y_batch): "samples_per_sec": (steps_done * batch_size) / train_sec if train_sec > 0 else 0.0, - "qml_device": "cuda", } def main() -> None: parser = argparse.ArgumentParser( - description="QDP Iris amplitude encoding pipeline (2-class, same training as baseline)" + description="QDP Iris amplitude encoding pipeline (2-class, GPU training)" ) parser.add_argument( "--iters", @@ -419,13 +275,6 @@ def main() -> None: help="Test fraction (default: 0.1); ignored if --data-file set", ) parser.add_argument("--seed", type=int, default=0, help="Random seed (default: 0)") - parser.add_argument( - "--optimizer", - type=str, - default="adam", - choices=("adam", "nesterov"), - help="Optimizer for CPU (default: adam); GPU uses SGD+Nesterov", - ) parser.add_argument( "--trials", type=int, @@ -471,7 +320,7 @@ def main() -> None: Y_train = Y[train_idx] Y_test = Y[test_idx] - # QDP encoding: 4-D → amplitude-encoded state vectors + # QDP encoding: 4-D → amplitude-encoded state vectors (on GPU) encoded_train = encode_via_qdp( X_train_4d, batch_size=args.batch_size, @@ -487,10 +336,10 @@ def main() -> None: filename="iris_4d_test.npy", ) - print("Iris amplitude (QDP encoding) — 2-class variational 
classifier") + print("Iris amplitude (QDP encoding, GPU) — 2-class variational classifier") print(f" Data: {data_src} → QDP amplitude (n={n}; 2-class Iris = 100 samples)") print( - f" Iters: {args.iters}, batch_size: {args.batch_size}, layers: {args.layers}, lr: {args.lr}, optimizer: {args.optimizer}" + f" Iters: {args.iters}, batch_size: {args.batch_size}, layers: {args.layers}, lr: {args.lr}" ) results: list[dict[str, Any]] = [] @@ -507,11 +356,9 @@ def main() -> None: lr=args.lr, seed=args.seed + t, early_stop_target=early_stop, - optimizer=args.optimizer, ) results.append(r) print(f"\n Trial {t + 1}:") - print(f" QML device: {r.get('qml_device', 'cpu')}") print(f" Compile: {r['compile_time_sec']:.4f} s") print(f" Train: {r['train_time_sec']:.4f} s") print(f" Train acc: {r['train_accuracy']:.4f} (n={r['n_train']})") diff --git a/qdp/qdp-python/pyproject.toml b/qdp/qdp-python/pyproject.toml index 3af565db43..7a4750445d 100644 --- a/qdp/qdp-python/pyproject.toml +++ b/qdp/qdp-python/pyproject.toml @@ -49,10 +49,12 @@ benchmark = [ [tool.uv.sources] qumat = { path = "../..", editable = true } +torch = { index = "pytorch" } +# CUDA 12.6 wheels to match driver (libnvJitLink 12_6); cu122 pulls libs that need 12_8 and fail. [[tool.uv.index]] name = "pytorch" -url = "https://download.pytorch.org/whl/cu122" +url = "https://download.pytorch.org/whl/cu126" explicit = true # Invalidate uv cache when Rust or Cargo changes so extension is rebuilt (run_throughput_pipeline_py etc.). 
diff --git a/qdp/qdp-python/qumat_qdp/loader.py b/qdp/qdp-python/qumat_qdp/loader.py index c6514f0948..2fe4dc65a1 100644 --- a/qdp/qdp-python/qumat_qdp/loader.py +++ b/qdp/qdp-python/qumat_qdp/loader.py @@ -31,11 +31,19 @@ from collections.abc import Iterator from functools import lru_cache -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, cast + +import numpy as np if TYPE_CHECKING: import _qdp +# Optional torch for as_torch(); as_numpy() uses QuantumTensor.to_numpy() (no torch needed). +try: + import torch as _torch +except ImportError: + _torch = None # type: ignore[assignment] + # Seed must fit Rust u64: 0 <= seed <= 2^64 - 1. _U64_MAX = 2**64 - 1 @@ -120,6 +128,40 @@ def __init__( self._synthetic_requested = False # set True only by source_synthetic() self._file_requested = False self._null_handling: str | None = None + self._array: np.ndarray | None = None + self._array_requested = False + # Output format: None = yield raw QuantumTensor (DLPack); ("torch", device) or ("numpy",) + self._output_format: tuple[str, ...] | None = None + + def as_torch(self, device: str = "cuda") -> QuantumDataLoader: + """Yield batches as PyTorch tensors. device='cuda' keeps data on GPU (no copy); 'cpu' moves to CPU. Returns self.""" + if device not in ("cuda", "cpu"): + raise ValueError(f"device must be 'cuda' or 'cpu', got {device!r}") + if _torch is None: + raise RuntimeError( + "PyTorch is required for as_torch(). Install with: pip install torch" + ) + self._output_format = ("torch", device) + return self + + def as_numpy(self) -> QuantumDataLoader: + """Yield batches as NumPy float64 arrays (CPU). Uses QuantumTensor.to_numpy() — no PyTorch required. Returns self.""" + self._output_format = ("numpy",) + return self + + def source_array(self, X: np.ndarray) -> QuantumDataLoader: + """Use in-memory array; no temp file. Encodes via QdpEngine.encode() per batch. 
Returns self.""" + if X is None or not hasattr(X, "shape") or len(X.shape) != 2: + raise ValueError( + "source_array(X) requires a 2D array (n_samples, n_features)." + ) + self._array = np.asarray(X, dtype=np.float64) + if not self._array.flags.c_contiguous: + self._array = np.ascontiguousarray(self._array) + self._array_requested = True + n = self._array.shape[0] + self._total_batches = max(1, (n + self._batch_size - 1) // self._batch_size) + return self def qubits(self, n: int) -> QuantumDataLoader: """Set number of qubits. Returns self for chaining.""" @@ -204,8 +246,55 @@ def null_handling(self, policy: str) -> QuantumDataLoader: self._null_handling = policy return self + def _array_iterator(self) -> Iterator[Any]: + """Yield one QuantumTensor per batch from in-memory array via QdpEngine.encode().""" + qdp = _get_qdp() + QdpEngine = getattr(qdp, "QdpEngine", None) + if QdpEngine is None: + raise RuntimeError("_qdp.QdpEngine not found. Build with maturin develop.") + engine = QdpEngine(device_id=self._device_id) + X = self._array + if X is None: + raise RuntimeError( + "Internal error: _array_iterator called without source_array() data." + ) + assert X is not None # type narrowing for static checkers + n = X.shape[0] + for start in range(0, n, self._batch_size): + end = min(start + self._batch_size, n) + qt = engine.encode(X[start:end], self._num_qubits, self._encoding_method) + yield qt + def _create_iterator(self) -> Iterator[object]: - """Build engine and return the Rust-backed loader iterator (synthetic or file).""" + """Build engine and return the Rust-backed loader iterator (synthetic or file) or array iterator.""" + if self._array_requested: + if self._synthetic_requested or self._file_requested: + raise ValueError( + "Cannot combine source_array() with source_synthetic() or source_file(); use only one source." + ) + if self._array is None: + raise ValueError( + "source_array() was called without an array; set with .source_array(X)." 
+ ) + qdp = _get_qdp() + engine = getattr(qdp, "QdpEngine", None) + if engine is None: + raise RuntimeError( + "_qdp.QdpEngine not found. Build with maturin develop." + ) + engine = engine(device_id=self._device_id) + create_array_loader = getattr(engine, "create_array_loader", None) + if create_array_loader is not None: + return iter( + create_array_loader( + self._array, + batch_size=self._batch_size, + num_qubits=self._num_qubits, + encoding_method=self._encoding_method, + batch_limit=None, + ) + ) + return iter(self._array_iterator()) if self._synthetic_requested and self._file_requested: raise ValueError( "Cannot set both synthetic and file sources; use either .source_synthetic() or .source_file(path), not both." @@ -270,6 +359,26 @@ def _create_iterator(self) -> Iterator[object]: ) ) + def _wrap_iterator(self, raw_iter: Iterator[object]) -> Iterator[Any]: + if self._output_format is None: + yield from raw_iter + return + kind = self._output_format[0] + if kind == "torch": + device = self._output_format[1] + for qt in raw_iter: + t = _torch.from_dlpack(qt) + yield t.cpu() if device == "cpu" else t + elif kind == "numpy": + for qt in raw_iter: + # Rust QuantumTensor has to_numpy(); raw_iter is Iterator[object] + yield cast(Any, qt).to_numpy() + else: + yield from raw_iter + def __iter__(self) -> Iterator[object]: - """Return Rust-backed iterator that yields one QuantumTensor per batch.""" - return self._create_iterator() + """Return iterator yielding one batch per iteration (DLPack, torch, or numpy per as_torch/as_numpy).""" + raw = self._create_iterator() + if self._output_format is None: + return raw + return self._wrap_iterator(raw) diff --git a/qdp/qdp-python/src/engine.rs b/qdp/qdp-python/src/engine.rs index 2c94899c53..b7d6306b78 100644 --- a/qdp/qdp-python/src/engine.rs +++ b/qdp/qdp-python/src/engine.rs @@ -688,6 +688,58 @@ impl QdpEngine { Ok(PyQuantumLoader::new(Some(iter))) } + #[cfg(target_os = "linux")] + /// Create an array-backed pipeline 
iterator (QuantumDataLoader.source_array(X)). + /// PyO3 best practice: one copy (to_vec) to own data for iterator lifetime; then detach() + /// so Rust work (new_from_array) runs without GIL. Iterator's next_batch uses &[f64] (no per-batch to_vec). + #[pyo3(signature = (data, batch_size, num_qubits, encoding_method, batch_limit=None))] + fn create_array_loader( + &self, + py: Python<'_>, + data: &Bound<'_, PyAny>, + batch_size: usize, + num_qubits: u32, + encoding_method: &str, + batch_limit: Option, + ) -> PyResult { + let array_2d = data.extract::>().map_err(|_| { + PyRuntimeError::new_err( + "create_array_loader requires a 2D NumPy array (float64, C-contiguous).", + ) + })?; + let shape = array_2d.shape(); + let num_samples = shape[0]; + let sample_size = shape[1]; + let data_slice = array_2d + .as_slice() + .map_err(|_| PyRuntimeError::new_err("NumPy array must be C-contiguous."))?; + let data_vec = data_slice.to_vec(); + let batch_limit = batch_limit.unwrap_or(usize::MAX); + let config = config_from_args( + &self.engine, + batch_size, + num_qubits, + encoding_method, + 0, + None, + qdp_core::reader::NullHandling::FillZero, + ); + let engine = self.engine.clone(); + let iter = py + .detach(|| { + qdp_core::PipelineIterator::new_from_array( + engine, + data_vec, + num_samples, + sample_size, + config, + batch_limit, + ) + }) + .map_err(|e| PyRuntimeError::new_err(format!("create_array_loader failed: {}", e)))?; + Ok(PyQuantumLoader::new(Some(iter))) + } + #[cfg(target_os = "linux")] /// Create a streaming Parquet pipeline iterator (for QuantumDataLoader.source_file(path, streaming=True)). #[allow(clippy::too_many_arguments)] diff --git a/qdp/qdp-python/src/tensor.rs b/qdp/qdp-python/src/tensor.rs index ab341d1ac5..067e4f1857 100644 --- a/qdp/qdp-python/src/tensor.rs +++ b/qdp/qdp-python/src/tensor.rs @@ -14,10 +14,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use numpy::{PyArray2, ndarray::Array2}; use pyo3::exceptions::PyRuntimeError; use pyo3::ffi; use pyo3::prelude::*; use qdp_core::dlpack::DLManagedTensor; +use std::ffi::c_void; + +// CUDA Runtime API — already linked transitively by qdp-core. +unsafe extern "C" { + fn cudaMemcpy(dst: *mut c_void, src: *const c_void, count: usize, kind: i32) -> i32; +} +const CUDA_MEMCPY_DEVICE_TO_HOST: i32 = 2; /// Quantum tensor wrapper implementing DLPack protocol /// @@ -98,6 +106,100 @@ impl QuantumTensor { } } + /// Copy encoded quantum state from GPU to a NumPy array (CPU, float64). + /// + /// Performs a synchronous cudaMemcpy D2H without requiring PyTorch. + /// Complex128 output (imaginary parts are always 0.0 per the CUDA kernel) + /// is reduced to float64 by discarding the zero imaginary components. + /// + /// Returns: + /// numpy.ndarray of shape (batch_size, state_len), dtype float64. + /// + /// Raises: + /// RuntimeError: If the tensor has already been consumed, the pointer is + /// invalid, the dtype is unsupported, or the CUDA copy fails. + #[allow(clippy::wrong_self_convention)] // mut required: sets self.consumed and calls DLPack deleter + fn to_numpy<'py>(&mut self, py: Python<'py>) -> PyResult>> { + if self.consumed { + return Err(PyRuntimeError::new_err( + "DLPack tensor already consumed (can only be used once)", + )); + } + if self.ptr.is_null() { + return Err(PyRuntimeError::new_err("Invalid DLPack tensor pointer")); + } + + let (rows, cols, host_data) = unsafe { + let dl_tensor = &(*self.ptr).dl_tensor; + + // Shape — require 1-D or 2-D. 
+ let ndim = dl_tensor.ndim as usize; + if ndim == 0 || ndim > 2 || dl_tensor.shape.is_null() { + return Err(PyRuntimeError::new_err( + "to_numpy() requires a 1-D or 2-D tensor", + )); + } + let shape = std::slice::from_raw_parts(dl_tensor.shape, ndim); + let (rows, cols) = if ndim == 1 { + (1usize, shape[0] as usize) + } else { + (shape[0] as usize, shape[1] as usize) + }; + + // Dtype: complex128 (DL_COMPLEX=5, bits=128) or float64 (DL_FLOAT=2, bits=64). + let dtype = &dl_tensor.dtype; + let (is_complex, elem_bytes) = match (dtype.code, dtype.bits) { + (5, 128) => (true, 16usize), + (2, 64) => (false, 8usize), + _ => { + return Err(PyRuntimeError::new_err(format!( + "to_numpy() unsupported dtype: code={}, bits={}", + dtype.code, dtype.bits + ))); + } + }; + + let n_elems = rows * cols; + // For complex128 each element is two consecutive f64 values. + let host_f64_count = if is_complex { n_elems * 2 } else { n_elems }; + let mut host_buf = vec![0.0f64; host_f64_count]; + + let data_ptr = (dl_tensor.data as *const u8).add(dl_tensor.byte_offset as usize); + + let ret = cudaMemcpy( + host_buf.as_mut_ptr() as *mut c_void, + data_ptr as *const c_void, + n_elems * elem_bytes, + CUDA_MEMCPY_DEVICE_TO_HOST, + ); + if ret != 0 { + return Err(PyRuntimeError::new_err(format!( + "cudaMemcpy D2H failed with error code {}", + ret + ))); + } + + // Consumed: GPU memory is ours to free now. + self.consumed = true; + if let Some(deleter) = (*self.ptr).deleter { + deleter(self.ptr); + } + + // complex128 → float64: discard imaginary parts (always 0.0). 
+ let host_data: Vec = if is_complex { + host_buf.into_iter().step_by(2).collect() + } else { + host_buf + }; + + (rows, cols, host_data) + }; + + let arr = Array2::from_shape_vec((rows, cols), host_data) + .map_err(|e| PyRuntimeError::new_err(e.to_string()))?; + Ok(PyArray2::from_owned_array(py, arr)) + } + /// Returns DLPack device information /// /// Returns: @@ -122,8 +224,8 @@ impl QuantumTensor { impl Drop for QuantumTensor { fn drop(&mut self) { - // Only free if not consumed by __dlpack__ - // If consumed, PyTorch/consumer will call the deleter + // Only free if not consumed; __dlpack__ leaves freeing to PyTorch, + // to_numpy() calls the deleter itself after the D2H copy. if !self.consumed && !self.ptr.is_null() { unsafe { // Defensive check: qdp-core always provides a deleter diff --git a/qdp/qdp-python/tests/test_quantum_data_loader.py b/qdp/qdp-python/tests/test_quantum_data_loader.py index 8c93c45c56..43e24008a3 100644 --- a/qdp/qdp-python/tests/test_quantum_data_loader.py +++ b/qdp/qdp-python/tests/test_quantum_data_loader.py @@ -16,6 +16,9 @@ """tests for Quantum Data Loader.""" +from unittest.mock import patch + +import numpy as np import pytest try: @@ -28,6 +31,15 @@ def _loader_available(): return QuantumDataLoader is not None +def _cuda_available(): + try: + import torch + + return torch.cuda.is_available() + except ImportError: + return False + + @pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available") def test_mutual_exclusion_both_sources_raises() -> None: """Calling both .source_synthetic() and .source_file() then __iter__ raises ValueError.""" @@ -238,3 +250,134 @@ def test_source_file_s3_streaming_non_parquet_raises(path): ) msg = str(exc_info.value).lower() assert "parquet" in msg or "streaming" in msg + + +# --- as_torch() / as_numpy() output format tests --- + + +@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available") +def 
test_as_torch_raises_at_config_time_when_torch_missing(): + """as_torch() raises RuntimeError immediately (config time) when torch is not installed.""" + with patch("qumat_qdp.loader._torch", None): + loader = QuantumDataLoader(device_id=0).qubits(4).batches(2, size=4) + with pytest.raises(RuntimeError) as exc_info: + loader.as_torch() + msg = str(exc_info.value) + assert "PyTorch" in msg or "torch" in msg.lower() + assert "pip install" in msg + + +@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available") +def test_as_numpy_succeeds_at_config_time_without_torch(): + """as_numpy() does not raise at config time even when torch is not installed.""" + with patch("qumat_qdp.loader._torch", None): + loader = ( + QuantumDataLoader(device_id=0) + .qubits(4) + .batches(2, size=4) + .source_synthetic() + .as_numpy() + ) + assert loader._output_format == ("numpy",) + + +@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available") +@pytest.mark.skipif(not _cuda_available(), reason="CUDA GPU required") +def test_as_numpy_yields_float64_arrays(): + """as_numpy() yields numpy float64 arrays with correct shape; no torch required.""" + num_qubits = 4 + batch_size = 8 + state_len = 2**num_qubits # 16 + + batches = [] + with patch("qumat_qdp.loader._torch", None): + loader = ( + QuantumDataLoader(device_id=0) + .qubits(num_qubits) + .batches(3, size=batch_size) + .source_synthetic() + .as_numpy() + ) + for batch in loader: + batches.append(batch) + + assert len(batches) == 3 + for batch in batches: + assert isinstance(batch, np.ndarray), f"expected ndarray, got {type(batch)}" + assert batch.dtype == np.float64, f"expected float64, got {batch.dtype}" + assert batch.ndim == 2 + assert batch.shape == (batch_size, state_len), f"unexpected shape {batch.shape}" + + +@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available") +@pytest.mark.skipif(not _cuda_available(), reason="CUDA GPU required") +def 
test_as_numpy_amplitudes_are_unit_norm(): + """Each row from as_numpy() should be a unit-norm state vector (amplitude encoding).""" + num_qubits = 4 + batch_size = 16 + + loader = ( + QuantumDataLoader(device_id=0) + .qubits(num_qubits) + .batches(2, size=batch_size) + .source_synthetic() + .as_numpy() + ) + for batch in loader: + arr = np.asarray(batch, dtype=np.float64) + norms = np.linalg.norm(arr, axis=1) + np.testing.assert_allclose(norms, 1.0, atol=1e-5) + + +@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available") +@pytest.mark.skipif(not _cuda_available(), reason="CUDA GPU required") +def test_as_torch_yields_cuda_tensors(): + """as_torch(device='cuda') yields torch tensors on CUDA.""" + try: + import torch + except ImportError: + pytest.skip("torch not installed") + + num_qubits = 4 + batch_size = 8 + state_len = 2**num_qubits + + loader = ( + QuantumDataLoader(device_id=0) + .qubits(num_qubits) + .batches(2, size=batch_size) + .source_synthetic() + .as_torch(device="cuda") + ) + for batch in loader: + assert isinstance(batch, torch.Tensor) + assert batch.is_cuda + assert batch.shape == (batch_size, state_len) + + +@pytest.mark.skipif(not _loader_available(), reason="QuantumDataLoader not available") +@pytest.mark.skipif(not _cuda_available(), reason="CUDA GPU required") +def test_as_numpy_from_source_array(): + """as_numpy() works with source_array(), yielding correct shapes and dtype.""" + num_qubits = 3 + state_len = 2**num_qubits # 8 + n_samples = 12 + batch_size = 4 + + rng = np.random.default_rng(42) + X = rng.standard_normal((n_samples, state_len)) + + loader = ( + QuantumDataLoader(device_id=0) + .qubits(num_qubits) + .batches(1, size=batch_size) + .encoding("amplitude") + .source_array(X) + .as_numpy() + ) + batches = list(loader) + assert len(batches) == n_samples // batch_size + for batch in batches: + assert isinstance(batch, np.ndarray) + assert batch.dtype == np.float64 + assert batch.shape[1] == state_len diff 
--git a/qdp/qdp-python/uv.lock b/qdp/qdp-python/uv.lock index dbba6c7b39..547461d8e8 100644 --- a/qdp/qdp-python/uv.lock +++ b/qdp/qdp-python/uv.lock @@ -827,34 +827,42 @@ wheels = [ [[package]] name = "nvidia-cublas-cu12" -version = "12.8.4.1" +version = "12.6.4.1" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/61/e24b560ab2e2eaeb3c839129175fb330dfcfc29e5203196e5541a4c44682/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8ac4e771d5a348c551b2a426eda6193c19aa630236b418086020df5ba9667142", size = 594346921, upload-time = "2025-03-07T01:44:31.254Z" }, + { url = "https://files.pythonhosted.org/packages/af/eb/ff4b8c503fa1f1796679dce648854d58751982426e4e4b37d6fce49d259c/nvidia_cublas_cu12-12.6.4.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:08ed2686e9875d01b58e3cb379c6896df8e76c75e0d4a7f7dace3d7b6d9ef8eb", size = 393138322, upload-time = "2024-11-20T17:40:25.65Z" }, + { url = "https://files.pythonhosted.org/packages/97/0d/f1f0cadbf69d5b9ef2e4f744c9466cb0a850741d08350736dfdb4aa89569/nvidia_cublas_cu12-12.6.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:235f728d6e2a409eddf1df58d5b0921cf80cfa9e72b9f2775ccb7b4a87984668", size = 390794615, upload-time = "2024-11-20T17:39:52.715Z" }, ] [[package]] name = "nvidia-cuda-cupti-cu12" -version = "12.8.90" +version = "12.6.80" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/f8/02/2adcaa145158bf1a8295d83591d22e4103dbfd821bcaf6f3f53151ca4ffa/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ea0cb07ebda26bb9b29ba82cda34849e73c166c18162d3913575b0c9db9a6182", size = 10248621, upload-time = "2025-03-07T01:40:21.213Z" }, + { url = 
"https://files.pythonhosted.org/packages/e6/8b/2f6230cb715646c3a9425636e513227ce5c93c4d65823a734f4bb86d43c3/nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:166ee35a3ff1587f2490364f90eeeb8da06cd867bd5b701bf7f9a02b78bc63fc", size = 8236764, upload-time = "2024-11-20T17:35:41.03Z" }, + { url = "https://files.pythonhosted.org/packages/25/0f/acb326ac8fd26e13c799e0b4f3b2751543e1834f04d62e729485872198d4/nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_aarch64.whl", hash = "sha256:358b4a1d35370353d52e12f0a7d1769fc01ff74a191689d3870b2123156184c4", size = 8236756, upload-time = "2024-10-01T16:57:45.507Z" }, + { url = "https://files.pythonhosted.org/packages/49/60/7b6497946d74bcf1de852a21824d63baad12cd417db4195fc1bfe59db953/nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6768bad6cab4f19e8292125e5f1ac8aa7d1718704012a0e3272a6f61c4bce132", size = 8917980, upload-time = "2024-11-20T17:36:04.019Z" }, + { url = "https://files.pythonhosted.org/packages/a5/24/120ee57b218d9952c379d1e026c4479c9ece9997a4fb46303611ee48f038/nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a3eff6cdfcc6a4c35db968a06fcadb061cbc7d6dde548609a941ff8701b98b73", size = 8917972, upload-time = "2024-10-01T16:58:06.036Z" }, ] [[package]] name = "nvidia-cuda-nvrtc-cu12" -version = "12.8.93" +version = "12.6.77" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/05/6b/32f747947df2da6994e999492ab306a903659555dddc0fbdeb9d71f75e52/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:a7756528852ef889772a84c6cd89d41dfa74667e24cca16bb31f8f061e3e9994", size = 88040029, upload-time = "2025-03-07T01:42:13.562Z" }, + { url = 
"https://files.pythonhosted.org/packages/f4/2f/72df534873235983cc0a5371c3661bebef7c4682760c275590b972c7b0f9/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5847f1d6e5b757f1d2b3991a01082a44aad6f10ab3c5c0213fa3e25bddc25a13", size = 23162955, upload-time = "2024-10-01T16:59:50.922Z" }, + { url = "https://files.pythonhosted.org/packages/75/2e/46030320b5a80661e88039f59060d1790298b4718944a65a7f2aeda3d9e9/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:35b0cc6ee3a9636d5409133e79273ce1f3fd087abb0532d2d2e8fff1fe9efc53", size = 23650380, upload-time = "2024-10-01T17:00:14.643Z" }, ] [[package]] name = "nvidia-cuda-runtime-cu12" -version = "12.8.90" +version = "12.6.77" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0d/9b/a997b638fcd068ad6e4d53b8551a7d30fe8b404d6f1804abf1df69838932/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adade8dcbd0edf427b7204d480d6066d33902cab2a4707dcfc48a2d0fd44ab90", size = 954765, upload-time = "2025-03-07T01:40:01.615Z" }, + { url = "https://files.pythonhosted.org/packages/8f/ea/590b2ac00d772a8abd1c387a92b46486d2679ca6622fd25c18ff76265663/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6116fad3e049e04791c0256a9778c16237837c08b27ed8c8401e2e45de8d60cd", size = 908052, upload-time = "2024-11-20T17:35:19.905Z" }, + { url = "https://files.pythonhosted.org/packages/b7/3d/159023799677126e20c8fd580cca09eeb28d5c5a624adc7f793b9aa8bbfa/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d461264ecb429c84c8879a7153499ddc7b19b5f8d84c204307491989a365588e", size = 908040, upload-time = "2024-10-01T16:57:22.221Z" }, + { url = 
"https://files.pythonhosted.org/packages/e1/23/e717c5ac26d26cf39a27fbc076240fad2e3b817e5889d671b67f4f9f49c5/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ba3b56a4f896141e25e19ab287cd71e52a6a0f4b29d0d31609f60e3b4d5219b7", size = 897690, upload-time = "2024-11-20T17:35:30.697Z" }, + { url = "https://files.pythonhosted.org/packages/f0/62/65c05e161eeddbafeca24dc461f47de550d9fa8a7e04eb213e32b55cfd99/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a84d15d5e1da416dd4774cb42edf5e954a3e60cc945698dc1d5be02321c44dc8", size = 897678, upload-time = "2024-10-01T16:57:33.821Z" }, ] [[package]] @@ -865,39 +873,47 @@ dependencies = [ { name = "nvidia-cublas-cu12" }, ] wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/41/e79269ce215c857c935fd86bcfe91a451a584dfc27f1e068f568b9ad1ab7/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:c9132cc3f8958447b4910a1720036d9eff5928cc3179b0a51fb6d167c6cc87d8", size = 705026878, upload-time = "2025-06-06T21:52:51.348Z" }, { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" }, ] [[package]] name = "nvidia-cufft-cu12" -version = "11.3.3.83" +version = "11.3.0.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "nvidia-nvjitlink-cu12" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" }, + { url = 
"https://files.pythonhosted.org/packages/1f/37/c50d2b2f2c07e146776389e3080f4faf70bcc4fa6e19d65bb54ca174ebc3/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d16079550df460376455cba121db6564089176d9bac9e4f360493ca4741b22a6", size = 200164144, upload-time = "2024-11-20T17:40:58.288Z" }, + { url = "https://files.pythonhosted.org/packages/ce/f5/188566814b7339e893f8d210d3a5332352b1409815908dad6a363dcceac1/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8510990de9f96c803a051822618d42bf6cb8f069ff3f48d93a8486efdacb48fb", size = 200164135, upload-time = "2024-10-01T17:03:24.212Z" }, + { url = "https://files.pythonhosted.org/packages/8f/16/73727675941ab8e6ffd86ca3a4b7b47065edcca7a997920b831f8147c99d/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ccba62eb9cef5559abd5e0d54ceed2d9934030f51163df018532142a8ec533e5", size = 200221632, upload-time = "2024-11-20T17:41:32.357Z" }, + { url = "https://files.pythonhosted.org/packages/60/de/99ec247a07ea40c969d904fc14f3a356b3e2a704121675b75c366b694ee1/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_x86_64.whl", hash = "sha256:768160ac89f6f7b459bee747e8d175dbf53619cfe74b2a5636264163138013ca", size = 200221622, upload-time = "2024-10-01T17:03:58.79Z" }, ] [[package]] name = "nvidia-cufile-cu12" -version = "1.13.1.3" +version = "1.11.1.6" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/bb/fe/1bcba1dfbfb8d01be8d93f07bfc502c93fa23afa6fd5ab3fc7c1df71038a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d069003be650e131b21c932ec3d8969c1715379251f8d23a1860554b1cb24fc", size = 1197834, upload-time = "2025-03-07T01:45:50.723Z" }, + { url = 
"https://files.pythonhosted.org/packages/b2/66/cc9876340ac68ae71b15c743ddb13f8b30d5244af344ec8322b449e35426/nvidia_cufile_cu12-1.11.1.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cc23469d1c7e52ce6c1d55253273d32c565dd22068647f3aa59b3c6b005bf159", size = 1142103, upload-time = "2024-11-20T17:42:11.83Z" }, + { url = "https://files.pythonhosted.org/packages/17/bf/cc834147263b929229ce4aadd62869f0b195e98569d4c28b23edc72b85d9/nvidia_cufile_cu12-1.11.1.6-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:8f57a0051dcf2543f6dc2b98a98cb2719c37d3cee1baba8965d57f3bbc90d4db", size = 1066155, upload-time = "2024-11-20T17:41:49.376Z" }, ] [[package]] name = "nvidia-curand-cu12" -version = "10.3.9.90" +version = "10.3.7.77" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/fb/aa/6584b56dc84ebe9cf93226a5cde4d99080c8e90ab40f0c27bda7a0f29aa1/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:b32331d4f4df5d6eefa0554c565b626c7216f87a06a4f56fab27c3b68a830ec9", size = 63619976, upload-time = "2025-03-07T01:46:23.323Z" }, + { url = "https://files.pythonhosted.org/packages/42/ac/36543605358a355632f1a6faa3e2d5dfb91eab1e4bc7d552040e0383c335/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:6e82df077060ea28e37f48a3ec442a8f47690c7499bff392a5938614b56c98d8", size = 56289881, upload-time = "2024-10-01T17:04:18.981Z" }, + { url = "https://files.pythonhosted.org/packages/73/1b/44a01c4e70933637c93e6e1a8063d1e998b50213a6b65ac5a9169c47e98e/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a42cd1344297f70b9e39a1e4f467a4e1c10f1da54ff7a85c12197f6c652c8bdf", size = 56279010, upload-time = "2024-11-20T17:42:50.958Z" }, + { url = "https://files.pythonhosted.org/packages/4a/aa/2c7ff0b5ee02eaef890c0ce7d4f74bc30901871c5e45dee1ae6d0083cd80/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_x86_64.whl", hash 
= "sha256:99f1a32f1ac2bd134897fc7a203f779303261268a65762a623bf30cc9fe79117", size = 56279000, upload-time = "2024-10-01T17:04:45.274Z" }, + { url = "https://files.pythonhosted.org/packages/a6/02/5362a9396f23f7de1dd8a64369e87c85ffff8216fc8194ace0fa45ba27a5/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:7b2ed8e95595c3591d984ea3603dd66fe6ce6812b886d59049988a712ed06b6e", size = 56289882, upload-time = "2024-11-20T17:42:25.222Z" }, ] [[package]] name = "nvidia-cusolver-cu12" -version = "11.7.3.90" +version = "11.7.1.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "nvidia-cublas-cu12" }, @@ -905,18 +921,24 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" }, + { url = "https://files.pythonhosted.org/packages/93/17/dbe1aa865e4fdc7b6d4d0dd308fdd5aaab60f939abfc0ea1954eac4fb113/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0ce237ef60acde1efc457335a2ddadfd7610b892d94efee7b776c64bb1cac9e0", size = 157833628, upload-time = "2024-10-01T17:05:05.591Z" }, + { url = "https://files.pythonhosted.org/packages/f0/6e/c2cf12c9ff8b872e92b4a5740701e51ff17689c4d726fca91875b07f655d/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e9e49843a7707e42022babb9bcfa33c29857a93b88020c4e4434656a655b698c", size = 158229790, upload-time = "2024-11-20T17:43:43.211Z" }, + { url = "https://files.pythonhosted.org/packages/9f/81/baba53585da791d043c10084cf9553e074548408e04ae884cfe9193bd484/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_x86_64.whl", hash = 
"sha256:6cf28f17f64107a0c4d7802be5ff5537b2130bfc112f25d5a30df227058ca0e6", size = 158229780, upload-time = "2024-10-01T17:05:39.875Z" }, + { url = "https://files.pythonhosted.org/packages/7c/5f/07d0ba3b7f19be5a5ec32a8679fc9384cfd9fc6c869825e93be9f28d6690/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:dbbe4fc38ec1289c7e5230e16248365e375c3673c9c8bac5796e2e20db07f56e", size = 157833630, upload-time = "2024-11-20T17:43:16.77Z" }, ] [[package]] name = "nvidia-cusparse-cu12" -version = "12.5.8.93" +version = "12.5.4.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "nvidia-nvjitlink-cu12" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" }, + { url = "https://files.pythonhosted.org/packages/eb/eb/6681efd0aa7df96b4f8067b3ce7246833dd36830bb4cec8896182773db7d/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d25b62fb18751758fe3c93a4a08eff08effedfe4edf1c6bb5afd0890fe88f887", size = 216451147, upload-time = "2024-11-20T17:44:18.055Z" }, + { url = "https://files.pythonhosted.org/packages/d3/56/3af21e43014eb40134dea004e8d0f1ef19d9596a39e4d497d5a7de01669f/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7aa32fa5470cf754f72d1116c7cbc300b4e638d3ae5304cfa4a638a5b87161b1", size = 216451135, upload-time = "2024-10-01T17:06:03.826Z" }, + { url = "https://files.pythonhosted.org/packages/06/1e/b8b7c2f4099a37b96af5c9bb158632ea9e5d9d27d7391d7eb8fc45236674/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7556d9eca156e18184b94947ade0fba5bb47d69cec46bf8660fd2c71a4b48b73", size = 
216561367, upload-time = "2024-11-20T17:44:54.824Z" }, + { url = "https://files.pythonhosted.org/packages/43/ac/64c4316ba163e8217a99680c7605f779accffc6a4bcd0c778c12948d3707/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:23749a6571191a215cb74d1cdbff4a86e7b19f1200c071b3fcf844a5bea23a2f", size = 216561357, upload-time = "2024-10-01T17:06:29.861Z" }, ] [[package]] @@ -924,31 +946,46 @@ name = "nvidia-cusparselt-cu12" version = "0.7.1" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/73/b9/598f6ff36faaece4b3c50d26f50e38661499ff34346f00e057760b35cc9d/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8878dce784d0fac90131b6817b607e803c36e629ba34dc5b433471382196b6a5", size = 283835557, upload-time = "2025-02-26T00:16:54.265Z" }, { url = "https://files.pythonhosted.org/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623", size = 287193691, upload-time = "2025-02-26T00:15:44.104Z" }, ] [[package]] name = "nvidia-nccl-cu12" -version = "2.27.3" +version = "2.27.5" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/5c/5b/4e4fff7bad39adf89f735f2bc87248c81db71205b62bcc0d5ca5b606b3c3/nvidia_nccl_cu12-2.27.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adf27ccf4238253e0b826bce3ff5fa532d65fc42322c8bfdfaf28024c0fbe039", size = 322364134, upload-time = "2025-06-03T21:58:04.013Z" }, + { url = "https://files.pythonhosted.org/packages/bb/1c/857979db0ef194ca5e21478a0612bcdbbe59458d7694361882279947b349/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:31432ad4d1fb1004eb0c56203dc9bc2178a1ba69d1d9e02d64a6938ab5e40e7a", size = 322400625, upload-time = 
"2025-06-26T04:11:04.496Z" }, + { url = "https://files.pythonhosted.org/packages/6e/89/f7a07dc961b60645dbbf42e80f2bc85ade7feb9a491b11a1e973aa00071f/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457", size = 322348229, upload-time = "2025-06-26T04:11:28.385Z" }, ] [[package]] name = "nvidia-nvjitlink-cu12" -version = "12.8.93" +version = "12.6.85" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/f6/74/86a07f1d0f42998ca31312f998bd3b9a7eff7f52378f4f270c8679c77fb9/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:81ff63371a7ebd6e6451970684f916be2eab07321b73c9d244dc2b4da7f73b88", size = 39254836, upload-time = "2025-03-07T01:49:55.661Z" }, + { url = "https://files.pythonhosted.org/packages/9d/d7/c5383e47c7e9bf1c99d5bd2a8c935af2b6d705ad831a7ec5c97db4d82f4f/nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:eedc36df9e88b682efe4309aa16b5b4e78c2407eac59e8c10a6a47535164369a", size = 19744971, upload-time = "2024-11-20T17:46:53.366Z" }, + { url = "https://files.pythonhosted.org/packages/31/db/dc71113d441f208cdfe7ae10d4983884e13f464a6252450693365e166dcf/nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cf4eaa7d4b6b543ffd69d6abfb11efdeb2db48270d94dfd3a452c24150829e41", size = 19270338, upload-time = "2024-11-20T17:46:29.758Z" }, +] + +[[package]] +name = "nvidia-nvshmem-cu12" +version = "3.3.20" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/92/9d/3dd98852568fb845ec1f7902c90a22b240fe1cbabda411ccedf2fd737b7b/nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b0b960da3842212758e4fa4696b94f129090b30e5122fea3c5345916545cff0", size = 
124484616, upload-time = "2025-08-04T20:24:59.172Z" }, + { url = "https://files.pythonhosted.org/packages/3b/6c/99acb2f9eb85c29fc6f3a7ac4dccfd992e22666dd08a642b303311326a97/nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d00f26d3f9b2e3c3065be895e3059d6479ea5c638a3f38c9fec49b1b9dd7c1e5", size = 124657145, upload-time = "2025-08-04T20:25:19.995Z" }, ] [[package]] name = "nvidia-nvtx-cu12" -version = "12.8.90" +version = "12.6.77" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" }, + { url = "https://files.pythonhosted.org/packages/b9/93/80f8a520375af9d7ee44571a6544653a176e53c2b8ccce85b97b83c2491b/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f44f8d86bb7d5629988d61c8d3ae61dddb2015dee142740536bc7481b022fe4b", size = 90549, upload-time = "2024-11-20T17:38:17.387Z" }, + { url = "https://files.pythonhosted.org/packages/2b/53/36e2fd6c7068997169b49ffc8c12d5af5e5ff209df6e1a2c4d373b3a638f/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:adcaabb9d436c9761fca2b13959a2d237c5f9fd406c8e4b723c695409ff88059", size = 90539, upload-time = "2024-10-01T17:00:27.179Z" }, + { url = "https://files.pythonhosted.org/packages/56/9a/fff8376f8e3d084cd1530e1ef7b879bb7d6d265620c95c1b322725c694f4/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b90bed3df379fa79afbd21be8e04a0314336b8ae16768b58f2d34cb1d04cd7d2", size = 89276, upload-time = "2024-11-20T17:38:27.621Z" }, + { url = 
"https://files.pythonhosted.org/packages/9e/4e/0d0c945463719429b7bd21dece907ad0bde437a2ff12b9b12fee94722ab0/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:6574241a3ec5fdc9334353ab8c479fe75841dbe8f4532a8fc97ce63503330ba1", size = 89265, upload-time = "2024-10-01T17:00:38.172Z" }, ] [[package]] @@ -1096,7 +1133,7 @@ dependencies = [ { name = "networkx", version = "3.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "numpy", marker = "python_full_version >= '3.11'" }, { name = "packaging", marker = "python_full_version >= '3.11'" }, - { name = "pennylane-lightning", version = "0.43.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "pennylane-lightning", version = "0.44.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "requests", marker = "python_full_version >= '3.11'" }, { name = "rustworkx", marker = "python_full_version >= '3.11'" }, { name = "scipy", version = "1.16.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, @@ -1140,7 +1177,7 @@ wheels = [ [[package]] name = "pennylane-lightning" -version = "0.43.0" +version = "0.44.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version >= '3.12'", @@ -1150,19 +1187,17 @@ dependencies = [ { name = "pennylane", version = "0.43.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "scipy-openblas32", marker = "python_full_version >= '3.11'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/50/77/e7b484fda69da63fe02c4f56374dbc1e00aaf5492f8799c1b8ecb92c0e1f/pennylane_lightning-0.43.0.tar.gz", hash = "sha256:ee6f34d4733be0e1d1ba1a12b3a9d3672c9fa455786dbc062176bfe028d6c69d", size = 785957, upload-time = "2025-10-15T13:20:39.482Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/ba/41/ce4d7728b0faf7c77c4e18e2bca77b6ba52c3cc43f5a321ea6596c963e9e/pennylane_lightning-0.44.0.tar.gz", hash = "sha256:4d7383ab8b53af17d14f5b9985afa867a0cec10d224bd068259d824eba812e7a", size = 791167, upload-time = "2026-01-13T15:36:40.241Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e3/33/8f2c98b82fd560a97ce724e027d5f806babe26769b7e21d01ec064457083/pennylane_lightning-0.43.0-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:2071e116c03c82a29a036ec0f529e29cabad248f3595c36a40452fcec1f13353", size = 1725043, upload-time = "2025-10-15T13:18:51.595Z" }, - { url = "https://files.pythonhosted.org/packages/ba/b5/321819f3702b90334dd34484655c09b152f891c3c4b5e374d22df81a3655/pennylane_lightning-0.43.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:cfa8422b7827b4be6240f6b52b298c91811bd50ab7b9702d6ea02282c4d559af", size = 2172500, upload-time = "2025-10-15T13:18:54.839Z" }, - { url = "https://files.pythonhosted.org/packages/89/52/408f138ebd0a0eb0014f23509be02c2ad4f490ed69d802efeb078dc21272/pennylane_lightning-0.43.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8d24c919c7508aaa8e54b51d5626890c730204f78c52556bb559273b85d792dc", size = 2017459, upload-time = "2025-10-15T13:18:57.621Z" }, - { url = "https://files.pythonhosted.org/packages/30/4c/43344cf028a228cc5162e734aa2a77d1e609dc3ca9b6bfd8fe541028a313/pennylane_lightning-0.43.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:32eb8fa0332b54969bb4693e4bd15e96273bc15e0e81af9b29b8a516a407453d", size = 2464101, upload-time = "2025-10-15T13:19:00.575Z" }, - { url = "https://files.pythonhosted.org/packages/e8/ea/3a4d6b6552a9ab0368a14c8e88f85635e5aa49d0bafa9dbb08701ff0e6b1/pennylane_lightning-0.43.0-cp311-cp311-win_amd64.whl", hash = "sha256:165dba4748398b5852b91be312f690ee5567b860f054d7fbb6270da6b68f7e84", size = 5383887, upload-time = "2025-10-15T13:19:05.769Z" }, - { url = 
"https://files.pythonhosted.org/packages/c5/a9/9598c87859109bd74358a8f2623b586791028337c0f2ebe257567e38ba03/pennylane_lightning-0.43.0-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:3c4017557c84ed4334b05e2e33ef40407474195247f8e97434c097c8cef1f5e3", size = 1724477, upload-time = "2025-10-15T13:19:09.443Z" }, - { url = "https://files.pythonhosted.org/packages/2e/63/f60ebce7ec4dea995be8a26645f841379d57b45db25c37584d1bbe9745e0/pennylane_lightning-0.43.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:e4e27d0f892ba587e0fe274a9a349fbbdd5727ed898223a65c9d049a6f7609c1", size = 2173061, upload-time = "2025-10-15T13:19:12.306Z" }, - { url = "https://files.pythonhosted.org/packages/bc/4d/0ec98912a480d51d4433007b38a7682a8c975b03463c8cc7e91ee99241ca/pennylane_lightning-0.43.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:44820f846805d0919f3a85cdfc8938913af3c99418729d01cf1a6c3de7d862ba", size = 2016744, upload-time = "2025-10-15T13:19:15.018Z" }, - { url = "https://files.pythonhosted.org/packages/dd/e9/83d5460175bbe2701587d288c026eeabbdc4a23168fcee5a572be45115c8/pennylane_lightning-0.43.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6fc236ff206866d7ef5deed733a3bef719b0bda0476be577983a7bda8c516c68", size = 2463530, upload-time = "2025-10-15T13:19:17.763Z" }, - { url = "https://files.pythonhosted.org/packages/a3/a2/46cbbc0788890cae778ce5454151d2e0a3a5dcbe1e12941c7351d05e0106/pennylane_lightning-0.43.0-cp312-cp312-win_amd64.whl", hash = "sha256:35bab12effe2ce3c652fef86ce2c32c5140c5e9c895d172fe99b77dceabe35cf", size = 5381017, upload-time = "2025-10-15T13:19:22.633Z" }, - { url = "https://files.pythonhosted.org/packages/a1/5f/b57d29a6794975b8dbed4afd5755a0b8d5f979e3b52d1bc986a28fa7fc82/pennylane_lightning-0.43.0-py3-none-any.whl", hash = "sha256:f8ac2d58d48133728bbb801cbf6f8f58808b878b44c32143a01ef703658a6d14", size = 1034810, upload-time = "2025-10-15T13:19:40.301Z" }, + { url = 
"https://files.pythonhosted.org/packages/02/70/41e014c3fa7c94839da771acd6d293e597c5ee493ef91834bcfe7bf8743a/pennylane_lightning-0.44.0-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:9a492cb23d631b83f1493e1eb3ff0437e9e29c41921b0ed41d4cff7f016b98b8", size = 1725460, upload-time = "2026-01-13T15:35:18.008Z" }, + { url = "https://files.pythonhosted.org/packages/c2/53/728b93e80ef6a968d715c11c0de3ee2953cc934182a4d8de454aa6d5eb3e/pennylane_lightning-0.44.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:048df8e23a62bde4046162c1229ecfdd8cb7f17b8a16cb5a7c6f68280aff024f", size = 2020368, upload-time = "2026-01-13T15:35:19.913Z" }, + { url = "https://files.pythonhosted.org/packages/25/71/703d4df1fd010fab517337ff12403ee4d040b48d45663c61145e80a36f06/pennylane_lightning-0.44.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4a1cb8827a05e58596f632fb02e014392d92c38a2e471817cd0cb826cb995305", size = 2470668, upload-time = "2026-01-13T15:35:21.257Z" }, + { url = "https://files.pythonhosted.org/packages/27/23/31695ff221cb7ff4574c9567ce24d431962c42d4692c48c037a048cdb56d/pennylane_lightning-0.44.0-cp311-cp311-win_amd64.whl", hash = "sha256:6809ea3a0982c478b1434aaf0e78ca19bdafbddf27ea9ed04378cde5494fe1a7", size = 5411322, upload-time = "2026-01-13T15:35:23.015Z" }, + { url = "https://files.pythonhosted.org/packages/d4/76/c2339362329f468981b4ac24d59982ab06e9a3c4561928d0a4c1bd0d4720/pennylane_lightning-0.44.0-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:89d84f6b24675f011695d7be4a6dbe7821224f5f96747413367135b5c53ae414", size = 1724936, upload-time = "2026-01-13T15:35:25.034Z" }, + { url = "https://files.pythonhosted.org/packages/35/22/dfb5af72c9bf9f85bdf114be4204369894a2b9d9d205ed180df422ff93a0/pennylane_lightning-0.44.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aae687962962f2d2a8620740c7b0eb2de16aba40f080bec64519652e3a25fba3", size = 2019499, upload-time = "2026-01-13T15:35:26.824Z" 
}, + { url = "https://files.pythonhosted.org/packages/cd/98/30c3164b620f89dfec71c05359a1025e15d695a42dbdbc6350f664fc6b58/pennylane_lightning-0.44.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e2030fcebea3cfc7e8d6fc4a5aa821704ffcc15ed4ba76bf653facb7d8ebe39", size = 2469486, upload-time = "2026-01-13T15:35:28.663Z" }, + { url = "https://files.pythonhosted.org/packages/3c/8a/418d1a9f8e292d322a66eac83c8e4b4f48e01f24e629b9b496689412cc0f/pennylane_lightning-0.44.0-cp312-cp312-win_amd64.whl", hash = "sha256:26e7d79a816da3a659ceba554999d1781cc1829699f544ff733cc3dbe2c6f83c", size = 5409508, upload-time = "2026-01-13T15:35:30.558Z" }, + { url = "https://files.pythonhosted.org/packages/cf/d1/b681ae8546b264a4c9d999b8e57a3291cbf38edf39194d5416fbae19f8af/pennylane_lightning-0.44.0-py3-none-any.whl", hash = "sha256:a5a257f89c623565df68f987437de380495299b1271b935e1269717198668e71", size = 1037558, upload-time = "2026-01-13T15:35:46.04Z" }, ] [[package]] @@ -1440,12 +1475,12 @@ benchmark = [ { name = "qiskit-aer", specifier = ">=0.17.2" }, { name = "scikit-learn", specifier = ">=1.3" }, { name = "tensorflow", specifier = ">=2.20" }, - { name = "torch", specifier = ">=2.2,<=2.9.0" }, + { name = "torch", specifier = ">=2.2,<=2.9.0", index = "https://download.pytorch.org/whl/cu126" }, { name = "tqdm" }, ] dev = [ { name = "pytest" }, - { name = "torch", specifier = ">=2.2,<=2.9.0" }, + { name = "torch", specifier = ">=2.2,<=2.9.0", index = "https://download.pytorch.org/whl/cu126" }, ] [[package]] @@ -1823,46 +1858,44 @@ wheels = [ [[package]] name = "torch" -version = "2.8.0" -source = { registry = "https://pypi.org/simple" } +version = "2.9.0+cu126" +source = { registry = "https://download.pytorch.org/whl/cu126" } dependencies = [ { name = "filelock" }, { name = "fsspec" }, { name = "jinja2" }, { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "networkx", 
version = "3.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cufile-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu12", marker = 
"sys_platform == 'linux'" }, + { name = "nvidia-cufile-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-curand-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusolver-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvshmem-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu12", marker = "sys_platform == 'linux'" }, { name = "setuptools", marker = "python_full_version >= '3.12'" }, { name = "sympy" }, - { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "triton", marker = "sys_platform == 'linux'" }, { name = "typing-extensions" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/63/28/110f7274254f1b8476c561dada127173f994afa2b1ffc044efb773c15650/torch-2.8.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:0be92c08b44009d4131d1ff7a8060d10bafdb7ddcb7359ef8d8c5169007ea905", size = 102052793, upload-time = "2025-08-06T14:53:15.852Z" }, - { url = "https://files.pythonhosted.org/packages/70/1c/58da560016f81c339ae14ab16c98153d51c941544ae568da3cb5b1ceb572/torch-2.8.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:89aa9ee820bb39d4d72b794345cccef106b574508dd17dbec457949678c76011", size = 888025420, upload-time = "2025-08-06T14:54:18.014Z" }, - { url = "https://files.pythonhosted.org/packages/70/87/f69752d0dd4ba8218c390f0438130c166fa264a33b7025adb5014b92192c/torch-2.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:e8e5bf982e87e2b59d932769938b698858c64cc53753894be25629bdf5cf2f46", size = 241363614, upload-time = "2025-08-06T14:53:31.496Z" }, - { url = 
"https://files.pythonhosted.org/packages/ef/d6/e6d4c57e61c2b2175d3aafbfb779926a2cfd7c32eeda7c543925dceec923/torch-2.8.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:a3f16a58a9a800f589b26d47ee15aca3acf065546137fc2af039876135f4c760", size = 73611154, upload-time = "2025-08-06T14:53:10.919Z" }, - { url = "https://files.pythonhosted.org/packages/8f/c4/3e7a3887eba14e815e614db70b3b529112d1513d9dae6f4d43e373360b7f/torch-2.8.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:220a06fd7af8b653c35d359dfe1aaf32f65aa85befa342629f716acb134b9710", size = 102073391, upload-time = "2025-08-06T14:53:20.937Z" }, - { url = "https://files.pythonhosted.org/packages/5a/63/4fdc45a0304536e75a5e1b1bbfb1b56dd0e2743c48ee83ca729f7ce44162/torch-2.8.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:c12fa219f51a933d5f80eeb3a7a5d0cbe9168c0a14bbb4055f1979431660879b", size = 888063640, upload-time = "2025-08-06T14:55:05.325Z" }, - { url = "https://files.pythonhosted.org/packages/84/57/2f64161769610cf6b1c5ed782bd8a780e18a3c9d48931319f2887fa9d0b1/torch-2.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:8c7ef765e27551b2fbfc0f41bcf270e1292d9bf79f8e0724848b1682be6e80aa", size = 241366752, upload-time = "2025-08-06T14:53:38.692Z" }, - { url = "https://files.pythonhosted.org/packages/a4/5e/05a5c46085d9b97e928f3f037081d3d2b87fb4b4195030fc099aaec5effc/torch-2.8.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:5ae0524688fb6707c57a530c2325e13bb0090b745ba7b4a2cd6a3ce262572916", size = 73621174, upload-time = "2025-08-06T14:53:25.44Z" }, - { url = "https://files.pythonhosted.org/packages/49/0c/2fd4df0d83a495bb5e54dca4474c4ec5f9c62db185421563deeb5dabf609/torch-2.8.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:e2fab4153768d433f8ed9279c8133a114a034a61e77a3a104dcdf54388838705", size = 101906089, upload-time = "2025-08-06T14:53:52.631Z" }, - { url = 
"https://files.pythonhosted.org/packages/99/a8/6acf48d48838fb8fe480597d98a0668c2beb02ee4755cc136de92a0a956f/torch-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b2aca0939fb7e4d842561febbd4ffda67a8e958ff725c1c27e244e85e982173c", size = 887913624, upload-time = "2025-08-06T14:56:44.33Z" }, - { url = "https://files.pythonhosted.org/packages/af/8a/5c87f08e3abd825c7dfecef5a0f1d9aa5df5dd0e3fd1fa2f490a8e512402/torch-2.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:2f4ac52f0130275d7517b03a33d2493bab3693c83dcfadf4f81688ea82147d2e", size = 241326087, upload-time = "2025-08-06T14:53:46.503Z" }, - { url = "https://files.pythonhosted.org/packages/be/66/5c9a321b325aaecb92d4d1855421e3a055abd77903b7dab6575ca07796db/torch-2.8.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:619c2869db3ada2c0105487ba21b5008defcc472d23f8b80ed91ac4a380283b0", size = 73630478, upload-time = "2025-08-06T14:53:57.144Z" }, + { url = "https://download.pytorch.org/whl/cu126/torch-2.9.0%2Bcu126-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:31ef6cf39c85a368b09b4fcb92e520ea6dae0121faba28107d8eab6f78f67d51" }, + { url = "https://download.pytorch.org/whl/cu126/torch-2.9.0%2Bcu126-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:653962a66d992e3ba850154356e9ecd83c9beb07663065a3a01d083c8c49b6a5" }, + { url = "https://download.pytorch.org/whl/cu126/torch-2.9.0%2Bcu126-cp310-cp310-win_amd64.whl", hash = "sha256:e8fa700af633d4dcfacc39e8e4d75827d13023243292d9a7fe1e5e5215a6e633" }, + { url = "https://download.pytorch.org/whl/cu126/torch-2.9.0%2Bcu126-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:0ac8362cd4c8c85af5c865fb63a4580656f5f1aae39e77469a84dfb3d6c979d0" }, + { url = "https://download.pytorch.org/whl/cu126/torch-2.9.0%2Bcu126-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:bd3329d3da1786cfd993eca23f0d1213f939145c5aa2ddadd1b0f6dbc37be17d" }, + { url = "https://download.pytorch.org/whl/cu126/torch-2.9.0%2Bcu126-cp311-cp311-win_amd64.whl", hash = 
"sha256:94fc90845de9324943c2f4f5ebffca35df32135e562cd040c3b5cc17259bbc8a" }, + { url = "https://download.pytorch.org/whl/cu126/torch-2.9.0%2Bcu126-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:f80b9bdd81a8d4d48bea4fbab027b728e399bf733a7330f521924ae25aa48958" }, + { url = "https://download.pytorch.org/whl/cu126/torch-2.9.0%2Bcu126-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ea68e3146cd7d770c662f0120f18b8b4a6d96be4314e7196047b282887828cfb" }, + { url = "https://download.pytorch.org/whl/cu126/torch-2.9.0%2Bcu126-cp312-cp312-win_amd64.whl", hash = "sha256:321de9e00dfb066fac4e182c62b6f0a10eb7943924daecb261a7490f98ce3641" }, ] [[package]] @@ -1879,15 +1912,15 @@ wheels = [ [[package]] name = "triton" -version = "3.4.0" +version = "3.5.0" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "setuptools" }, -] wheels = [ - { url = "https://files.pythonhosted.org/packages/62/ee/0ee5f64a87eeda19bbad9bc54ae5ca5b98186ed00055281fd40fb4beb10e/triton-3.4.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7ff2785de9bc02f500e085420273bb5cc9c9bb767584a4aa28d6e360cec70128", size = 155430069, upload-time = "2025-07-30T19:58:21.715Z" }, - { url = "https://files.pythonhosted.org/packages/7d/39/43325b3b651d50187e591eefa22e236b2981afcebaefd4f2fc0ea99df191/triton-3.4.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7b70f5e6a41e52e48cfc087436c8a28c17ff98db369447bcaff3b887a3ab4467", size = 155531138, upload-time = "2025-07-30T19:58:29.908Z" }, - { url = "https://files.pythonhosted.org/packages/d0/66/b1eb52839f563623d185f0927eb3530ee4d5ffe9d377cdaf5346b306689e/triton-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:31c1d84a5c0ec2c0f8e8a072d7fd150cab84a9c239eaddc6706c081bfae4eb04", size = 155560068, upload-time = "2025-07-30T19:58:37.081Z" }, + { url = 
"https://files.pythonhosted.org/packages/dd/22/507b6f58a35e05e84381630b2dc2a3cee1a7a2a7eaf4cba857c638a18a24/triton-3.5.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6f90de6a6566bb619b4c0adc9855729e1b1b5e26533fca1bf6206e96b6d277a3", size = 159827599, upload-time = "2025-10-15T19:15:43.87Z" }, + { url = "https://files.pythonhosted.org/packages/0b/eb/09e31d107a5d00eb281aa7e6635ca463e9bca86515944e399480eadb71f8/triton-3.5.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d5d3b3d480debf24eaa739623c9a42446b0b77f95593d30eb1f64cd2278cc1f0", size = 170333110, upload-time = "2025-10-13T16:37:49.588Z" }, + { url = "https://files.pythonhosted.org/packages/79/f9/b6f60f978397c616fd8dacca2305759fe4f80d397b20ef72534803244bd5/triton-3.5.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8457b22148defefdcb7fa8144b05ce211b9faefad650a1ce85b23df488d5549c", size = 159926731, upload-time = "2025-10-15T19:15:49.682Z" }, + { url = "https://files.pythonhosted.org/packages/3d/78/949a04391c21956c816523678f0e5fa308eb5b1e7622d88c4e4ef5fceca0/triton-3.5.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f34bfa21c5b3a203c0f0eab28dcc1e49bd1f67d22724e77fb6665a659200a4ec", size = 170433488, upload-time = "2025-10-13T16:37:57.132Z" }, + { url = "https://files.pythonhosted.org/packages/87/9b/30988039e1e84df7554fba24e6a734d2d0e847af33cabdf9b532b3c51456/triton-3.5.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7da21fccceafc163e3a5e857abe34351ef76345af06cabf9637a914742671f0b", size = 159946647, upload-time = "2025-10-15T19:15:56.325Z" }, + { url = "https://files.pythonhosted.org/packages/f5/3a/e991574f3102147b642e49637e0281e9bb7c4ba254edb2bab78247c85e01/triton-3.5.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9e71db82261c4ffa3921cd050cd5faa18322d2d405c30eb56084afaff3b0833", size = 170476535, upload-time = 
"2025-10-13T16:38:05.18Z" }, ] [[package]] diff --git a/testing/qumat/test_parameter_binding.py b/testing/qumat/test_parameter_binding.py index ae8fdb9e6b..7a7976cde4 100644 --- a/testing/qumat/test_parameter_binding.py +++ b/testing/qumat/test_parameter_binding.py @@ -224,7 +224,9 @@ def test_partially_bound_parameters_error(self, backend_name): @pytest.mark.parametrize("backend_name", TESTING_BACKENDS) def test_execute_circuit_does_not_mutate_backend_config(self, backend_name): """Test that execute_circuit does not mutate the user's backend_config across all backends.""" - backend_config = get_backend_config(backend_name).copy() + cfg = get_backend_config(backend_name) + assert cfg is not None + backend_config = cfg.copy() original_config = backend_config.copy() qumat = QuMat(backend_config) @@ -240,7 +242,9 @@ def test_execute_circuit_does_not_mutate_backend_config(self, backend_name): @pytest.mark.parametrize("backend_name", TESTING_BACKENDS) def test_get_final_state_vector_does_not_mutate_backend_config(self, backend_name): """Test that get_final_state_vector does not mutate the user's backend_config across all backends.""" - backend_config = get_backend_config(backend_name).copy() + cfg = get_backend_config(backend_name) + assert cfg is not None + backend_config = cfg.copy() original_config = backend_config.copy() qumat = QuMat(backend_config) diff --git a/uv.lock b/uv.lock index 6781486846..bd88f82f75 100644 --- a/uv.lock +++ b/uv.lock @@ -2076,12 +2076,8 @@ dev = [ [[package]] name = "qumat-qdp" source = { editable = "qdp/qdp-python" } -dependencies = [ - { name = "qumat" }, -] [package.metadata] -requires-dist = [{ name = "qumat", editable = "." 
}] [package.metadata.requires-dev] benchmark = [ @@ -2094,12 +2090,12 @@ benchmark = [ { name = "qiskit-aer", specifier = ">=0.17.2" }, { name = "scikit-learn", specifier = ">=1.3" }, { name = "tensorflow", specifier = ">=2.20" }, - { name = "torch", specifier = ">=2.2,<=2.9.0" }, + { name = "torch", specifier = ">=2.2,<=2.9.0", index = "https://download.pytorch.org/whl/cu126" }, { name = "tqdm" }, ] dev = [ { name = "pytest" }, - { name = "torch", specifier = ">=2.2,<=2.9.0" }, + { name = "torch", specifier = ">=2.2,<=2.9.0", index = "https://download.pytorch.org/whl/cu126" }, ] [[package]]