Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 34 additions & 27 deletions qdp/qdp-core/src/gpu/encodings/amplitude.rs
Original file line number Diff line number Diff line change
Expand Up @@ -245,21 +245,9 @@ impl QuantumEncoder for AmplitudeEncoder {
buffer
};

// Validate norms on host to catch zero or NaN samples early
{
crate::profile_scope!("GPU::NormValidation");
let host_inv_norms = device
.dtoh_sync_copy(&inv_norms_gpu)
.map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;

if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
return Err(MahoutError::InvalidInput(
"One or more samples have zero or invalid norm".to_string(),
));
}
}

// Launch batch kernel
// Launch batch encode kernel — takes GPU norm buffer directly, no D2H needed yet.
// We defer the norm validation D2H copy until AFTER the encode kernel + sync so that
// the norm kernel → encode kernel sequence runs without an intermediate GPU-CPU roundtrip.
{
crate::profile_scope!("GPU::BatchKernelLaunch");
let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
Expand Down Expand Up @@ -288,14 +276,30 @@ impl QuantumEncoder for AmplitudeEncoder {
}
}

// Synchronize
// Synchronize — all GPU work (norm + encode) complete after this point.
{
crate::profile_scope!("GPU::Synchronize");
device
.synchronize()
.map_err(|e| MahoutError::Cuda(format!("Sync failed: {:?}", e)))?;
}

// Validate norms on host AFTER sync: D2H copy no longer blocks the encode kernel.
// This preserves error detection for zero/NaN samples without adding a mid-pipeline
// GPU-CPU roundtrip between the norm and encode kernels.
{
crate::profile_scope!("GPU::NormValidation");
let host_inv_norms = device
.dtoh_sync_copy(&inv_norms_gpu)
.map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;

if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
return Err(MahoutError::InvalidInput(
"One or more samples have zero or invalid norm".to_string(),
));
}
}

Ok(batch_state_vector)
}

Expand Down Expand Up @@ -412,17 +416,8 @@ impl QuantumEncoder for AmplitudeEncoder {
}
buffer
};
{
crate::profile_scope!("GPU::NormValidation");
let host_inv_norms = device
.dtoh_sync_copy(&inv_norms_gpu)
.map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;
if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
return Err(MahoutError::InvalidInput(
"One or more samples have zero or invalid norm".to_string(),
));
}
}
// Launch encode kernel before D2H norm validation: GPU norm buffer is passed directly,
// so the encode kernel can run immediately after the norm kernel without a CPU roundtrip.
{
crate::profile_scope!("GPU::BatchKernelLaunch");
use cudarc::driver::DevicePtr;
Expand Down Expand Up @@ -450,10 +445,22 @@ impl QuantumEncoder for AmplitudeEncoder {
)));
}
}
// Synchronize first; then validate norms on host (D2H after all GPU work is done).
{
crate::profile_scope!("GPU::Synchronize");
sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
}
{
crate::profile_scope!("GPU::NormValidation");
let host_inv_norms = device
.dtoh_sync_copy(&inv_norms_gpu)
.map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;
if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
return Err(MahoutError::InvalidInput(
"One or more samples have zero or invalid norm".to_string(),
));
}
}
Ok(batch_state_vector)
}

Expand Down
95 changes: 95 additions & 0 deletions qdp/qdp-core/src/pipeline_runner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,47 @@ impl PipelineIterator {
})
}

/// Create a pipeline iterator from an in-memory array (e.g. from Python numpy).
///
/// The iterator takes ownership of `data`; the full encode loop then runs in
/// Rust (`take_batch` + `encode_batch`) with no further copies from the caller.
///
/// # Arguments
/// * `engine` - encoding engine used for every yielded batch
/// * `data` - flattened row-major samples, `num_samples * sample_size` values
/// * `num_samples` - number of samples contained in `data`
/// * `sample_size` - values per sample; must equal the encoding's vector length
/// * `config` - pipeline configuration (qubits, encoding method, batch size)
/// * `batch_limit` - maximum number of batches this iterator will yield
///
/// # Errors
/// Returns `MahoutError::InvalidInput` when `sample_size` does not match the
/// vector length implied by `config`, or when `data.len()` is inconsistent
/// with `num_samples * sample_size` (including overflow of that product).
pub fn new_from_array(
    engine: QdpEngine,
    data: Vec<f64>,
    num_samples: usize,
    sample_size: usize,
    config: PipelineConfig,
    batch_limit: usize,
) -> Result<Self> {
    let vector_len = vector_len(config.num_qubits, &config.encoding_method);
    if sample_size != vector_len {
        return Err(MahoutError::InvalidInput(format!(
            "Array sample_size {} does not match vector_len {} for num_qubits={}, encoding={}",
            sample_size, vector_len, config.num_qubits, config.encoding_method
        )));
    }
    // checked_mul: an oversized num_samples * sample_size must surface as the
    // same clean InvalidInput error rather than an overflow panic (debug) or a
    // wrapped product (release).
    if num_samples.checked_mul(sample_size) != Some(data.len()) {
        return Err(MahoutError::InvalidInput(format!(
            "Array length {} is not num_samples ({}) * sample_size ({})",
            data.len(),
            num_samples,
            sample_size
        )));
    }
    // Cursor starts at 0; next_batch() advances it sample_size-aligned.
    let source = DataSource::InMemory {
        data,
        cursor: 0,
        num_samples,
        sample_size,
        batches_yielded: 0,
        batch_limit,
    };
    Ok(Self {
        engine,
        config,
        source,
        vector_len,
    })
}

/// Create a pipeline iterator from a Parquet file using streaming read (Phase 2b).
/// Only `.parquet` is supported; reduces memory for large files by reading in chunks.
/// Validates sample_size == vector_len after the first chunk.
Expand Down Expand Up @@ -428,7 +469,61 @@ impl PipelineIterator {
}

/// Returns the next batch as a DLPack pointer; `Ok(None)` when exhausted.
/// For InMemory source, passes a slice reference to encode_batch (no per-batch copy).
pub fn next_batch(&mut self) -> Result<Option<*mut DLManagedTensor>> {
// InMemory: update cursor, then encode from &data[start..end] to avoid to_vec().
let in_memory_range: Option<(usize, usize, usize, usize)> = match &mut self.source {
DataSource::InMemory {
data,
cursor,
sample_size,
batches_yielded,
batch_limit,
..
} => {
if *batches_yielded >= *batch_limit {
None
} else {
let remaining = (data.len() - *cursor) / *sample_size;
if remaining == 0 {
None
} else {
let batch_n = remaining.min(self.config.batch_size);
let start = *cursor;
let end = start + batch_n * *sample_size;
*cursor = end;
*batches_yielded += 1;
Some((
start,
batch_n,
*sample_size,
self.config.num_qubits as usize,
))
}
}
}
_ => None,
};

if let Some((start, batch_n, sample_size, num_qubits)) = in_memory_range {
let slice = match &self.source {
DataSource::InMemory { data, .. } => {
let len = batch_n * sample_size;
&data[start..start + len]
}
_ => unreachable!(),
};
let ptr = self.engine.encode_batch(
slice,
batch_n,
sample_size,
num_qubits,
&self.config.encoding_method,
)?;
return Ok(Some(ptr));
}

// Synthetic / Streaming: take_batch_from_source (may copy) then encode.
let Some((batch_data, batch_n, sample_size, num_qubits)) = self.take_batch_from_source()?
else {
return Ok(None);
Expand Down
22 changes: 22 additions & 0 deletions qdp/qdp-python/benchmark/encoding_benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,25 @@ To see the full list of options and defaults, append `--help`:
uv run python benchmark/encoding_benchmarks/pennylane_baseline/iris_amplitude.py --help
uv run python benchmark/encoding_benchmarks/qdp_pipeline/iris_amplitude.py --help
```

## Credit Card Fraud amplitude baseline (PennyLane)

Minimal, reproducible steps (run from `qdp/qdp-python`):

1. **Download dataset (once)** — Kaggle `creditcard.csv` mirror:

```bash
mkdir -p benchmark/encoding_benchmarks/pennylane_baseline/data
curl -L -o benchmark/encoding_benchmarks/pennylane_baseline/data/creditcard.csv \
https://raw.githubusercontent.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/master/creditcard.csv
```

2. **Run the PennyLane baseline** — StandardScaler → PCA(16) → L2 norm → 4-qubit amplitude VQC:

```bash
uv run python benchmark/encoding_benchmarks/pennylane_baseline/creditcardfraud_amplitude.py \
--data-file benchmark/encoding_benchmarks/pennylane_baseline/data/creditcard.csv \
--max-samples 300000 --iters 200 --batch-size 512 --trials 1
```

This prints compile time, train time / throughput, and task metrics (AUPRC, F1, precision, recall) on the test set.
Loading