Skip to content

Commit 32d101d

Browse files
qjia7 and Copilot
authored
Enable graph capture for webgpu (#1848)
This PR enables graph capture for webgpu. It implements the CopyDeviceToCpu/CopyCpuToDevice/CopyFrom/Zero functions using the new `CopyTensors` API. The ort part needs to apply this PR [#26450](microsoft/onnxruntime#26450) to make it work for webgpu. The following things will be implemented in follow-up PRs to get the full performance gain for graph capture (the original one is #1720): 1. Support UpdateAttentionMask, UpdatePositionIds, and Cast to keep the whole pipeline on gpu. 2. Optimize CopyFrom with offsets. --------- Co-authored-by: Copilot <[email protected]>
1 parent 7bf81d5 commit 32d101d

File tree

13 files changed

+189
-22
lines changed

13 files changed

+189
-22
lines changed

.pipelines/nuget-publishing.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ parameters:
6161
- name: ort_version
6262
displayName: 'OnnxRuntime version'
6363
type: string
64-
default: '1.22.0'
64+
default: '1.23.0'
6565

6666
- name: ort_winml_version
6767
displayName: 'Microsoft.WindowsAppSDK.ML Version (should match CMakeList.txt)'
@@ -76,12 +76,12 @@ parameters:
7676
- name: ort_cuda_version
7777
displayName: 'OnnxRuntime GPU version'
7878
type: string
79-
default: '1.22.0'
79+
default: '1.23.0'
8080

8181
- name: ort_dml_version
8282
displayName: 'OnnxRuntime DML version'
8383
type: string
84-
default: '1.22.0'
84+
default: '1.23.0'
8585

8686
- name: cuda_version
8787
displayName: 'CUDA version'

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,10 @@ if(MSVC)
8080
"$<$<COMPILE_LANGUAGE:C>:/wd4100>"
8181
"$<$<COMPILE_LANGUAGE:CXX>:/wd4100>"
8282

83+
# Suppress warning C4819: file contains character that cannot be represented in current code page
84+
"$<$<COMPILE_LANGUAGE:C>:/wd4819>"
85+
"$<$<COMPILE_LANGUAGE:CXX>:/wd4819>"
86+
8387
# Enable warning level 4 (more aggressive than default /W3)
8488
# Captures more potential bugs or code smells
8589
"$<$<COMPILE_LANGUAGE:C>:/W4>"

cmake/ortlib.cmake

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -81,16 +81,16 @@ if(ORT_HOME)
8181
endif()
8282
else()
8383
# If ORT_HOME is not specified, download the onnxruntime headers and libraries from the nightly feed
84-
set(ORT_VERSION "1.22.0")
84+
set(ORT_VERSION "1.23.0")
8585
set(ORT_FEED_ORG_NAME "aiinfra")
8686
set(ORT_FEED_PROJECT "2692857e-05ef-43b4-ba9c-ccf1c22c437c")
8787
set(ORT_NIGHTLY_FEED_ID "7982ae20-ed19-4a35-a362-a96ac99897b7")
8888

8989
if (USE_DML)
90-
set(ORT_VERSION "1.22.0")
90+
set(ORT_VERSION "1.23.0")
9191
set(ORT_PACKAGE_NAME "Microsoft.ML.OnnxRuntime.DirectML")
9292
elseif(USE_CUDA)
93-
set(ORT_VERSION "1.22.0")
93+
set(ORT_VERSION "1.23.0")
9494
if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
9595
set(ORT_PACKAGE_NAME "Microsoft.ML.OnnxRuntime.Gpu.Linux")
9696
elseif(WIN32)
@@ -99,7 +99,7 @@ else()
9999
message(FATAL_ERROR "Unsupported platform for CUDA")
100100
endif()
101101
elseif(USE_ROCM)
102-
set(ORT_VERSION "1.22.0")
102+
set(ORT_VERSION "1.23.0")
103103
set(ORT_PACKAGE_NAME "Microsoft.ML.OnnxRuntime.Rocm")
104104
else()
105105
set(ORT_PACKAGE_NAME "Microsoft.ML.OnnxRuntime")

examples/slm_engine/build_scripts/build_deps.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -577,9 +577,9 @@ def main():
577577
ort_home = None
578578
if args.build_ort_from_source:
579579
if args.ort_version_to_use is None:
580-
# If not Windows then use 1.22.0
580+
# If not Windows then use 1.23.0
581581
if platform.system() != "Windows":
582-
args.ort_version_to_use = "v1.22.0"
582+
args.ort_version_to_use = "v1.23.0"
583583
else:
584584
args.ort_version_to_use = "main"
585585
ort_home = build_ort(args, dep_src_dir, artifacts_dir)
@@ -590,7 +590,7 @@ def main():
590590
# The ORT binaries are available as they were downloaded during the GenAI build
591591
# This is the supported version for most platforms
592592
if args.ort_version_to_use is None:
593-
ORT_VERSION = "1.22.0"
593+
ORT_VERSION = "1.23.0"
594594
else:
595595
ORT_VERSION = args.ort_version_to_use
596596
# Copy the ORT artifacts to the artifacts directory.

src/config.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1105,6 +1105,13 @@ bool IsGraphCaptureEnabled(const Config::SessionOptions& session_options) {
11051105
}
11061106
} else if (provider_options->name == "DML") {
11071107
return true;
1108+
} else if (provider_options->name == "WebGPU") {
1109+
for (const auto& value : provider_options->options) {
1110+
if (value.first == "enableGraphCapture" && value.second == "1") {
1111+
return true;
1112+
}
1113+
}
1114+
return false;
11081115
} else if (provider_options->name == "NvTensorRtRtx") {
11091116
for (const auto& value : provider_options->options) {
11101117
if (value.first == "enable_cuda_graph" && value.second == "1") {

src/models/model.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -953,7 +953,9 @@ Model::Model(std::unique_ptr<Config> config) : config_{std::move(config)} {
953953
EnsureDeviceOrtInit(*p_device_, *config_, arena_cfg_);
954954

955955
// Only CUDA, TRT-RTX and DML does every input on the device
956-
if (p_device_->GetType() == DeviceType::CUDA || p_device_->GetType() == DeviceType::DML || p_device_->GetType() == DeviceType::NvTensorRtRtx)
956+
// For WebGPU, use device memory only if graph capture is enabled, otherwise use CPU
957+
if (p_device_->GetType() == DeviceType::CUDA || p_device_->GetType() == DeviceType::DML || p_device_->GetType() == DeviceType::NvTensorRtRtx ||
958+
(p_device_->GetType() == DeviceType::WEBGPU && IsGraphCaptureEnabled(config_->model.decoder.session_options)))
957959
p_device_inputs_ = p_device_;
958960
else
959961
p_device_inputs_ = GetDeviceInterface(DeviceType::CPU);

src/models/onnxruntime_api.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -477,6 +477,14 @@ struct OrtEnv {
477477

478478
OrtEnv& CreateAndRegisterAllocator(const OrtMemoryInfo& mem_info, const OrtArenaCfg& arena_cfg); ///< Wraps OrtApi::CreateAndRegisterAllocator
479479

480+
/// \brief Copy tensors between devices. Wraps OrtApi::CopyTensors
481+
/// \param src_tensors Array of source OrtValue tensors
482+
/// \param dst_tensors Array of destination OrtValue tensors (must be pre-allocated)
483+
/// \param stream Optional sync stream for asynchronous copy (can be nullptr for synchronous)
484+
void CopyTensors(const std::vector<const OrtValue*>& src_tensors,
485+
const std::vector<OrtValue*>& dst_tensors,
486+
OrtSyncStream* stream = nullptr) const;
487+
480488
std::vector<const OrtEpDevice*> GetEpDevices();
481489

482490
static void operator delete(void* p) { Ort::api->ReleaseEnv(reinterpret_cast<OrtEnv*>(p)); }
@@ -848,6 +856,26 @@ struct OrtShape {
848856
size_t shape_len;
849857
};
850858

859+
/** \brief Wrapper around ::OrtSyncStream
860+
*
861+
* Used for asynchronous operations like CopyTensors.
862+
* Requires ONNX Runtime 1.23.0 or later.
863+
*/
864+
struct OrtSyncStream {
865+
/// \brief Create a sync stream for a specific execution provider device
866+
/// \param ep_device The execution provider device (from OrtEnv::GetEpDevices)
867+
/// \param stream_options Optional stream configuration options
868+
static std::unique_ptr<OrtSyncStream> Create(const OrtEpDevice* ep_device, const OrtKeyValuePairs* stream_options = nullptr);
869+
870+
/// \brief Get the native stream handle (e.g., cudaStream_t for CUDA)
871+
void* GetHandle() const;
872+
873+
static void operator delete(void* p) {
874+
if (p) Ort::api->ReleaseSyncStream(reinterpret_cast<OrtSyncStream*>(p));
875+
}
876+
Ort::Abstract make_abstract;
877+
};
878+
851879
/** \brief Wrapper around ::OrtValue
852880
*
853881
*/

src/models/onnxruntime_inline.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,16 @@ inline std::unique_ptr<OrtMemoryInfo> OrtMemoryInfo::Create(const char* name, Or
256256
return std::unique_ptr<OrtMemoryInfo>{p};
257257
}
258258

259+
inline std::unique_ptr<OrtSyncStream> OrtSyncStream::Create(const OrtEpDevice* ep_device, const OrtKeyValuePairs* stream_options) {
260+
OrtSyncStream* p_stream = nullptr;
261+
Ort::ThrowOnError(Ort::api->CreateSyncStreamForEpDevice(ep_device, stream_options, &p_stream));
262+
return std::unique_ptr<OrtSyncStream>(p_stream);
263+
}
264+
265+
inline void* OrtSyncStream::GetHandle() const {
266+
return Ort::api->SyncStream_GetHandle(const_cast<OrtSyncStream*>(this));
267+
}
268+
259269
inline std::unique_ptr<OrtIoBinding> OrtIoBinding::Create(OrtSession& session) {
260270
OrtIoBinding* p;
261271
Ort::ThrowOnError(Ort::api->CreateIoBinding(&session, &p));
@@ -398,6 +408,15 @@ inline OrtEnv& OrtEnv::CreateAndRegisterAllocator(const OrtMemoryInfo& mem_info,
398408
return *this;
399409
}
400410

411+
inline void OrtEnv::CopyTensors(const std::vector<const OrtValue*>& src_tensors,
412+
const std::vector<OrtValue*>& dst_tensors,
413+
OrtSyncStream* stream) const {
414+
if (src_tensors.size() != dst_tensors.size()) {
415+
throw std::runtime_error("Number of source and destination tensors must match");
416+
}
417+
Ort::ThrowOnError(Ort::api->CopyTensors(this, src_tensors.data(), dst_tensors.data(), stream, src_tensors.size()));
418+
}
419+
401420
inline std::vector<const OrtEpDevice*> OrtEnv::GetEpDevices() {
402421
size_t num_devices = 0;
403422
const OrtEpDevice* const* device_ptrs = nullptr;

src/webgpu/interface.cpp

Lines changed: 114 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,29 +14,136 @@ const char* device_label = "WebGPU";
1414
struct WebGPUMemory final : DeviceBuffer {
1515
WebGPUMemory(size_t size) : owned_{true} {
1616
size_in_bytes_ = size;
17-
p_cpu_ = p_device_ = static_cast<uint8_t*>(ort_allocator_->Alloc(size_in_bytes_));
17+
p_device_ = static_cast<uint8_t*>(ort_allocator_->Alloc(size_in_bytes_));
1818
}
1919

2020
WebGPUMemory(void* p, size_t size) : owned_{false} {
2121
size_in_bytes_ = size;
22-
p_cpu_ = p_device_ = static_cast<uint8_t*>(p);
22+
p_device_ = static_cast<uint8_t*>(p);
2323
}
2424

2525
~WebGPUMemory() override {
2626
if (owned_)
2727
ort_allocator_->Free(p_device_);
28+
if (p_cpu_)
29+
free(p_cpu_);
2830
}
2931

3032
const char* GetType() const override { return device_label; }
31-
void AllocateCpu() override { throw std::runtime_error("CPU can't access WebGPU memory"); }
32-
void CopyDeviceToCpu() override { throw std::runtime_error("CPU can't access WebGPU memory"); }
33-
void CopyCpuToDevice() override { throw std::runtime_error("CPU can't access WebGPU memory"); }
33+
34+
void AllocateCpu() override {
35+
if (!p_cpu_)
36+
p_cpu_ = static_cast<uint8_t*>(malloc(size_in_bytes_));
37+
}
38+
39+
void CopyDeviceToCpu() override {
40+
if (!ort_allocator_) {
41+
throw std::runtime_error("WebGPU allocator not initialized");
42+
}
43+
44+
AllocateCpu();
45+
46+
// Get WebGPU allocator's memory info
47+
const OrtMemoryInfo* webgpu_mem_info = nullptr;
48+
Ort::ThrowOnError(Ort::api->AllocatorGetInfo(ort_allocator_, &webgpu_mem_info));
49+
50+
// Create source tensor (WebGPU device memory) - treat as 1D uint8 array
51+
int64_t shape_val = static_cast<int64_t>(size_in_bytes_);
52+
std::span<const int64_t> shape{&shape_val, 1};
53+
auto src_tensor = OrtValue::CreateTensor(*webgpu_mem_info, p_device_, size_in_bytes_, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);
54+
55+
// Create CPU memory info and destination tensor
56+
auto cpu_mem_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
57+
auto dst_tensor = OrtValue::CreateTensor(*cpu_mem_info, p_cpu_, size_in_bytes_, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);
58+
59+
// Use ORT C API's CopyTensors (synchronous copy, stream = nullptr)
60+
OrtValue* src_ptrs[] = {src_tensor.get()};
61+
OrtValue* dst_ptrs[] = {dst_tensor.get()};
62+
Ort::ThrowOnError(Ort::api->CopyTensors(&GetOrtEnv(), src_ptrs, dst_ptrs, nullptr, 1));
63+
}
64+
65+
void CopyCpuToDevice() override {
66+
if (!ort_allocator_) {
67+
throw std::runtime_error("WebGPU allocator not initialized");
68+
}
69+
assert(p_cpu_);
70+
71+
// Get WebGPU allocator's memory info
72+
const OrtMemoryInfo* webgpu_mem_info = nullptr;
73+
Ort::ThrowOnError(Ort::api->AllocatorGetInfo(ort_allocator_, &webgpu_mem_info));
74+
75+
// Create source tensor (CPU memory) - treat as 1D uint8 array
76+
int64_t shape_val = static_cast<int64_t>(size_in_bytes_);
77+
std::span<const int64_t> shape{&shape_val, 1};
78+
auto cpu_mem_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
79+
auto src_tensor = OrtValue::CreateTensor(*cpu_mem_info, p_cpu_, size_in_bytes_, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);
80+
81+
// Create destination tensor (WebGPU device memory)
82+
auto dst_tensor = OrtValue::CreateTensor(*webgpu_mem_info, p_device_, size_in_bytes_, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);
83+
84+
// Use ORT C API's CopyTensors (synchronous copy, stream = nullptr)
85+
OrtValue* src_ptrs[] = {src_tensor.get()};
86+
OrtValue* dst_ptrs[] = {dst_tensor.get()};
87+
Ort::ThrowOnError(Ort::api->CopyTensors(&GetOrtEnv(), src_ptrs, dst_ptrs, nullptr, 1));
88+
}
89+
3490
void CopyFrom(size_t begin_dest, DeviceBuffer& source, size_t begin_source, size_t size_in_bytes) override {
35-
throw std::runtime_error("CPU can't access WebGPU memory");
91+
if (!ort_allocator_) {
92+
throw std::runtime_error("WebGPU allocator not initialized");
93+
}
94+
95+
// Fast path: WebGPU-to-WebGPU copy with zero offsets
96+
// NOTE: p_device_ is a WGPUBuffer handle (cast to uint8_t*), not a memory pointer.
97+
// We cannot use pointer arithmetic (p_device_ + offset) to create sub-buffer views.
98+
// OrtValue::CreateTensor expects the actual buffer handle, not an offset pointer.
99+
if (source.GetType() == device_label && begin_source == 0 && begin_dest == 0) {
100+
// Get WebGPU allocator's memory info
101+
const OrtMemoryInfo* webgpu_mem_info = nullptr;
102+
Ort::ThrowOnError(Ort::api->AllocatorGetInfo(ort_allocator_, &webgpu_mem_info));
103+
104+
// Full buffer copy using CopyTensors (no offsets)
105+
int64_t shape_val = static_cast<int64_t>(size_in_bytes);
106+
std::span<const int64_t> shape{&shape_val, 1};
107+
auto src_tensor = OrtValue::CreateTensor(*webgpu_mem_info, source.p_device_, size_in_bytes, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);
108+
auto dst_tensor = OrtValue::CreateTensor(*webgpu_mem_info, p_device_, size_in_bytes, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);
109+
110+
// Use ORT C API's CopyTensors for GPU-to-GPU copy
111+
OrtValue* src_ptrs[] = {src_tensor.get()};
112+
OrtValue* dst_ptrs[] = {dst_tensor.get()};
113+
Ort::ThrowOnError(Ort::api->CopyTensors(&GetOrtEnv(), src_ptrs, dst_ptrs, nullptr, 1));
114+
} else {
115+
// Fallback: Copy through CPU for:
116+
// - WebGPU-to-WebGPU copies with non-zero offsets (buffer handles don't support offset arithmetic)
117+
// - Cross-device copies (e.g., CPU to WebGPU or vice versa)
118+
CopyThroughCpu(*this, begin_dest, source, begin_source, size_in_bytes);
119+
}
36120
}
37121

38122
void Zero() override {
39-
throw std::runtime_error("Zeroing not implemented for WebGPU memory");
123+
if (!ort_allocator_) {
124+
throw std::runtime_error("WebGPU allocator not initialized");
125+
}
126+
127+
// Allocate zeroed CPU memory
128+
std::vector<uint8_t> zero_buffer(size_in_bytes_, 0);
129+
130+
// Get WebGPU allocator's memory info
131+
const OrtMemoryInfo* webgpu_mem_info = nullptr;
132+
Ort::ThrowOnError(Ort::api->AllocatorGetInfo(ort_allocator_, &webgpu_mem_info));
133+
134+
// Create source tensor (CPU memory with zeros) - treat as 1D uint8 array
135+
int64_t shape_val = static_cast<int64_t>(size_in_bytes_);
136+
std::span<const int64_t> shape{&shape_val, 1};
137+
auto cpu_mem_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
138+
auto src_tensor = OrtValue::CreateTensor(*cpu_mem_info, zero_buffer.data(), size_in_bytes_, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);
139+
140+
// Create destination tensor (WebGPU device memory)
141+
auto dst_tensor = OrtValue::CreateTensor(*webgpu_mem_info, p_device_, size_in_bytes_, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);
142+
143+
// Use ORT C API's CopyTensors to copy zeros to GPU (synchronous copy, stream = nullptr)
144+
OrtValue* src_ptrs[] = {src_tensor.get()};
145+
OrtValue* dst_ptrs[] = {dst_tensor.get()};
146+
Ort::ThrowOnError(Ort::api->CopyTensors(&GetOrtEnv(), src_ptrs, dst_ptrs, nullptr, 1));
40147
}
41148

42149
bool owned_;
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
onnxruntime==1.22.0
1+
onnxruntime==1.23.0

0 commit comments

Comments
 (0)