
Commit

…into abjindal/set_terminate
ajindal1 committed Oct 23, 2024
2 parents f75e180 + 28f77a3 commit aec8564
Showing 21 changed files with 329 additions and 99 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/mac-cpu-arm64-build.yml
@@ -21,6 +21,10 @@ jobs:
with:
submodules: true

- uses: actions/setup-python@v5
with:
python-version: '3.12.x'

- name: Get the Latest OnnxRuntime Nightly Version
run: |
ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion')
@@ -68,6 +72,7 @@ jobs:
- name: Run the python tests
run: |
source genai-macos-venv/bin/activate
export HF_TOKEN="12345"
export ORTGENAI_LOG_ORT_LIB=1
python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models
2 changes: 0 additions & 2 deletions .pipelines/codeql.yaml
@@ -41,7 +41,6 @@ stages:
- task: onebranch.pipeline.tsaoptions@1
displayName: 'OneBranch TSAOptions'
inputs:
tsaConfigFilePath: '$(Build.Repository.LocalPath)\.config\tsaoptions.json'
appendSourceBranchName: false
- task: CredScan@3
displayName: 🔍 Run CredScan
@@ -65,4 +64,3 @@ stages:
continueOnError: true
inputs:
GdnPublishTsaOnboard: true
GdnPublishTsaConfigFile: '$(Build.Repository.LocalPath)\.config\tsaoptions.json'
1 change: 0 additions & 1 deletion .pipelines/stages/jobs/steps/capi-win-step.yml
@@ -26,7 +26,6 @@ steps:
- task: onebranch.pipeline.tsaoptions@1
displayName: 'OneBranch TSAOptions'
inputs:
tsaConfigFilePath: '$(Build.Repository.LocalPath)\.config\tsaoptions.json'
appendSourceBranchName: false

- template: utils/set-nightly-build-option-variable.yml
3 changes: 1 addition & 2 deletions .pipelines/stages/jobs/steps/compliant-and-cleanup-step.yml
@@ -8,10 +8,9 @@ steps:

- task: TSAUpload@2
displayName: 'TSA upload'
condition: and(succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main'))
condition: and(succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main'), ne(variables['os'], 'osx'), ne(variables['os'], 'ios')) # Not available on macOS.
inputs:
GdnPublishTsaOnboard: false
GdnPublishTsaConfigFile: '$(Build.Repository.LocalPath)\.config\tsaoptions.json'
continueOnError: true

- template: compliant/component-governance-component-detection-step.yml
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -36,6 +36,8 @@ include(cmake/check_cuda.cmake)
include(cmake/check_rocm.cmake)
# Checking if DML is supported
include(cmake/check_dml.cmake)
# Checking if WebGpu is supported
include(cmake/check_webgpu.cmake)

include(cmake/cxx_standard.cmake)

3 changes: 3 additions & 0 deletions build.py
@@ -126,6 +126,8 @@ class HelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescript

parser.add_argument("--use_rocm", action="store_true", help="Whether to use ROCm. Default is to not use rocm.")

parser.add_argument("--use_webgpu", action="store_true", help="Whether to use WebGpu. Default is to not use WebGpu.")

parser.add_argument("--use_dml", action="store_true", help="Whether to use DML. Default is to not use DML.")

# The following options are mutually exclusive (cross compiling options such as android, ios, etc.)
@@ -471,6 +473,7 @@ def update(args: argparse.Namespace, env: dict[str, str]):
"-DCMAKE_POSITION_INDEPENDENT_CODE=ON",
f"-DUSE_CUDA={'ON' if args.use_cuda else 'OFF'}",
f"-DUSE_ROCM={'ON' if args.use_rocm else 'OFF'}",
f"-DUSE_WEBGPU={'ON' if args.use_webgpu else 'OFF'}",
f"-DUSE_DML={'ON' if args.use_dml else 'OFF'}",
f"-DENABLE_JAVA={'ON' if args.build_java else 'OFF'}",
f"-DBUILD_WHEEL={build_wheel}",
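With --use_webgpu now forwarded to CMake as -DUSE_WEBGPU, a WebGPU-enabled build should presumably be produced by passing that flag to build.py (for example, python3 build.py --use_webgpu together with the usual options); the exact invocation is not part of this diff.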
6 changes: 6 additions & 0 deletions cmake/check_webgpu.cmake
@@ -0,0 +1,6 @@

if(USE_WEBGPU)
add_compile_definitions(USE_WEBGPU=1)
else()
add_compile_definitions(USE_WEBGPU=0)
endif()
1 change: 1 addition & 0 deletions cmake/options.cmake
@@ -4,6 +4,7 @@ include(CMakeDependentOption)
option(USE_CUDA "Build with CUDA support" ON)
option(USE_ROCM "Build with ROCm support" ON)
option(USE_DML "Build with DML support" OFF)
option(USE_WEBGPU "Build with WEBGPU support" ON)

# bindings
option(ENABLE_JAVA "Build the Java API." OFF)
9 changes: 7 additions & 2 deletions src/config.cpp
@@ -33,8 +33,13 @@ struct ProviderOptionsObject_Element : JSON::Element {
explicit ProviderOptionsObject_Element(std::vector<Config::ProviderOptions>& v) : v_{v} {}

JSON::Element& OnObject(std::string_view name) override {
if (options_element_)
throw std::runtime_error("Each object in the provider_options array can only have one member (named value)");
for (auto& v : v_) {
if (v.name == name) {
options_element_ = std::make_unique<ProviderOptions_Element>(v);
return *options_element_;
}
}

auto& options = v_.emplace_back();
options.name = name;
options_element_ = std::make_unique<ProviderOptions_Element>(options);
2 changes: 2 additions & 0 deletions src/generators.cpp
@@ -198,6 +198,8 @@ std::string to_string(DeviceType device_type) {
return "CUDA";
case DeviceType::DML:
return "DirectML";
case DeviceType::WEBGPU:
return "WebGpu";
}
throw std::runtime_error("Unknown device type");
}
1 change: 1 addition & 0 deletions src/generators.h
@@ -54,6 +54,7 @@ enum struct DeviceType {
CPU,
CUDA,
DML,
WEBGPU,
};

std::string to_string(DeviceType device_type);
14 changes: 12 additions & 2 deletions src/java/src/main/java/ai/onnxruntime/genai/Tensor.java
@@ -5,12 +5,16 @@
package ai.onnxruntime.genai;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public final class Tensor implements AutoCloseable {
private long nativeHandle = -1;
private long nativeHandle = 0;
private final ElementType elementType;
private final long[] shape;

// Buffer that owns the Tensor data.
private ByteBuffer dataBuffer = null;

// The values in this enum must match ONNX values
// https://github.com/onnx/onnx/blob/159fa47b7c4d40e6d9740fcf14c36fff1d11ccd8/onnx/onnx.proto#L499-L544
public enum ElementType {
@@ -33,7 +37,7 @@ public enum ElementType {
/**
* Constructs a Tensor with the given data, shape and element type.
*
* @param data The data for the Tensor. Must be a direct ByteBuffer.
* @param data The data for the Tensor. Must be a direct ByteBuffer with native byte order.
* @param shape The shape of the Tensor.
* @param elementType The type of elements in the Tensor.
* @throws GenAIException
@@ -51,8 +55,14 @@ public Tensor(ByteBuffer data, long[] shape, ElementType elementType) throws Gen
"Tensor data must be direct. Allocate with ByteBuffer.allocateDirect");
}

// for now, require native byte order as the bytes will be used directly.
if (data.order() != ByteOrder.nativeOrder()) {
throw new GenAIException("Tensor data must have native byte order.");
}

this.elementType = elementType;
this.shape = shape;
this.dataBuffer = data; // save a reference so the owning buffer will stay around.

nativeHandle = createTensor(data, shape, elementType.ordinal());
}
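Taken together, the constructor now requires a direct ByteBuffer in native byte order, and it keeps a reference to that buffer so the data stays alive for the lifetime of the Tensor. A minimal usage sketch under those assumptions (the float32 element-type constant and the harness class are illustrative, not taken from this diff):

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

import ai.onnxruntime.genai.Tensor;

public final class TensorUsageSketch {
  public static void main(String[] args) throws Exception {
    long[] shape = {1, 4};

    // Direct allocation with native byte order; a heap buffer or a
    // non-native order is now rejected with a GenAIException.
    ByteBuffer data = ByteBuffer.allocateDirect(4 * Float.BYTES)
        .order(ByteOrder.nativeOrder());
    data.asFloatBuffer().put(new float[] {0.1f, 0.2f, 0.3f, 0.4f});

    // The constructor stores a reference to 'data', so the buffer is not
    // garbage collected while the Tensor is still alive.
    try (Tensor tensor = new Tensor(data, shape, Tensor.ElementType.float32)) {
      // ... hand the tensor to the rest of the GenAI API as needed ...
    }
  }
}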
3 changes: 2 additions & 1 deletion src/models/input_ids.cpp
@@ -145,7 +145,8 @@ void InputIDs::Update(RoamingArray<int32_t> next_tokens_unk) {
input_ids_cast_command_list_state_);
#endif
} break;
case DeviceType::CPU: {
default: {
// CPU, WEBGPU
auto* data = value_->GetTensorMutableData<int64_t>();
auto next_tokens = next_tokens_unk.GetCPU();
for (int i = 0; i < shape_[0]; i++) {
20 changes: 10 additions & 10 deletions src/models/kv_cache.cpp
@@ -34,11 +34,11 @@ KV_Cache_Combined::KV_Cache_Combined(State& state)
// Derive the KV data type from the KV input 0
type_ = model_.session_info_->GetInputDataType(input_name_strings_[0]);

empty_past_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_);
empty_past_ = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_);
shape_[3] = state_.params_->sequence_length;

for (int i = 0; i < layer_count_; ++i) {
presents_.push_back(OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_));
presents_.push_back(OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_));
}
}

@@ -67,7 +67,7 @@ void KV_Cache_Combined::Update(std::span<const int32_t> beam_indices, int curren

shape_[3] = current_length;
for (int i = 0; i < layer_count_; i++) {
presents_[i] = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_);
presents_[i] = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_);
state_.inputs_[input_index_ + i] = pasts_[i].get();
state_.outputs_[output_index_ + i] = presents_[i].get();
}
@@ -81,7 +81,7 @@ void KV_Cache_Combined::PickPastState(std::span<const int32_t> beam_indices, int
auto element_count = shape_[0] * past_key_size;

const OrtValue& present = *presents_[index];
std::unique_ptr<OrtValue> past = OrtValue::CreateTensor<ScoreType>(*model_.allocator_device_, shape_);
std::unique_ptr<OrtValue> past = OrtValue::CreateTensor<ScoreType>(*model_.allocator_kvcache_, shape_);
auto past_span = std::span<ScoreType>(past->GetTensorMutableData<ScoreType>(), element_count);
auto present_span = std::span<const ScoreType>(present.GetTensorData<ScoreType>(), element_count);

@@ -149,7 +149,7 @@ KV_Cache::KV_Cache(State& state)
// Derive the KV data type from the KV input 0
type_ = model_.session_info_->GetInputDataType(input_name_strings_[0]);

empty_past_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_);
empty_past_ = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_);

// Set the size after empty_past_ has been created with 0 for this field
if (past_present_share_buffer_)
@@ -167,7 +167,7 @@ KV_Cache::KV_Cache(State& state)

for (int i = 0; i < layer_count_ * 2; ++i) {
presents_.push_back(
sb_kv_caches_.empty() ? OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_)
sb_kv_caches_.empty() ? OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_)
: sb_kv_caches_[i]->CreateTensorOnStaticBuffer(shape_, type_));
}
}
@@ -216,7 +216,7 @@ void KV_Cache::Update(std::span<const int32_t> beam_indices, int current_length)

shape_[2] = current_length;
for (int i = 0; i < layer_count_ * 2; i++) {
presents_[i] = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_);
presents_[i] = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_);
state_.outputs_[output_index_ + i] = presents_[i].get();
}
}
@@ -228,7 +228,7 @@ void KV_Cache::PickPastState(std::span<const int32_t> beam_indices, int index) {
auto element_count = shape_[0] * block_size_per_beam;

const OrtValue& present_value = *presents_[index];
std::unique_ptr<OrtValue> past_value = OrtValue::CreateTensor<ScoreType>(*model_.allocator_device_, shape_);
std::unique_ptr<OrtValue> past_value = OrtValue::CreateTensor<ScoreType>(*model_.allocator_kvcache_, shape_);
auto past_span = std::span<ScoreType>(past_value->GetTensorMutableData<ScoreType>(), element_count);
auto present_span = std::span<const ScoreType>(present_value.GetTensorData<ScoreType>(), element_count);

@@ -280,8 +280,8 @@ Cross_Cache::Cross_Cache(State& state)
type_ = model_.session_info_->GetInputDataType(input_name_strings_[0]);

for (int i = 0; i < layer_count_; ++i) {
values_.push_back(OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_));
values_.push_back(OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_));
values_.push_back(OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_));
values_.push_back(OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_));
}
}

4 changes: 2 additions & 2 deletions src/models/logits.cpp
@@ -98,8 +98,8 @@ RoamingArray<float> Logits::Get() {
#endif
} break;

case DeviceType::CPU:
case DeviceType::CUDA: {
default: {
// CPU, CUDA, WEBGPU
auto logits_raw = std::span<const uint8_t>{output_raw_->GetTensorMutableData<uint8_t>(), element_count * element_size};
auto logits_last_tokens = std::span<uint8_t>{logits_of_last_token->GetTensorMutableData<uint8_t>(), element_count_last_token * element_size};
auto target = logits_last_tokens.subspan(vocab_index * element_size, vocab_size * element_size);
26 changes: 20 additions & 6 deletions src/models/model.cpp
@@ -294,14 +294,23 @@ void Model::InitDeviceAllocator([[maybe_unused]] OrtSession& session) {
if (device_type_ == DeviceType::CUDA) {
allocator_device_ = GetCudaAllocator(session);
}
#elif USE_DML
#endif
#if USE_DML
if (device_type_ == DeviceType::DML) {
memory_info_device_ = OrtMemoryInfo::Create("DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault);
dml_owned_allocator_ = Ort::Allocator::Create(session, *memory_info_device_);
allocator_device_ = dml_owned_allocator_.get();
}
#endif

allocator_kvcache_ = allocator_device_;
#if USE_WEBGPU
if (device_type_ == DeviceType::WEBGPU) {
// for webgpu we only use device memory for kv_cache
memory_info_device_ = OrtMemoryInfo::Create("WebGPU_Buffer", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault);
webgpu_owned_allocator_ = Ort::Allocator::Create(session, *memory_info_device_);
allocator_kvcache_ = webgpu_owned_allocator_.get();
}
#endif
session_info_ = std::make_unique<SessionInfo>(session);
captured_graph_pool_ = std::make_shared<CapturedGraphPool>(config_.get(), session_info_.get(), allocator_device_);
}
@@ -473,8 +482,11 @@ void Model::CreateSessionOptionsFromConfig(const Config::SessionOptions& config_
for (auto& option : provider_options.options) {
opts.emplace(option.first, option.second);
}

session_options.AppendExecutionProvider("QNN", opts);
} else if (provider_options.name == "web") {
device_type_ = DeviceType::WEBGPU;
std::unordered_map<std::string, std::string> opts;
session_options.AppendExecutionProvider("WebGPU", opts);
} else
throw std::runtime_error("Unknown provider type: " + provider_options.name);
}
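For context: with this branch, WebGPU is opted into from the model's configuration rather than implied by the build flags alone. A provider_options entry named "web" switches the device type to WEBGPU and appends the "WebGPU" execution provider, mirroring how the preceding branch handles QNN; the corresponding configuration-file layout is not shown in this diff.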
@@ -556,8 +568,9 @@ void ConvertFp16ToFp32(OrtAllocator& allocator, OrtValue& in, std::unique_ptr<Or
auto* fp32 = p_out->GetTensorMutableData<float>();

switch (device_type) {
case DeviceType::WEBGPU:
case DeviceType::DML:
// DML doesn't currently support on-device scoring, so we fall back to the CPU
// DML and WebGPU don't currently support on-device scoring, so we fall back to the CPU
case DeviceType::CPU:
for (int i = 0; i < count; i++)
fp32[i] = FastFloat16ToFloat32(fp16[i]);
@@ -617,7 +630,7 @@ std::unique_ptr<OrtValue> Model::ExpandInputs(std::unique_ptr<OrtValue>& input,

// If we're on CUDA, we still want to do the copy to move the data over to CUDA memory where we will read from it later.
// DML doesn't currently support on-device scoring, so we go the same route as the CPU
if (num_beams == 1 && (device_type_ == DeviceType::CPU || device_type_ == DeviceType::DML)) {
if (num_beams == 1 && (device_type_ == DeviceType::CPU || device_type_ == DeviceType::DML || device_type_ == DeviceType::WEBGPU)) {
return std::move(input);
}

@@ -637,8 +650,9 @@
auto* target = expanded_data;

switch (device_type_) {
case DeviceType::WEBGPU:
case DeviceType::DML:
// DML doesn't currently support on-device scoring, so we use the CPU for non-cache inputs/outputs
// DML and WebGPU don't currently support on-device scoring, so we use the CPU for non-cache inputs/outputs
case DeviceType::CPU:
for (int i = 0; i < batch_size; i++) {
for (int j = 0; j < num_beams; j++) {
10 changes: 8 additions & 2 deletions src/models/model.h
@@ -146,7 +146,8 @@ struct Model : std::enable_shared_from_this<Model>, LeakChecked<Model> {
cuda_stream_holder cuda_stream_;
DeviceType device_type_{DeviceType::CPU};
Ort::Allocator& allocator_cpu_{Ort::Allocator::GetWithDefaultOptions()};
Ort::Allocator* allocator_device_{}; // Can be CUDA or CPU based on the DeviceType in the model
Ort::Allocator* allocator_device_{}; // Can be CUDA or CPU based on the DeviceType in the model
Ort::Allocator* allocator_kvcache_{};  // keep the kv_cache allocator separate so that only the kv_cache is on device

std::unique_ptr<SessionInfo> session_info_;

@@ -178,9 +179,14 @@ struct Model : std::enable_shared_from_this<Model>, LeakChecked<Model> {
std::unique_ptr<DmlReadbackHeap> dml_readback_heap_;
ComPtr<IDMLDevice> dml_device_;
std::unique_ptr<Ort::Allocator> dml_owned_allocator_;
#endif
#if USE_WEBGPU
std::unique_ptr<Ort::Allocator> webgpu_owned_allocator_;
std::unique_ptr<OrtIoBinding> webgpu_io_binding_;
#endif
#if USE_DML || USE_WEBGPU
std::unique_ptr<OrtMemoryInfo> memory_info_device_;
#endif

std::shared_ptr<CapturedGraphPool> captured_graph_pool_;
std::map<std::string, std::unique_ptr<OrtSessionOptions>> pipeline_session_options_;
};