
Commit

…into abjindal/set_terminate
ajindal1 committed Oct 23, 2024
2 parents f75e180 + 28f77a3 commit aec8564
Showing 21 changed files with 329 additions and 99 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/mac-cpu-arm64-build.yml
@@ -21,6 +21,10 @@ jobs:
with:
submodules: true

- uses: actions/setup-python@v5
with:
python-version: '3.12.x'

- name: Get the Latest OnnxRuntime Nightly Version
run: |
ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion')
@@ -68,6 +72,7 @@ jobs:
- name: Run the python tests
run: |
source genai-macos-venv/bin/activate
export HF_TOKEN="12345"
export ORTGENAI_LOG_ORT_LIB=1
python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models
2 changes: 0 additions & 2 deletions .pipelines/codeql.yaml
@@ -41,7 +41,6 @@ stages:
- task: onebranch.pipeline.tsaoptions@1
displayName: 'OneBranch TSAOptions'
inputs:
tsaConfigFilePath: '$(Build.Repository.LocalPath)\.config\tsaoptions.json'
appendSourceBranchName: false
- task: CredScan@3
displayName: 🔍 Run CredScan
@@ -65,4 +64,3 @@ stages:
continueOnError: true
inputs:
GdnPublishTsaOnboard: true
GdnPublishTsaConfigFile: '$(Build.Repository.LocalPath)\.config\tsaoptions.json'
1 change: 0 additions & 1 deletion .pipelines/stages/jobs/steps/capi-win-step.yml
@@ -26,7 +26,6 @@ steps:
- task: onebranch.pipeline.tsaoptions@1
displayName: 'OneBranch TSAOptions'
inputs:
tsaConfigFilePath: '$(Build.Repository.LocalPath)\.config\tsaoptions.json'
appendSourceBranchName: false

- template: utils/set-nightly-build-option-variable.yml
3 changes: 1 addition & 2 deletions .pipelines/stages/jobs/steps/compliant-and-cleanup-step.yml
@@ -8,10 +8,9 @@ steps:

- task: TSAUpload@2
displayName: 'TSA upload'
condition: and(succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main'))
condition: and(succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main'), ne(variables['os'], 'osx'), ne(variables['os'], 'ios')) # Not available on macOS.
inputs:
GdnPublishTsaOnboard: false
GdnPublishTsaConfigFile: '$(Build.Repository.LocalPath)\.config\tsaoptions.json'
continueOnError: true

- template: compliant/component-governance-component-detection-step.yml
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -36,6 +36,8 @@ include(cmake/check_cuda.cmake)
include(cmake/check_rocm.cmake)
# Checking if DML is supported
include(cmake/check_dml.cmake)
# Checking if WebGpu is supported
include(cmake/check_webgpu.cmake)

include(cmake/cxx_standard.cmake)

3 changes: 3 additions & 0 deletions build.py
@@ -126,6 +126,8 @@ class HelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescript

parser.add_argument("--use_rocm", action="store_true", help="Whether to use ROCm. Default is to not use rocm.")

parser.add_argument("--use_webgpu", action="store_true", help="Whether to use WebGpu. Default is to not use WebGpu.")

parser.add_argument("--use_dml", action="store_true", help="Whether to use DML. Default is to not use DML.")

# The following options are mutually exclusive (cross compiling options such as android, ios, etc.)
@@ -471,6 +473,7 @@ def update(args: argparse.Namespace, env: dict[str, str]):
"-DCMAKE_POSITION_INDEPENDENT_CODE=ON",
f"-DUSE_CUDA={'ON' if args.use_cuda else 'OFF'}",
f"-DUSE_ROCM={'ON' if args.use_rocm else 'OFF'}",
f"-DUSE_WEBGPU={'ON' if args.use_webgpu else 'OFF'}",
f"-DUSE_DML={'ON' if args.use_dml else 'OFF'}",
f"-DENABLE_JAVA={'ON' if args.build_java else 'OFF'}",
f"-DBUILD_WHEEL={build_wheel}",
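With --use_webgpu now forwarded to CMake as -DUSE_WEBGPU, a WebGPU-enabled build should presumably be produced by passing that flag to build.py (for example, python3 build.py --use_webgpu together with the usual options); the exact invocation is not part of this diff.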
6 changes: 6 additions & 0 deletions cmake/check_webgpu.cmake
@@ -0,0 +1,6 @@

if(USE_WEBGPU)
add_compile_definitions(USE_WEBGPU=1)
else()
add_compile_definitions(USE_WEBGPU=0)
endif()
1 change: 1 addition & 0 deletions cmake/options.cmake
@@ -4,6 +4,7 @@ include(CMakeDependentOption)
option(USE_CUDA "Build with CUDA support" ON)
option(USE_ROCM "Build with ROCm support" ON)
option(USE_DML "Build with DML support" OFF)
option(USE_WEBGPU "Build with WEBGPU support" ON)

# bindings
option(ENABLE_JAVA "Build the Java API." OFF)
9 changes: 7 additions & 2 deletions src/config.cpp
@@ -33,8 +33,13 @@ struct ProviderOptionsObject_Element : JSON::Element {
explicit ProviderOptionsObject_Element(std::vector<Config::ProviderOptions>& v) : v_{v} {}

JSON::Element& OnObject(std::string_view name) override {
if (options_element_)
throw std::runtime_error("Each object in the provider_options array can only have one member (named value)");
for (auto& v : v_) {
if (v.name == name) {
options_element_ = std::make_unique<ProviderOptions_Element>(v);
return *options_element_;
}
}

auto& options = v_.emplace_back();
options.name = name;
options_element_ = std::make_unique<ProviderOptions_Element>(options);
2 changes: 2 additions & 0 deletions src/generators.cpp
@@ -198,6 +198,8 @@ std::string to_string(DeviceType device_type) {
return "CUDA";
case DeviceType::DML:
return "DirectML";
case DeviceType::WEBGPU:
return "WebGpu";
}
throw std::runtime_error("Unknown device type");
}
1 change: 1 addition & 0 deletions src/generators.h
@@ -54,6 +54,7 @@ enum struct DeviceType {
CPU,
CUDA,
DML,
WEBGPU,
};

std::string to_string(DeviceType device_type);
14 changes: 12 additions & 2 deletions src/java/src/main/java/ai/onnxruntime/genai/Tensor.java
@@ -5,12 +5,16 @@
package ai.onnxruntime.genai;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public final class Tensor implements AutoCloseable {
private long nativeHandle = -1;
private long nativeHandle = 0;
private final ElementType elementType;
private final long[] shape;

// Buffer that owns the Tensor data.
private ByteBuffer dataBuffer = null;

// The values in this enum must match ONNX values
// https://github.com/onnx/onnx/blob/159fa47b7c4d40e6d9740fcf14c36fff1d11ccd8/onnx/onnx.proto#L499-L544
public enum ElementType {
@@ -33,7 +37,7 @@ public enum ElementType {
/**
* Constructs a Tensor with the given data, shape and element type.
*
* @param data The data for the Tensor. Must be a direct ByteBuffer.
* @param data The data for the Tensor. Must be a direct ByteBuffer with native byte order.
* @param shape The shape of the Tensor.
* @param elementType The type of elements in the Tensor.
* @throws GenAIException
@@ -51,8 +55,14 @@ public Tensor(ByteBuffer data, long[] shape, ElementType elementType) throws Gen
"Tensor data must be direct. Allocate with ByteBuffer.allocateDirect");
}

// for now, require native byte order as the bytes will be used directly.
if (data.order() != ByteOrder.nativeOrder()) {
throw new GenAIException("Tensor data must have native byte order.");
}

this.elementType = elementType;
this.shape = shape;
this.dataBuffer = data; // save a reference so the owning buffer will stay around.

nativeHandle = createTensor(data, shape, elementType.ordinal());
}
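Taken together, the constructor now requires a direct ByteBuffer in native byte order, and it keeps a reference to that buffer so the data stays alive for the lifetime of the Tensor. A minimal usage sketch under those assumptions (the float32 element-type constant and the harness class are illustrative, not taken from this diff):

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

import ai.onnxruntime.genai.Tensor;

public final class TensorUsageSketch {
  public static void main(String[] args) throws Exception {
    long[] shape = {1, 4};

    // Direct allocation with native byte order; a heap buffer or a
    // non-native order is now rejected with a GenAIException.
    ByteBuffer data = ByteBuffer.allocateDirect(4 * Float.BYTES)
        .order(ByteOrder.nativeOrder());
    data.asFloatBuffer().put(new float[] {0.1f, 0.2f, 0.3f, 0.4f});

    // The constructor stores a reference to 'data', so the buffer is not
    // garbage collected while the Tensor is still alive.
    try (Tensor tensor = new Tensor(data, shape, Tensor.ElementType.float32)) {
      // ... hand the tensor to the rest of the GenAI API as needed ...
    }
  }
}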
3 changes: 2 additions & 1 deletion src/models/input_ids.cpp
@@ -145,7 +145,8 @@ void InputIDs::Update(RoamingArray<int32_t> next_tokens_unk) {
input_ids_cast_command_list_state_);
#endif
} break;
case DeviceType::CPU: {
default: {
// CPU, WEBGPU
auto* data = value_->GetTensorMutableData<int64_t>();
auto next_tokens = next_tokens_unk.GetCPU();
for (int i = 0; i < shape_[0]; i++) {
20 changes: 10 additions & 10 deletions src/models/kv_cache.cpp
@@ -34,11 +34,11 @@ KV_Cache_Combined::KV_Cache_Combined(State& state)
// Derive the KV data type from the KV input 0
type_ = model_.session_info_->GetInputDataType(input_name_strings_[0]);

empty_past_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_);
empty_past_ = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_);
shape_[3] = state_.params_->sequence_length;

for (int i = 0; i < layer_count_; ++i) {
presents_.push_back(OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_));
presents_.push_back(OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_));
}
}

@@ -67,7 +67,7 @@ void KV_Cache_Combined::Update(std::span<const int32_t> beam_indices, int curren

shape_[3] = current_length;
for (int i = 0; i < layer_count_; i++) {
presents_[i] = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_);
presents_[i] = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_);
state_.inputs_[input_index_ + i] = pasts_[i].get();
state_.outputs_[output_index_ + i] = presents_[i].get();
}
@@ -81,7 +81,7 @@ void KV_Cache_Combined::PickPastState(std::span<const int32_t> beam_indices, int
auto element_count = shape_[0] * past_key_size;

const OrtValue& present = *presents_[index];
std::unique_ptr<OrtValue> past = OrtValue::CreateTensor<ScoreType>(*model_.allocator_device_, shape_);
std::unique_ptr<OrtValue> past = OrtValue::CreateTensor<ScoreType>(*model_.allocator_kvcache_, shape_);
auto past_span = std::span<ScoreType>(past->GetTensorMutableData<ScoreType>(), element_count);
auto present_span = std::span<const ScoreType>(present.GetTensorData<ScoreType>(), element_count);

@@ -149,7 +149,7 @@ KV_Cache::KV_Cache(State& state)
// Derive the KV data type from the KV input 0
type_ = model_.session_info_->GetInputDataType(input_name_strings_[0]);

empty_past_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_);
empty_past_ = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_);

// Set the size after empty_past_ has been created with 0 for this field
if (past_present_share_buffer_)
@@ -167,7 +167,7 @@ KV_Cache::KV_Cache(State& state)

for (int i = 0; i < layer_count_ * 2; ++i) {
presents_.push_back(
sb_kv_caches_.empty() ? OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_)
sb_kv_caches_.empty() ? OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_)
: sb_kv_caches_[i]->CreateTensorOnStaticBuffer(shape_, type_));
}
}
@@ -216,7 +216,7 @@ void KV_Cache::Update(std::span<const int32_t> beam_indices, int current_length)

shape_[2] = current_length;
for (int i = 0; i < layer_count_ * 2; i++) {
presents_[i] = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_);
presents_[i] = OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_);
state_.outputs_[output_index_ + i] = presents_[i].get();
}
}
@@ -228,7 +228,7 @@ void KV_Cache::PickPastState(std::span<const int32_t> beam_indices, int index) {
auto element_count = shape_[0] * block_size_per_beam;

const OrtValue& present_value = *presents_[index];
std::unique_ptr<OrtValue> past_value = OrtValue::CreateTensor<ScoreType>(*model_.allocator_device_, shape_);
std::unique_ptr<OrtValue> past_value = OrtValue::CreateTensor<ScoreType>(*model_.allocator_kvcache_, shape_);
auto past_span = std::span<ScoreType>(past_value->GetTensorMutableData<ScoreType>(), element_count);
auto present_span = std::span<const ScoreType>(present_value.GetTensorData<ScoreType>(), element_count);

@@ -280,8 +280,8 @@ Cross_Cache::Cross_Cache(State& state)
type_ = model_.session_info_->GetInputDataType(input_name_strings_[0]);

for (int i = 0; i < layer_count_; ++i) {
values_.push_back(OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_));
values_.push_back(OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_));
values_.push_back(OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_));
values_.push_back(OrtValue::CreateTensor(*model_.allocator_kvcache_, shape_, type_));
}
}

4 changes: 2 additions & 2 deletions src/models/logits.cpp
@@ -98,8 +98,8 @@ RoamingArray<float> Logits::Get() {
#endif
} break;

case DeviceType::CPU:
case DeviceType::CUDA: {
default: {
// CPU, CUDA, WEBGPU
auto logits_raw = std::span<const uint8_t>{output_raw_->GetTensorMutableData<uint8_t>(), element_count * element_size};
auto logits_last_tokens = std::span<uint8_t>{logits_of_last_token->GetTensorMutableData<uint8_t>(), element_count_last_token * element_size};
auto target = logits_last_tokens.subspan(vocab_index * element_size, vocab_size * element_size);
26 changes: 20 additions & 6 deletions src/models/model.cpp
@@ -294,14 +294,23 @@ void Model::InitDeviceAllocator([[maybe_unused]] OrtSession& session) {
if (device_type_ == DeviceType::CUDA) {
allocator_device_ = GetCudaAllocator(session);
}
#elif USE_DML
#endif
#if USE_DML
if (device_type_ == DeviceType::DML) {
memory_info_device_ = OrtMemoryInfo::Create("DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault);
dml_owned_allocator_ = Ort::Allocator::Create(session, *memory_info_device_);
allocator_device_ = dml_owned_allocator_.get();
}
#endif

allocator_kvcache_ = allocator_device_;
#if USE_WEBGPU
if (device_type_ == DeviceType::WEBGPU) {
// for webgpu we only use device memory for kv_cache
memory_info_device_ = OrtMemoryInfo::Create("WebGPU_Buffer", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault);
webgpu_owned_allocator_ = Ort::Allocator::Create(session, *memory_info_device_);
allocator_kvcache_ = webgpu_owned_allocator_.get();
}
#endif
session_info_ = std::make_unique<SessionInfo>(session);
captured_graph_pool_ = std::make_shared<CapturedGraphPool>(config_.get(), session_info_.get(), allocator_device_);
}
@@ -473,8 +482,11 @@ void Model::CreateSessionOptionsFromConfig(const Config::SessionOptions& config_
for (auto& option : provider_options.options) {
opts.emplace(option.first, option.second);
}

session_options.AppendExecutionProvider("QNN", opts);
} else if (provider_options.name == "web") {
device_type_ = DeviceType::WEBGPU;
std::unordered_map<std::string, std::string> opts;
session_options.AppendExecutionProvider("WebGPU", opts);
} else
throw std::runtime_error("Unknown provider type: " + provider_options.name);
}
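For context: with this branch, WebGPU is opted into from the model's configuration rather than implied by the build flags alone. A provider_options entry named "web" switches the device type to WEBGPU and appends the "WebGPU" execution provider, mirroring how the preceding branch handles QNN; the corresponding configuration-file layout is not shown in this diff.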
@@ -556,8 +568,9 @@ void ConvertFp16ToFp32(OrtAllocator& allocator, OrtValue& in, std::unique_ptr<Or
auto* fp32 = p_out->GetTensorMutableData<float>();

switch (device_type) {
case DeviceType::WEBGPU:
case DeviceType::DML:
// DML doesn't currently support on-device scoring, so we fall back to the CPU
// DML and WebGPU don't currently support on-device scoring, so we fall back to the CPU
case DeviceType::CPU:
for (int i = 0; i < count; i++)
fp32[i] = FastFloat16ToFloat32(fp16[i]);
@@ -617,7 +630,7 @@ std::unique_ptr<OrtValue> Model::ExpandInputs(std::unique_ptr<OrtValue>& input,

// If we're on CUDA, we still want to do the copy to move the data over to CUDA memory where we will read from it later.
// DML doesn't currently support on-device scoring, so we go the same route as the CPU
if (num_beams == 1 && (device_type_ == DeviceType::CPU || device_type_ == DeviceType::DML)) {
if (num_beams == 1 && (device_type_ == DeviceType::CPU || device_type_ == DeviceType::DML || device_type_ == DeviceType::WEBGPU)) {
return std::move(input);
}

@@ -637,8 +650,9 @@
auto* target = expanded_data;

switch (device_type_) {
case DeviceType::WEBGPU:
case DeviceType::DML:
// DML doesn't currently support on-device scoring, so we use the CPU for non-cache inputs/outputs
// DML and WebGPU don't currently support on-device scoring, so we use the CPU for non-cache inputs/outputs
case DeviceType::CPU:
for (int i = 0; i < batch_size; i++) {
for (int j = 0; j < num_beams; j++) {
10 changes: 8 additions & 2 deletions src/models/model.h
@@ -146,7 +146,8 @@ struct Model : std::enable_shared_from_this<Model>, LeakChecked<Model> {
cuda_stream_holder cuda_stream_;
DeviceType device_type_{DeviceType::CPU};
Ort::Allocator& allocator_cpu_{Ort::Allocator::GetWithDefaultOptions()};
Ort::Allocator* allocator_device_{}; // Can be CUDA or CPU based on the DeviceType in the model
Ort::Allocator* allocator_device_{}; // Can be CUDA or CPU based on the DeviceType in the model
Ort::Allocator* allocator_kvcache_{};  // keep the kv_cache allocator separate so that only the kv_cache is on device

std::unique_ptr<SessionInfo> session_info_;

@@ -178,9 +179,14 @@ struct Model : std::enable_shared_from_this<Model>, LeakChecked<Model> {
std::unique_ptr<DmlReadbackHeap> dml_readback_heap_;
ComPtr<IDMLDevice> dml_device_;
std::unique_ptr<Ort::Allocator> dml_owned_allocator_;
#endif
#if USE_WEBGPU
std::unique_ptr<Ort::Allocator> webgpu_owned_allocator_;
std::unique_ptr<OrtIoBinding> webgpu_io_binding_;
#endif
#if USE_DML || USE_WEBGPU
std::unique_ptr<OrtMemoryInfo> memory_info_device_;
#endif

std::shared_ptr<CapturedGraphPool> captured_graph_pool_;
std::map<std::string, std::unique_ptr<OrtSessionOptions>> pipeline_session_options_;
};