csukuangfj committed
Commit 11ce2d8
Parent: bc4ec38

add 1.23.1

v1.23.1/headers/cpu_provider_factory.h ADDED
@@ -0,0 +1,19 @@
+ // Copyright (c) Microsoft Corporation. All rights reserved.
+ // Licensed under the MIT License.
+
+ #include "onnxruntime_c_api.h"
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ /**
+  * \param use_arena zero: false. non-zero: true.
+  */
+ ORT_EXPORT
+ ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_CPU, _In_ OrtSessionOptions* options, int use_arena)
+ ORT_ALL_ARGS_NONNULL;
+
+ #ifdef __cplusplus
+ }
+ #endif
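For orientation, a minimal usage sketch (not part of this diff; it assumes `session_options` is a valid OrtSessionOptions* created via the ORT C API):

    #include "cpu_provider_factory.h"

    // Append the CPU EP; use_arena is zero for false, non-zero for true.
    OrtStatus* status = OrtSessionOptionsAppendExecutionProvider_CPU(session_options, /*use_arena=*/1);
    if (status != NULL) {
      // Inspect via OrtApi::GetErrorMessage and release via OrtApi::ReleaseStatus.
    }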
v1.23.1/headers/nnapi_provider_factory.h ADDED
@@ -0,0 +1,62 @@
+ // Copyright (c) Microsoft Corporation. All rights reserved.
+ // Licensed under the MIT License.
+ #pragma once
+
+ #include "onnxruntime_c_api.h"
+
+ // NNAPIFlags are bool options we want to set for the NNAPI EP.
+ // This enum is defined as bit flags and cannot have negative values.
+ // To generate a uint32_t nnapi_flags for use with OrtSessionOptionsAppendExecutionProvider_Nnapi below:
+ //   uint32_t nnapi_flags = 0;
+ //   nnapi_flags |= NNAPI_FLAG_USE_FP16;
+ enum NNAPIFlags {
+   NNAPI_FLAG_USE_NONE = 0x000,
+
+   // Use fp16 relaxation in the NNAPI EP. This may improve performance but may also reduce precision.
+   NNAPI_FLAG_USE_FP16 = 0x001,
+
+   // Use the NCHW layout in the NNAPI EP. This is only available from Android API level 29.
+   // Please note that, for now, NNAPI performs worse using NCHW compared to using NHWC.
+   NNAPI_FLAG_USE_NCHW = 0x002,
+
+   // Prevent NNAPI from using CPU devices.
+   //
+   // NNAPI is more efficient using GPU or NPU for execution, and NNAPI might fall back to its own CPU
+   // implementation for operations not supported by GPU/NPU. The CPU implementation of NNAPI (which is called
+   // nnapi-reference) might be less efficient than ORT's optimized versions of the operations. It might be
+   // advantageous to disable the NNAPI CPU fallback and handle execution using ORT kernels.
+   //
+   // For some models, if NNAPI would use CPU to execute an operation, and this flag is set, the execution of
+   // the model may fall back to ORT kernels.
+   //
+   // This option is only available from Android API level 29, and will be ignored for Android API level 28
+   // and earlier.
+   //
+   // For NNAPI device assignments, see https://developer.android.com/ndk/guides/neuralnetworks#device-assignment
+   // For NNAPI CPU fallback, see https://developer.android.com/ndk/guides/neuralnetworks#cpu-fallback
+   //
+   // Please note, the NNAPI EP will return an error status if both the NNAPI_FLAG_CPU_DISABLED
+   // and NNAPI_FLAG_CPU_ONLY flags are set.
+   NNAPI_FLAG_CPU_DISABLED = 0x004,
+
+   // Use CPU only in the NNAPI EP. This may decrease performance but will provide
+   // reference output values without precision loss, which is useful for validation.
+   //
+   // Please note, the NNAPI EP will return an error status if both the NNAPI_FLAG_CPU_DISABLED
+   // and NNAPI_FLAG_CPU_ONLY flags are set.
+   NNAPI_FLAG_CPU_ONLY = 0x008,
+
+   // Keep NNAPI_FLAG_LAST at the end of the enum definition
+   // and assign the last NNAPIFlag to it.
+   NNAPI_FLAG_LAST = NNAPI_FLAG_CPU_ONLY,
+ };
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ ORT_EXPORT ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_Nnapi,
+                           _In_ OrtSessionOptions* options, uint32_t nnapi_flags);
+
+ #ifdef __cplusplus
+ }
+ #endif
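A short usage sketch (again not part of the diff; `session_options` is assumed valid), following the flag-composition pattern described in the comments above:

    #include "nnapi_provider_factory.h"

    uint32_t nnapi_flags = 0;
    nnapi_flags |= NNAPI_FLAG_USE_FP16;      // allow fp16 relaxation
    nnapi_flags |= NNAPI_FLAG_CPU_DISABLED;  // let unsupported ops fall back to ORT kernels
    OrtStatus* status = OrtSessionOptionsAppendExecutionProvider_Nnapi(session_options, nnapi_flags);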
v1.23.1/headers/onnxruntime_c_api.h ADDED
The diff for this file is too large to render. See raw diff
 
v1.23.1/headers/onnxruntime_cxx_api.h ADDED
The diff for this file is too large to render. See raw diff
 
v1.23.1/headers/onnxruntime_cxx_inline.h ADDED
The diff for this file is too large to render. See raw diff
 
v1.23.1/headers/onnxruntime_ep_c_api.h ADDED
@@ -0,0 +1,988 @@
+ // Copyright (c) Microsoft Corporation. All rights reserved.
+ // Licensed under the MIT License.
+
+ // Do not include this file directly. Please include "onnxruntime_c_api.h" instead.
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ ORT_RUNTIME_CLASS(Ep);
+ ORT_RUNTIME_CLASS(EpFactory);
+ ORT_RUNTIME_CLASS(EpGraphSupportInfo);
+ ORT_RUNTIME_CLASS(MemoryDevice);  // opaque class to wrap onnxruntime::OrtDevice
+ ORT_RUNTIME_CLASS(NodeComputeContext);
+
+ ORT_RUNTIME_CLASS(DataTransferImpl);
+ ORT_RUNTIME_CLASS(SyncNotificationImpl);
+ ORT_RUNTIME_CLASS(SyncStreamImpl);
+
+ // Struct that an EP implements for IDataTransfer to copy between devices it uses and CPU.
+ struct OrtDataTransferImpl {
+   uint32_t ort_version_supported;  ///< Must be initialized to ORT_API_VERSION
+
+   /** \brief Release the OrtDataTransferImpl instance.
+    *
+    * This is called by ORT when the OrtDataTransferImpl instance is no longer needed.
+    * The implementation should release any resources held by the instance.
+    *
+    * \param[in] this_ptr Pointer to the OrtDataTransferImpl instance.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(void, Release, _In_ OrtDataTransferImpl* this_ptr);
+
+   /** \brief Check if the implementation can copy between the source and destination memory devices.
+    *
+    * \param[in] this_ptr Pointer to the OrtDataTransferImpl instance.
+    * \param[in] src_memory_device Source OrtMemoryDevice to copy from.
+    * \param[in] dst_memory_device Destination OrtMemoryDevice to copy to.
+    * \return True if the implementation can copy between the devices.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(bool, CanCopy, _In_ const OrtDataTransferImpl* this_ptr,
+             _In_ const OrtMemoryDevice* src_memory_device, _In_ const OrtMemoryDevice* dst_memory_device);
+
+   /** \brief Copy tensors from src_tensors to dst_tensors using the provided streams.
+    *
+    * The implementation can use the provided streams to perform asynchronous copies if supported.
+    * If a stream is not available, the copy is performed synchronously.
+    *
+    * \param[in] this_ptr Pointer to the OrtDataTransferImpl instance.
+    * \param[in] src_tensors Array of source OrtValue pointers to copy from.
+    * \param[in] dst_tensors Array of destination OrtValue pointers to copy to.
+    * \param[in] streams Array of OrtSyncStream pointers for the copy operations, if the execution provider is
+    *                    stream aware. nullptr if it is not.
+    * \param[in] num_tensors Number of tensors to copy.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(CopyTensors, _In_ OrtDataTransferImpl* this_ptr,
+                   _In_reads_(num_tensors) const OrtValue** src_tensors,
+                   _In_reads_(num_tensors) OrtValue** dst_tensors,
+                   _In_reads_(num_tensors) OrtSyncStream** streams,
+                   _In_ size_t num_tensors);
+ };
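To make the function-table shape concrete, a hedged sketch of how an EP library might wire up an OrtDataTransferImpl. `MyDataTransfer` and the `*Impl` helpers are hypothetical names; only the fields and signatures come from the struct above:

    // Hypothetical EP-side implementation; a sketch, not ORT's own code.
    struct MyDataTransfer : OrtDataTransferImpl {
      MyDataTransfer() {
        ort_version_supported = ORT_API_VERSION;
        Release = ReleaseImpl;
        CanCopy = CanCopyImpl;
        CopyTensors = CopyTensorsImpl;
      }
      static void ORT_API_CALL ReleaseImpl(OrtDataTransferImpl* this_ptr) {
        delete static_cast<MyDataTransfer*>(this_ptr);
      }
      static bool ORT_API_CALL CanCopyImpl(const OrtDataTransferImpl* /*this_ptr*/,
                                           const OrtMemoryDevice* /*src*/, const OrtMemoryDevice* /*dst*/) {
        return false;  // e.g., decide via OrtEpApi::MemoryDevice_GetDeviceType / MemoryDevice_AreEqual
      }
      static OrtStatus* ORT_API_CALL CopyTensorsImpl(OrtDataTransferImpl* /*this_ptr*/,
                                                     const OrtValue** /*src_tensors*/, OrtValue** /*dst_tensors*/,
                                                     OrtSyncStream** /*streams*/, size_t /*num_tensors*/) {
        return nullptr;  // nullptr == success; perform the (possibly async) copies here
      }
    };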
+
+ /** \brief Struct that an EP implements for Stream Notifications.
+  *
+  * \since Version 1.23.
+  */
+ struct OrtSyncNotificationImpl {
+   uint32_t ort_version_supported;  ///< Must be initialized to ORT_API_VERSION
+
+   /** \brief Release the OrtSyncNotificationImpl instance.
+    *
+    * This is called by ORT when the OrtSyncNotificationImpl instance is no longer needed.
+    * The implementation should release any resources held by the instance.
+    *
+    * \param[in] this_ptr Pointer to the OrtSyncNotificationImpl instance.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(void, Release, _In_ OrtSyncNotificationImpl* this_ptr);
+
+   /** \brief Called by ORT to activate the notification.
+    *
+    * \param[in] this_ptr Pointer to the OrtSyncNotificationImpl instance.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(Activate, _In_ OrtSyncNotificationImpl* this_ptr);
+
+   /** \brief Wait for a device to device operation to complete.
+    *
+    * \param[in] this_ptr Pointer to the OrtSyncNotificationImpl instance.
+    * \param[in] consumer_stream The OrtSyncStream instance that will wait on this notification to be activated.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(WaitOnDevice, _In_ OrtSyncNotificationImpl* this_ptr, _In_ OrtSyncStream* consumer_stream);
+
+   /** \brief Wait for a device to host operation to complete.
+    *
+    * \param[in] this_ptr Pointer to the OrtSyncNotificationImpl instance.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(WaitOnHost, _In_ OrtSyncNotificationImpl* this_ptr);
+ };
+
+ /** \brief Struct that an EP implements if it wishes to implement Stream support.
+  *
+  * This struct provides the overrides for onnxruntime::Stream's virtual methods.
+  *
+  * \since Version 1.23.
+  */
+ struct OrtSyncStreamImpl {
+   uint32_t ort_version_supported;  ///< Must be initialized to ORT_API_VERSION
+
+   /** \brief Release the OrtSyncStreamImpl instance.
+    *
+    * This is called by ORT when the OrtSyncStreamImpl instance is no longer needed.
+    * The implementation should release any resources held by the instance.
+    *
+    * \param[in] this_ptr Pointer to the OrtSyncStreamImpl instance.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(void, Release, _In_ OrtSyncStreamImpl* this_ptr);
+
+   /** \brief Get the handle of the stream.
+    *
+    * This returns the native handle for the stream, e.g., cudaStream_t for CUDA streams.
+    *
+    * \param[in] this_ptr Pointer to the OrtSyncStreamImpl instance.
+    * \return The handle of the stream.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(void*, GetHandle, _In_ OrtSyncStreamImpl* this_ptr);
+
+   /** \brief Create an OrtSyncNotificationImpl for the OrtSyncStreamImpl instance.
+    *
+    * \param[in] this_ptr Pointer to the OrtSyncStreamImpl instance.
+    * \param[out] notification The new OrtSyncNotificationImpl instance.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(CreateNotification, _In_ OrtSyncStreamImpl* this_ptr,
+                   _Outptr_ OrtSyncNotificationImpl** notification);
+
+   /** \brief Flush the stream.
+    *
+    * This is called by ORT to flush the stream, ensuring that all operations submitted to the stream are
+    * completed.
+    *
+    * \param[in] this_ptr Pointer to the OrtSyncStreamImpl instance.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(Flush, _In_ OrtSyncStreamImpl* this_ptr);
+
+   /** \brief Notify the stream that a session run has ended.
+    *
+    * This is called by ORT to notify the stream that a session run has ended, allowing the stream to perform
+    * any necessary cleanup or finalization.
+    *
+    * \param[in] this_ptr Pointer to the OrtSyncStreamImpl instance.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(OnSessionRunEnd, _In_ OrtSyncStreamImpl* this_ptr);
+ };
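A similarly hedged sketch for the stream side; `MySyncStream` and its members are illustrative, with only the OrtSyncStreamImpl fields taken from the header:

    struct MySyncStream : OrtSyncStreamImpl {
      explicit MySyncStream(void* native_handle) : handle(native_handle) {
        ort_version_supported = ORT_API_VERSION;
        Release = ReleaseImpl;
        GetHandle = GetHandleImpl;
        // CreateNotification, Flush, and OnSessionRunEnd would be assigned the same way.
      }
      static void ORT_API_CALL ReleaseImpl(OrtSyncStreamImpl* this_ptr) {
        delete static_cast<MySyncStream*>(this_ptr);
      }
      static void* ORT_API_CALL GetHandleImpl(OrtSyncStreamImpl* this_ptr) {
        return static_cast<MySyncStream*>(this_ptr)->handle;  // e.g., a cudaStream_t
      }
      void* handle;  // the EP's native stream handle
    };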
+
+ struct OrtNodeFusionOptions;
+ typedef struct OrtNodeFusionOptions OrtNodeFusionOptions;
+
+ struct OrtNodeComputeInfo;
+ typedef struct OrtNodeComputeInfo OrtNodeComputeInfo;
+
+ /**
+  * \brief The OrtNodeFusionOptions struct specifies options for fusing nodes supported by an execution provider.
+  *
+  * Refer to OrtEpApi::EpGraphSupportInfo_AddNodesToFuse.
+  *
+  * \since Version 1.23.
+  */
+ struct OrtNodeFusionOptions {
+   /** \brief The ONNX Runtime version the OrtNodeFusionOptions was compiled with.
+    *
+    * Implementation should set to ORT_API_VERSION.
+    * ORT will use this to ensure it does not use members that were not available when the EP library was compiled.
+    *
+    * \since Version 1.23.
+    */
+   uint32_t ort_version_supported;
+
+   /** \brief If set to true, specifies that the execution provider does not require ONNX Runtime to provide
+    * constant initializers as inputs to the fused node during model inference. This is used when the execution
+    * provider saves a copy of constant initializers, and allows ONNX Runtime to release constant initializers
+    * that are not used by any execution provider.
+    *
+    * If not specified, defaults to false. That is, ONNX Runtime provides constant initializers as inputs to
+    * the fused node by default.
+    *
+    * \since Version 1.23.
+    */
+   bool drop_constant_initializers;
+
+   // const OrtNode* fused_node_schema;
+ };
+
+ /**
+  * \brief The OrtNodeComputeInfo struct provides functions that an OrtEp implements to specify the compute
+  *        function for a compiled OrtGraph instance.
+  * \since Version 1.23.
+  */
+ struct OrtNodeComputeInfo {
+   /** \brief The ONNX Runtime version the OrtNodeComputeInfo was compiled with.
+    *
+    * Implementation should set to ORT_API_VERSION.
+    * ORT will use this to ensure it does not call functions that were not available when the EP library was
+    * compiled.
+    *
+    * \since Version 1.23.
+    */
+   uint32_t ort_version_supported;
+
+   /** \brief Creates an opaque compute state object that is then passed to the Compute() function during inference.
+    * \param[in] this_ptr The OrtNodeComputeInfo instance.
+    * \param[in] compute_context OrtNodeComputeContext instance that contains the compiled/fused node's name and
+    *                            host memory allocation functions. Can optionally be used to build the compute
+    *                            state.
+    * \param[out] compute_state Output parameter that is assigned the opaque computation state. ONNX Runtime calls
+    *                           ReleaseState() (after calling Compute()) to allow the implementer to release the
+    *                           compute state.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.23.
+    */
+   OrtStatus*(ORT_API_CALL* CreateState)(_In_ OrtNodeComputeInfo* this_ptr,
+                                         _In_ OrtNodeComputeContext* compute_context,
+                                         _Outptr_ void** compute_state);
+
+   /** \brief Computation function called to execute the fused node compiled by an OrtEp instance.
+    * \param[in] this_ptr The OrtNodeComputeInfo instance.
+    * \param[in] compute_state The opaque computation state returned by CreateState().
+    * \param[in] kernel_context The OrtKernelContext instance used to access inputs/outputs.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.23.
+    */
+   OrtStatus*(ORT_API_CALL* Compute)(_In_ OrtNodeComputeInfo* this_ptr, _In_ void* compute_state,
+                                     _In_ OrtKernelContext* kernel_context);
+
+   /** \brief Releases the compute state returned by CreateState().
+    * \param[in] this_ptr The OrtNodeComputeInfo instance.
+    * \param[inout] compute_state The opaque compute state returned by CreateState().
+    *
+    * \since Version 1.23.
+    */
+   void(ORT_API_CALL* ReleaseState)(_In_ OrtNodeComputeInfo* this_ptr, _Frees_ptr_opt_ void* compute_state);
+ };
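A hedged sketch of filling in OrtNodeComputeInfo; `MyState` and the `My*` functions are hypothetical, while the field order and signatures come from the struct above:

    struct MyState { /* per-node compiled artifacts */ };

    static OrtStatus* ORT_API_CALL MyCreateState(OrtNodeComputeInfo* /*this_ptr*/,
                                                 OrtNodeComputeContext* /*compute_context*/,
                                                 void** compute_state) {
      *compute_state = new MyState();
      return nullptr;  // nullptr == success
    }

    static OrtStatus* ORT_API_CALL MyCompute(OrtNodeComputeInfo* /*this_ptr*/, void* compute_state,
                                             OrtKernelContext* /*kernel_context*/) {
      auto* state = static_cast<MyState*>(compute_state);
      (void)state;  // run the compiled kernel for this fused node here
      return nullptr;
    }

    static void ORT_API_CALL MyReleaseState(OrtNodeComputeInfo* /*this_ptr*/, void* compute_state) {
      delete static_cast<MyState*>(compute_state);
    }

    // Aggregate init matches the field order: version, CreateState, Compute, ReleaseState.
    OrtNodeComputeInfo my_compute_info{ORT_API_VERSION, MyCreateState, MyCompute, MyReleaseState};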
+
+ struct OrtEpApi {
+   /** \brief Create an OrtEpDevice for the EP and an OrtHardwareDevice.
+    * \param[in] ep_factory Execution provider factory that is creating the instance.
+    * \param[in] hardware_device Hardware device that the EP can utilize.
+    * \param[in] ep_metadata Optional OrtKeyValuePairs instance for execution provider metadata that may be used
+    *                        during execution provider selection and passed to CreateEp.
+    *                        ep_device will copy this instance and the user should call ReleaseKeyValuePairs.
+    * \param[in] ep_options Optional OrtKeyValuePairs instance for execution provider options that will be added
+    *                       to the Session configuration options if the execution provider is selected.
+    *                       ep_device will copy this instance and the user should call ReleaseKeyValuePairs.
+    * \param[out] ep_device The OrtEpDevice that is created.
+    *
+    * \since Version 1.22.
+    */
+   ORT_API2_STATUS(CreateEpDevice, _In_ OrtEpFactory* ep_factory,
+                   _In_ const OrtHardwareDevice* hardware_device,
+                   _In_opt_ const OrtKeyValuePairs* ep_metadata,
+                   _In_opt_ const OrtKeyValuePairs* ep_options,
+                   _Out_ OrtEpDevice** ep_device);
+
+   ORT_CLASS_RELEASE(EpDevice);
+
+   /** \brief Specify nodes that are supported by an OrtEp and should be fused into one node.
+    *
+    * Because the nodes will be fused into one "fused node", there must not exist an unsupported node in
+    * a path between two of the provided nodes. Otherwise, the graph will become invalid.
+    *
+    * This function can be called multiple times. A subsequent call to this function will force the next set of
+    * nodes to be fused into a different node.
+    *
+    * \param[in] graph_support_info OrtEpGraphSupportInfo instance to which to add the supported nodes.
+    * \param[in] nodes Array of nodes supported by the EP that should be fused/compiled.
+    * \param[in] num_nodes The number of supported nodes.
+    * \param[in] node_fusion_options Optional node fusion options. Ignored if set to NULL.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(EpGraphSupportInfo_AddNodesToFuse, _In_ OrtEpGraphSupportInfo* graph_support_info,
+                   _In_reads_(num_nodes) const OrtNode* const* nodes, _In_ size_t num_nodes,
+                   _In_opt_ const OrtNodeFusionOptions* node_fusion_options);
+
+   /** \brief Specify a node that is supported by an OrtEp and should be run with a registered EP kernel.
+    *
+    * \param[in] graph_support_info OrtEpGraphSupportInfo instance to which to add the supported node.
+    * \param[in] node The supported OrtNode instance.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(EpGraphSupportInfo_AddSingleNode, _In_ OrtEpGraphSupportInfo* graph_support_info,
+                   _In_ const OrtNode* node);
+
+   /** \brief Query an OrtNodeComputeContext for the name of the node that encapsulates the compiled/fused node.
+    *
+    * Used in OrtNodeComputeInfo::CreateState().
+    *
+    * \param[in] context The OrtNodeComputeContext instance to query.
+    * \return The node's name.
+    *
+    * \note The returned string is owned by ORT and valid only while OrtNodeComputeInfo::CreateState() is called.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(const char*, NodeComputeContext_NodeName, _In_ const OrtNodeComputeContext* context);
+
+   /** \brief Register an allocator with the OrtEpDevice.
+    *
+    * This allows an EP to provide OrtMemoryInfo for the DEFAULT and HOST_ACCESSIBLE memory types as needed.
+    * The registered values will be used in calls to OrtEpFactory::CreateAllocator to ensure the required
+    * allocator(s) are available for EP usage.
+    *
+    * Multiple calls for the same entry type will replace a previous entry.
+    *
+    * Available entries:
+    *   - OrtDeviceAllocator with type of OrtDeviceMemoryType_DEFAULT
+    *   - OrtDeviceAllocator with type of OrtDeviceMemoryType_HOST_ACCESSIBLE
+    *   - OrtReadOnlyAllocator with type of OrtDeviceMemoryType_DEFAULT
+    *     - If provided, this allocator will only be used to copy initializers to the device the EP uses.
+    *       ORT will use the OrtDeviceAllocator if not provided.
+    *
+    * \param[in] ep_device The OrtEpDevice instance to register the OrtMemoryInfo with.
+    * \param[in] allocator_memory_info The OrtMemoryInfo information for the allocator.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(EpDevice_AddAllocatorInfo, _In_ OrtEpDevice* ep_device,
+                   _In_ const OrtMemoryInfo* allocator_memory_info);
+
+   /** \brief Get the OrtMemoryDevice from an OrtMemoryInfo instance.
+    *
+    * This is required for OrtDataTransferImpl (which implements onnxruntime::IDataTransfer) where the
+    * OrtMemoryDevice is used in the CanCopy and CopyTensors functions.
+    *
+    * \param[in] memory_info The OrtMemoryInfo instance to get the memory device from.
+    * \return The OrtMemoryDevice associated with the OrtMemoryInfo instance.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(const OrtMemoryDevice*, MemoryInfo_GetMemoryDevice, _In_ const OrtMemoryInfo* memory_info);
+
+   /** \brief Get the OrtMemoryDevice from an OrtValue instance if it contains a Tensor.
+    *
+    * \param[in] value The OrtValue instance to get the memory device from.
+    * \return Memory device if the OrtValue contains a Tensor, nullptr otherwise.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(const OrtMemoryDevice*, Value_GetMemoryDevice, _In_ const OrtValue* value);
+
+   /** \brief Compare two OrtMemoryDevice instances for equality.
+    *
+    * This is used to check if two memory devices are the same.
+    * Used to implement DataTransferImpl::CanCopy.
+    *
+    * \param[in] a The first OrtMemoryDevice instance to compare.
+    * \param[in] b The second OrtMemoryDevice instance to compare.
+    * \return True if the two OrtMemoryDevice instances are equal, false otherwise.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(bool, MemoryDevice_AreEqual, _In_ const OrtMemoryDevice* a, _In_ const OrtMemoryDevice* b);
+
+   /** \brief Get the OrtMemoryInfoDeviceType value from an OrtMemoryDevice instance.
+    *
+    * \param[in] memory_device OrtMemoryDevice instance.
+    * \return The OrtMemoryInfoDeviceType value.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(OrtMemoryInfoDeviceType, MemoryDevice_GetDeviceType, _In_ const OrtMemoryDevice* memory_device);
+
+   /** \brief Get the OrtDeviceMemoryType value from an OrtMemoryDevice instance.
+    *
+    * \param[in] memory_device OrtMemoryDevice instance.
+    * \return The OrtDeviceMemoryType value.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(OrtDeviceMemoryType, MemoryDevice_GetMemoryType, _In_ const OrtMemoryDevice* memory_device);
+
+   /** \brief Get the vendor ID from an OrtMemoryDevice instance.
+    *
+    * The vendor ID is used to identify the vendor of the device, and is typically set to the PCI vendor ID.
+    *
+    * If the device is not vendor specific (e.g. CPU memory), the vendor ID is set to 0.
+    *
+    * \param[in] memory_device OrtMemoryDevice instance.
+    * \return The vendor ID value.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(uint32_t, MemoryDevice_GetVendorId, _In_ const OrtMemoryDevice* memory_device);
+
+   /** \brief Get the device ID from an OrtMemoryDevice instance.
+    *
+    * \param[in] memory_device OrtMemoryDevice instance.
+    * \return The device ID.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(uint32_t, MemoryDevice_GetDeviceId, _In_ const OrtMemoryDevice* memory_device);
+
+   /** \brief Get the OrtSyncStreamImpl associated with an OrtSyncStream instance.
+    *
+    * This allows the plugin library to connect its OrtSyncStreamImpl instance with an OrtSyncStream if needed.
+    *
+    * \param[in] stream The OrtSyncStream instance to find an OrtSyncStreamImpl for.
+    * \return The associated OrtSyncStreamImpl if found, nullptr otherwise.
+    *
+    * \since Version 1.23.
+    *
+    * \remarks There should always be an OrtSyncStreamImpl associated with an OrtSyncStream instance that the
+    *          EP gets.
+    */
+   ORT_API_T(const OrtSyncStreamImpl*, SyncStream_GetImpl, _In_ const OrtSyncStream* stream);
+
+   /** \brief Get the current sync ID for a stream.
+    *
+    * \param[in] stream The OrtSyncStream to get the sync ID for.
+    * \return Current sync ID.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(uint64_t, SyncStream_GetSyncId, _In_ const OrtSyncStream* stream);
+
+   /** \brief Get the sync ID for the last time the consumer_stream waited on the producer_stream.
+    *
+    * When two streams are synchronized, the sync ID represents the event used in that synchronization.
+    *
+    * \param[in] producer_stream The OrtSyncStream that produced the data.
+    * \param[in] consumer_stream The OrtSyncStream that waited on the producer_stream.
+    * \return ID for the last sync. 0 if no sync has occurred between the two streams.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(uint64_t, GetSyncIdForLastWaitOnSyncStream,
+             _In_ const OrtSyncStream* producer_stream, _In_ const OrtSyncStream* consumer_stream);
+ };
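As an illustration of the fusion entry points, a hedged helper that an EP's GetCapability might call once it has collected the nodes it supports. The helper name is hypothetical, and `ep_api` is assumed to have been obtained from the OrtApi beforehand:

    #include <vector>

    static OrtStatus* ReportFusedPartition(const OrtEpApi* ep_api,
                                           OrtEpGraphSupportInfo* graph_support_info,
                                           const std::vector<const OrtNode*>& supported_nodes) {
      OrtNodeFusionOptions fusion_options{};
      fusion_options.ort_version_supported = ORT_API_VERSION;
      fusion_options.drop_constant_initializers = true;  // assumes the EP keeps its own initializer copies
      return ep_api->EpGraphSupportInfo_AddNodesToFuse(graph_support_info,
                                                       supported_nodes.data(), supported_nodes.size(),
                                                       &fusion_options);
    }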
+
+ /**
+  * \brief The data layout type.
+  *
+  * EPs may specify a preferred data layout type. ORT's default layout type is OrtEpDataLayout_NCHW
+  * (aliased as OrtEpDataLayout_Default).
+  *
+  * \since Version 1.23.
+  */
+ typedef enum OrtEpDataLayout {
+   OrtEpDataLayout_NCHW = 0,
+   OrtEpDataLayout_NHWC,
+
+   OrtEpDataLayout_Default = OrtEpDataLayout_NCHW,
+ } OrtEpDataLayout;
+
+ /**
+  * \brief The OrtEp struct provides functions to implement for an execution provider.
+  * \since Version 1.22.
+  */
+ struct OrtEp {
+   /** \brief The ONNX Runtime version the execution provider was compiled with.
+    *
+    * Implementation should set to ORT_API_VERSION.
+    * ORT will use this to ensure it does not call functions that were not available when the library was compiled.
+    *
+    * \since Version 1.22.
+    */
+   uint32_t ort_version_supported;
+
+   /** \brief Get the execution provider name.
+    *
+    * The returned string should be a null-terminated, UTF-8 encoded string. ORT will copy it.
+    *
+    * \param[in] this_ptr The OrtEp instance.
+    * \return The execution provider name.
+    *
+    * \since Version 1.22.
+    */
+   ORT_API_T(const char*, GetName, _In_ const OrtEp* this_ptr);
+
+   /** \brief Get information about the nodes supported by the OrtEp instance.
+    *
+    * IMPORTANT: This is not the final version of this API function. This is currently experimental but will
+    * be stabilized by the ONNX Runtime 1.23 release.
+    *
+    * \param[in] this_ptr The OrtEp instance.
+    * \param[in] graph The OrtGraph instance for which to populate node support. The OrtGraph could be a nested
+    *                  subgraph contained by a node (e.g., an If or Loop node). ONNX Runtime calls this function
+    *                  separately for each nested subgraph.
+    * \param[inout] graph_support_info OrtEpGraphSupportInfo instance that the implementer must fill out in order
+    *                                  to specify the supported nodes.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(GetCapability, _In_ OrtEp* this_ptr, _In_ const OrtGraph* graph,
+                   _Inout_ OrtEpGraphSupportInfo* graph_support_info);
+
+   /** \brief Compile OrtGraph instances assigned to the OrtEp. The implementer must set an OrtNodeComputeInfo
+    *         instance for each OrtGraph in order to define its computation function.
+    *
+    * If the session is configured to generate a pre-compiled model, the execution provider must return EPContext
+    * nodes, as OrtNode instances, that ONNX Runtime uses to create a pre-compiled model, known as an
+    * "EPContext model". An EPContext model contains EPContext nodes. Each EPContext node encapsulates the
+    * pre-compiled binary data for an OrtGraph compiled for a specific execution provider. For more details about
+    * the EPContext design, refer to:
+    * \htmlonly
+    * <a href="https://onnxruntime.ai/docs/execution-providers/EP-Context-Design.html">EPContext design document.</a>
+    * \endhtmlonly
+    *
+    * \param[in] this_ptr The OrtEp instance.
+    * \param[in] graphs Array of `count` OrtGraph instances to compile. Each graph contains only the nodes for
+    *                   which the execution provider indicated support. Nested subgraphs contained by a
+    *                   node, such as an If or Loop, have separate OrtGraph instances.
+    * \param[in] fused_nodes Array of `count` fused nodes that will replace the compiled graphs.
+    *                        Each fused node is an OrtNode initialized with the intended fused node name and
+    *                        input/output information.
+    * \param[in] count The number of OrtGraph instances to compile.
+    * \param[out] node_compute_infos Array of `count` OrtNodeComputeInfo instances that define each OrtGraph
+    *                                instance's computation function. The implementer allocates the
+    *                                OrtNodeComputeInfo instances. ORT calls ReleaseNodeComputeInfos() to release
+    *                                multiple instances in a batch.
+    * \param[out] ep_context_nodes Output array of `count` OrtNode instances, each representing an EPContext
+    *                              node for a compiled OrtGraph. The execution provider must use
+    *                              OrtModelEditorApi::CreateNode to create the OrtNode instances. ONNX Runtime
+    *                              takes ownership of the OrtNode instances, so the execution provider must NOT
+    *                              call OrtApi::ReleaseNode. Should be ignored if the session is not configured
+    *                              to generate an EPContext model.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \note Do NOT cache the provided OrtGraph instances in any of the OrtNodeComputeInfo functions because the
+    *       graphs are only valid for the duration of the call to Compile. Any graph/node/input/output
+    *       names that are needed by the OrtNodeComputeInfo functions must be copied and stored by the OrtEp.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(Compile, _In_ OrtEp* this_ptr, _In_ const OrtGraph** graphs,
+                   _In_ const OrtNode** fused_nodes, _In_ size_t count,
+                   _Out_writes_all_(count) OrtNodeComputeInfo** node_compute_infos,
+                   _Out_writes_(count) OrtNode** ep_context_nodes);
+
+   /** \brief Release OrtNodeComputeInfo instances.
+    *
+    * \param[in] this_ptr The OrtEp instance.
+    * \param[inout] node_compute_infos The OrtNodeComputeInfo instances to release.
+    * \param[in] num_node_compute_infos The number of OrtNodeComputeInfo instances.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(void, ReleaseNodeComputeInfos, _In_ OrtEp* this_ptr,
+             OrtNodeComputeInfo** node_compute_infos,
+             _In_ size_t num_node_compute_infos);
+
+   /** \brief Get the EP's preferred data layout.
+    *
+    * \note Implementation of this function is optional.
+    *       If not implemented, ORT will assume that this EP prefers the data layout `OrtEpDataLayout_NCHW`.
+    *
+    * \param[in] this_ptr The OrtEp instance.
+    * \param[out] preferred_data_layout The EP's preferred data layout.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(GetPreferredDataLayout, _In_ OrtEp* this_ptr, _Out_ OrtEpDataLayout* preferred_data_layout);
+
+   /** \brief Given an op with domain `domain` and type `op_type`, determine whether an associated node's data
+    *         layout should be converted to `target_data_layout`.
+    *         If the EP prefers a non-default data layout (see `GetPreferredDataLayout()`), this function will be
+    *         called during layout transformation with `target_data_layout` set to the EP's preferred data layout.
+    *
+    * \note Implementation of this function is optional.
+    *       If an EP prefers a non-default data layout, it may implement this to customize the specific op data
+    *       layout preferences at a finer granularity.
+    *
+    * \param[in] this_ptr The OrtEp instance.
+    * \param[in] domain The op domain. An empty string means the ONNX domain.
+    * \param[in] op_type The op type.
+    * \param[in] target_data_layout The target data layout.
+    * \param[out] should_convert Whether the associated node's data layout should be converted to
+    *                            `target_data_layout`.
+    *                            If greater than 0, convert.
+    *                            If 0, don't convert.
+    *                            Otherwise, if less than 0, leave the decision to ORT.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(ShouldConvertDataLayoutForOp, _In_ OrtEp* this_ptr,
+                   _In_z_ const char* domain, _In_z_ const char* op_type,
+                   _In_ OrtEpDataLayout target_data_layout,
+                   _Outptr_ int* should_convert);
+
+   /** \brief Set dynamic options on this EP.
+    *
+    * Dynamic options can be set by the user at any time after session creation with
+    * `OrtApi::SetEpDynamicOptions()`.
+    *
+    * \param[in] this_ptr The OrtEp instance.
+    * \param[in] option_keys The dynamic option keys.
+    * \param[in] option_values The dynamic option values.
+    * \param[in] num_options The number of dynamic options.
+    *
+    * \note Implementation of this function is optional.
+    *       An EP should only implement this if it needs to handle any dynamic options.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(SetDynamicOptions, _In_ OrtEp* this_ptr,
+                   _In_reads_(num_options) const char* const* option_keys,
+                   _In_reads_(num_options) const char* const* option_values,
+                   _In_ size_t num_options);
+
+   /** \brief Called by ORT to notify the EP of the start of a run.
+    *
+    * \param[in] this_ptr The OrtEp instance.
+    * \param[in] run_options The run options for this run.
+    *
+    * \note Implementation of this function is optional.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(OnRunStart, _In_ OrtEp* this_ptr, _In_ const OrtRunOptions* run_options);
+
+   /** \brief Called by ORT to notify the EP of the end of a run.
+    *
+    * \param[in] this_ptr The OrtEp instance.
+    * \param[in] run_options The run options for this run.
+    * \param[in] sync_stream Whether any associated stream should be synchronized during this call.
+    *                        Only applicable if there is such a stream.
+    *
+    * \note Implementation of this function is optional.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(OnRunEnd, _In_ OrtEp* this_ptr, _In_ const OrtRunOptions* run_options, _In_ bool sync_stream);
+
+   /** \brief Create an OrtAllocator for the given OrtMemoryInfo for an OrtSession.
+    *
+    * The OrtMemoryInfo instance will match one of the values set in the OrtEpDevice using
+    * EpDevice_AddAllocatorInfo. Any allocator specific options should be read from the session options.
+    *
+    * If this function is nullptr, OrtEpFactory::CreateAllocator will be used.
+    *
+    * \param[in] this_ptr The OrtEp instance.
+    * \param[in] memory_info The OrtMemoryInfo to create the allocator for. May be nullptr.
+    * \param[out] allocator The created OrtAllocator instance. Set to nullptr if the default CPU allocator is used.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(CreateAllocator, _In_ OrtEp* this_ptr,
+                   _In_ const OrtMemoryInfo* memory_info,
+                   _Outptr_result_maybenull_ OrtAllocator** allocator);
+
+   /** \brief Create a synchronization stream for the given memory device for an OrtSession.
+    *
+    * This is used to create a synchronization stream for the execution provider and is used to synchronize
+    * operations on the device during model execution.
+    * Any stream specific options should be read from the session options.
+    *
+    * If this function is nullptr, OrtEpFactory::CreateSyncStreamForDevice will be used.
+    *
+    * \param[in] this_ptr The OrtEp instance.
+    * \param[in] memory_device The OrtMemoryDevice to create the synchronization stream for.
+    * \param[out] stream The created OrtSyncStreamImpl instance. nullptr if the execution provider is not
+    *                    stream aware.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(CreateSyncStreamForDevice, _In_ OrtEp* this_ptr,
+                   _In_ const OrtMemoryDevice* memory_device,
+                   _Outptr_ OrtSyncStreamImpl** stream);
+
+   /** \brief Get a string with details about the EP stack used to produce a compiled model.
+    *
+    * This function gets a compatibility information string that contains details about the execution provider
+    * used to compile a given model. This string can later be used with ValidateCompiledModelCompatibilityInfo
+    * to determine if a compiled model is compatible with the EP.
+    *
+    * The returned string should be a null-terminated, UTF-8 encoded string. ORT will copy it.
+    *
+    * \param[in] this_ptr The OrtEp instance.
+    * \param[in] graph The OrtGraph instance for which to generate compatibility information.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(const char*, GetCompiledModelCompatibilityInfo, _In_ OrtEp* this_ptr,
+             _In_ const OrtGraph* graph);
+ };
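A hedged example of the per-op layout hook for an NHWC-preferring EP (the function name and op choices are illustrative only):

    #include <cstring>

    static OrtStatus* ORT_API_CALL MyShouldConvertDataLayoutForOp(OrtEp* /*this_ptr*/,
                                                                  const char* domain, const char* op_type,
                                                                  OrtEpDataLayout target_data_layout,
                                                                  int* should_convert) {
      if (target_data_layout == OrtEpDataLayout_NHWC &&
          std::strcmp(domain, "") == 0 && std::strcmp(op_type, "Conv") == 0) {
        *should_convert = 1;   // convert Conv nodes to NHWC
      } else {
        *should_convert = -1;  // leave the decision to ORT for everything else
      }
      return nullptr;
    }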
+
+ /** \brief The function signature that ORT will call to create OrtEpFactory instances.
+  *
+  * This must be available in a function called 'CreateEpFactories' in the execution provider library.
+  *
+  * \param[in] registered_name The name the execution provider library is registered with by
+  *                            RegisterExecutionProviderLibrary.
+  * \param[in] ort_api_base The OrtApiBase instance that is used by the factory to get the OrtApi instance for
+  *                         the version of ORT that the library was compiled against.
+  * \param[in] default_logger The default ORT logger that can be used for logging outside of an inference session.
+  * \param[in,out] factories The implementation should create and add OrtEpFactory instances to this
+  *                          pre-allocated array.
+  *                          i.e. usage is `factories[0] = new MyEpFactory();`
+  * \param[in] max_factories The maximum number of OrtEpFactory instances that can be added to `factories`.
+  *                          The current default is to allow 4 factories. This can be increased in the future
+  *                          if needed.
+  * \param[out] num_factories The number of OrtEpFactory instances created and added to `factories`.
+  *
+  * \snippet{doc} snippets.dox OrtStatus Return Value
+  *
+  * \since Version 1.22.
+  */
+ typedef OrtStatus* (*CreateEpApiFactoriesFn)(_In_ const char* registered_name, _In_ const OrtApiBase* ort_api_base,
+                                              _In_ const OrtLogger* default_logger,
+                                              _Inout_ OrtEpFactory** factories, _In_ size_t max_factories,
+                                              _Out_ size_t* num_factories);
+
+ /** \brief The function signature that ORT will call to release an OrtEpFactory instance.
+  *
+  * This must be available in a function called 'ReleaseEpFactory' in the execution provider library.
+  *
+  * \param[in] factory The OrtEpFactory instance to release.
+  *
+  * \snippet{doc} snippets.dox OrtStatus Return Value
+  *
+  * \since Version 1.22.
+  */
+ typedef OrtStatus* (*ReleaseEpApiFactoryFn)(_In_ OrtEpFactory* factory);
+
+ /**
+  * \brief The OrtEpFactory provides functions to create and manage execution providers.
+  * \since Version 1.22.
+  */
+ struct OrtEpFactory {
+   /** \brief The ONNX Runtime version the execution provider was compiled with.
+    *
+    * Implementation should set to ORT_API_VERSION.
+    * ORT will use this to ensure it does not call functions that were not available when the library was compiled.
+    *
+    * \since Version 1.22.
+    */
+   uint32_t ort_version_supported;
+
+   /** \brief Get the name of the execution provider that the factory creates.
+    *
+    * The returned string should be a null-terminated, UTF-8 encoded string. ORT will copy it.
+    *
+    * \param[in] this_ptr The OrtEpFactory instance.
+    * \return The name of the execution provider the factory creates.
+    *
+    * \since Version 1.22.
+    */
+   ORT_API_T(const char*, GetName, const OrtEpFactory* this_ptr);
+
+   /** \brief Get the name of the vendor who owns the execution provider that the factory creates.
+    *
+    * The returned string should be a null-terminated, UTF-8 encoded string. ORT will copy it.
+    *
+    * \param[in] this_ptr The OrtEpFactory instance.
+    * \return The vendor name of the execution provider the factory creates.
+    *
+    * \since Version 1.22.
+    */
+   ORT_API_T(const char*, GetVendor, const OrtEpFactory* this_ptr);  // return EP vendor
+
+   /** \brief Get information from the execution provider about OrtHardwareDevice support.
+    *
+    * \param[in] this_ptr The OrtEpFactory instance.
+    *                     Non-const as the factory is passed through to the CreateEp call via the OrtEpDevice.
+    * \param[in] devices The OrtHardwareDevice instances that are available.
+    * \param[in] num_devices The number of OrtHardwareDevice instances.
+    * \param[out] ep_devices OrtEpDevice instances for each OrtHardwareDevice that the EP can use.
+    *                        The implementation should call OrtEpApi::CreateEpDevice to create, and add the
+    *                        OrtEpDevice instances to this pre-allocated array. ORT will take ownership of the
+    *                        values returned.
+    *                        i.e. usage is `ep_devices[0] = <ptr to OrtEpDevice created with OrtEpApi::CreateEpDevice>;`
+    * \param[in] max_ep_devices The maximum number of OrtEpDevices that can be added to ep_devices.
+    *                           The current default is 8. This can be increased if needed.
+    * \param[out] num_ep_devices The number of EP devices added to ep_devices.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.22.
+    */
+   ORT_API2_STATUS(GetSupportedDevices, _In_ OrtEpFactory* this_ptr,
+                   _In_reads_(num_devices) const OrtHardwareDevice* const* devices,
+                   _In_ size_t num_devices,
+                   _Inout_ OrtEpDevice** ep_devices,
+                   _In_ size_t max_ep_devices,
+                   _Out_ size_t* num_ep_devices);
+
+   /** \brief Function to create an OrtEp instance for use in a Session.
+    *
+    * ORT will call ReleaseEp to release the instance when it is no longer needed.
+    *
+    * \param[in] this_ptr The OrtEpFactory instance.
+    * \param[in] devices The OrtHardwareDevice instances that the execution provider was selected to use.
+    *                    May be a subset of the OrtHardwareDevice instances that the execution provider's factory
+    *                    set as supported in the call to OrtEpFactory::GetSupportedDevices.
+    * \param[in] ep_metadata_pairs Execution provider metadata that was provided to OrtEpApi::CreateEpDevice,
+    *                              for each device.
+    * \param[in] num_devices The number of devices the execution provider was selected for.
+    * \param[in] session_options The OrtSessionOptions instance that contains the configuration options for the
+    *                            session. This will include ep_options from GetSupportedDevices as well as any
+    *                            user provided overrides.
+    *                            Execution provider options will have been added with a prefix of 'ep.[ep name].'.
+    *                            The OrtSessionOptions instance will NOT be valid after this call and should not
+    *                            be stored for later use.
+    * \param[in] logger The OrtLogger instance for the session that the execution provider should use for logging.
+    * \param[out] ep The OrtEp instance created by the factory.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.22.
+    */
+   ORT_API2_STATUS(CreateEp, _In_ OrtEpFactory* this_ptr,
+                   _In_reads_(num_devices) const OrtHardwareDevice* const* devices,
+                   _In_reads_(num_devices) const OrtKeyValuePairs* const* ep_metadata_pairs,
+                   _In_ size_t num_devices,
+                   _In_ const OrtSessionOptions* session_options,
+                   _In_ const OrtLogger* logger, _Outptr_ OrtEp** ep);
+
+   /** \brief Release the OrtEp instance.
+    *
+    * \param[in] this_ptr The OrtEpFactory instance.
+    * \param[in] ep The OrtEp instance to release.
+    *
+    * \since Version 1.22.
+    */
+   ORT_API_T(void, ReleaseEp, OrtEpFactory* this_ptr, struct OrtEp* ep);
+
+   /** \brief Get the ID of the vendor who owns the execution provider that the factory creates.
+    *
+    * This is typically the PCI vendor ID. See https://pcisig.com/membership/member-companies
+    *
+    * \param[in] this_ptr The OrtEpFactory instance.
+    * \return The vendor ID of the execution provider the factory creates.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(uint32_t, GetVendorId, const OrtEpFactory* this_ptr);
+
+   /** \brief Get the version of the execution provider that the factory creates.
+    *
+    * The version string should adhere to the Semantic Versioning 2.0 specification
+    * (https://github.com/semver/semver/blob/v2.0.0/semver.md).
+    *
+    * The returned string should be a null-terminated, UTF-8 encoded string. ORT will copy it.
+    *
+    * \param[in] this_ptr The OrtEpFactory instance.
+    * \return The execution provider version string.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(const char*, GetVersion, _In_ const OrtEpFactory* this_ptr);
+
+   /** \brief Validate the compatibility of a compiled model with the execution provider factory for one or more
+    *         devices.
+    *
+    * Given a compatibility info string produced during model compilation, the EP factory should determine whether
+    * the compiled model is compatible with the EP factory when targeting the provided hardware devices. All
+    * devices provided must belong to the same execution provider instance that this factory creates.
+    *
+    * The EP factory implementation should consider the set of devices (e.g., multi-adapter or multi-GPU
+    * scenarios) when evaluating compatibility and set `model_compatibility` accordingly.
+    *
+    * \param[in] this_ptr The OrtEpFactory instance.
+    * \param[in] devices Array of OrtHardwareDevice pointers that the EP would run on. All must map to this EP.
+    * \param[in] num_devices Number of entries in `devices`.
+    * \param[in] compatibility_info The compatibility information string produced when the model was compiled.
+    * \param[out] model_compatibility OrtCompiledModelCompatibility value describing the compatibility of the
+    *                                 model with the EP.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(ValidateCompiledModelCompatibilityInfo, _In_ OrtEpFactory* this_ptr,
+                   _In_reads_(num_devices) const OrtHardwareDevice* const* devices,
+                   _In_ size_t num_devices,
+                   _In_ const char* compatibility_info,
+                   _Out_ OrtCompiledModelCompatibility* model_compatibility);
+
+   /** \brief Create an OrtAllocator that can be shared across sessions for the given OrtMemoryInfo.
+    *
+    * The factory that creates the EP is responsible for providing the allocators required by the EP.
+    * The OrtMemoryInfo instance will match one of the values set in the OrtEpDevice using
+    * EpDevice_AddAllocatorInfo.
+    *
+    * \param[in] this_ptr The OrtEpFactory instance.
+    * \param[in] memory_info The OrtMemoryInfo to create the allocator for. May be nullptr.
+    * \param[in] allocator_options Optional key-value pairs for allocator options, can be nullptr.
+    * \param[out] allocator The created OrtAllocator instance. Set to nullptr if the default CPU allocator is used.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(CreateAllocator, _In_ OrtEpFactory* this_ptr,
+                   _In_ const OrtMemoryInfo* memory_info,
+                   _In_opt_ const OrtKeyValuePairs* allocator_options,
+                   _Outptr_result_maybenull_ OrtAllocator** allocator);
+
+   /** \brief Release an OrtAllocator created by the factory.
+    *
+    * \param[in] this_ptr The OrtEpFactory instance.
+    * \param[in] allocator The OrtAllocator instance to release.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(void, ReleaseAllocator, _In_ OrtEpFactory* this_ptr, _In_ OrtAllocator* allocator);
+
+   /** \brief Create an OrtDataTransferImpl instance for the factory.
+    *
+    * This is used to create an IDataTransfer implementation that can be used to copy data between devices
+    * that the execution provider supports.
+    *
+    * \param[in] this_ptr The OrtEpFactory instance.
+    * \param[out] data_transfer The created OrtDataTransferImpl instance. Set to nullptr if not required.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(CreateDataTransfer, _In_ OrtEpFactory* this_ptr,
+                   _Outptr_result_maybenull_ OrtDataTransferImpl** data_transfer);
+
+   /** \brief Check if execution providers created by the factory are stream aware.
+    *
+    * \param[in] this_ptr The OrtEpFactory instance.
+    * \return True if the factory creates execution providers that are stream aware and it implements
+    *         CreateSyncStreamForDevice.
+    *
+    * \since Version 1.23.
+    */
+   ORT_API_T(bool, IsStreamAware, _In_ const OrtEpFactory* this_ptr);
+
+   /** \brief Create a synchronization stream for the given memory device.
+    *
+    * This is used to create a synchronization stream for the memory device that can be used for operations
+    * outside of a session.
+    *
+    * \param[in] this_ptr The OrtEpFactory instance.
+    * \param[in] memory_device The OrtMemoryDevice to create the synchronization stream for.
+    * \param[in] stream_options Options for stream creation. May be nullptr.
+    * \param[out] stream The created OrtSyncStreamImpl instance. nullptr if the execution provider is not
+    *                    stream aware.
+    *
+    * \snippet{doc} snippets.dox OrtStatus Return Value
+    *
+    * \since Version 1.23.
+    */
+   ORT_API2_STATUS(CreateSyncStreamForDevice, _In_ OrtEpFactory* this_ptr,
+                   _In_ const OrtMemoryDevice* memory_device,
+                   _In_opt_ const OrtKeyValuePairs* stream_options,
+                   _Outptr_ OrtSyncStreamImpl** stream);
+ };
+
+ #ifdef __cplusplus
+ }
+ #endif
v1.23.1/headers/onnxruntime_ep_device_ep_metadata_keys.h ADDED
@@ -0,0 +1,18 @@
+ // Copyright (c) Microsoft Corporation. All rights reserved.
+ // Licensed under the MIT License.
+
+ #pragma once
+
+ // This file contains well-known keys for OrtEpDevice EP metadata entries.
+ // It does NOT specify all available metadata keys.
+
+ // Key for the execution provider version string. This should be available for all plugin EPs.
+ static const char* const kOrtEpDevice_EpMetadataKey_Version = "version";
+
+ // Prefix for execution provider compatibility information stored in model metadata.
+ // Used when generating EP context models to store compatibility strings for each EP.
+ // Full key format: "ep_compatibility_info.<EP_TYPE>"
+ static const char* const kOrtModelMetadata_EpCompatibilityInfoPrefix = "ep_compatibility_info.";
+
+ // Key for the execution provider library path (for dynamically loaded EPs).
+ static const char* const kOrtEpDevice_EpMetadataKey_LibraryPath = "library_path";
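A small illustration (not part of the diff) of composing the full model-metadata key, where "MyEp" stands in for an actual EP type:

    #include <string>

    // Per the comment above, the full key format is "ep_compatibility_info.<EP_TYPE>".
    std::string key = std::string(kOrtModelMetadata_EpCompatibilityInfoPrefix) + "MyEp";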
v1.23.1/headers/onnxruntime_float16.h ADDED
@@ -0,0 +1,535 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) Microsoft Corporation. All rights reserved.
2
+ // Licensed under the MIT License.
3
+
4
+ #pragma once
5
+
6
+ #include <stdint.h>
7
+ #include <cmath>
8
+ #include <cstring>
9
+ #include <limits>
10
+
11
+ namespace onnxruntime_float16 {
12
+
13
+ namespace detail {
14
+
15
+ enum class endian {
16
+ #if defined(_WIN32)
17
+ little = 0,
18
+ big = 1,
19
+ native = little,
20
+ #elif defined(__GNUC__) || defined(__clang__)
21
+ little = __ORDER_LITTLE_ENDIAN__,
22
+ big = __ORDER_BIG_ENDIAN__,
23
+ native = __BYTE_ORDER__,
24
+ #else
25
+ #error onnxruntime_float16::detail::endian is not implemented in this environment.
26
+ #endif
27
+ };
28
+
29
+ static_assert(
30
+ endian::native == endian::little || endian::native == endian::big,
31
+ "Only little-endian or big-endian native byte orders are supported.");
32
+
33
+ } // namespace detail
34
+
35
+ /// <summary>
36
+ /// Shared implementation between public and internal classes. CRTP pattern.
37
+ /// </summary>
38
+ template <class Derived>
39
+ struct Float16Impl {
40
+ protected:
41
+ /// <summary>
42
+ /// Converts from float to uint16_t float16 representation
43
+ /// </summary>
44
+ /// <param name="v"></param>
45
+ /// <returns></returns>
46
+ constexpr static uint16_t ToUint16Impl(float v) noexcept;
47
+
48
+ /// <summary>
49
+ /// Converts float16 to float
50
+ /// </summary>
51
+ /// <returns>float representation of float16 value</returns>
52
+ float ToFloatImpl() const noexcept;
53
+
54
+ /// <summary>
55
+ /// Creates an instance that represents absolute value.
56
+ /// </summary>
57
+ /// <returns>Absolute value</returns>
58
+ uint16_t AbsImpl() const noexcept {
59
+ return static_cast<uint16_t>(val & ~kSignMask);
60
+ }
61
+
62
+ /// <summary>
63
+ /// Creates a new instance with the sign flipped.
64
+ /// </summary>
65
+ /// <returns>Flipped sign instance</returns>
66
+ uint16_t NegateImpl() const noexcept {
67
+ return IsNaN() ? val : static_cast<uint16_t>(val ^ kSignMask);
68
+ }
69
+
70
+ public:
71
+ // uint16_t special values
72
+ static constexpr uint16_t kSignMask = 0x8000U;
73
+ static constexpr uint16_t kBiasedExponentMask = 0x7C00U;
74
+ static constexpr uint16_t kPositiveInfinityBits = 0x7C00U;
75
+ static constexpr uint16_t kNegativeInfinityBits = 0xFC00U;
76
+ static constexpr uint16_t kPositiveQNaNBits = 0x7E00U;
77
+ static constexpr uint16_t kNegativeQNaNBits = 0xFE00U;
78
+ static constexpr uint16_t kMaxValueBits = 0x7BFFU; // Largest normal number
79
+ static constexpr uint16_t kOneBits = 0x3C00U;
80
+ static constexpr uint16_t kMinusOneBits = 0xBC00U;
81
+
82
+ uint16_t val{0};
83
+
84
+ Float16Impl() = default;
85
+
86
+ /// <summary>
87
+ /// Checks if the value is negative
88
+ /// </summary>
89
+ /// <returns>true if negative</returns>
90
+ bool IsNegative() const noexcept {
91
+ return static_cast<int16_t>(val) < 0;
92
+ }
93
+
94
+ /// <summary>
95
+ /// Tests if the value is NaN
96
+ /// </summary>
97
+ /// <returns>true if NaN</returns>
98
+ bool IsNaN() const noexcept {
99
+ return AbsImpl() > kPositiveInfinityBits;
100
+ }
101
+
102
+ /// <summary>
103
+ /// Tests if the value is finite
104
+ /// </summary>
105
+ /// <returns>true if finite</returns>
106
+ bool IsFinite() const noexcept {
107
+ return AbsImpl() < kPositiveInfinityBits;
108
+ }
109
+
110
+ /// <summary>
111
+ /// Tests if the value represents positive infinity.
112
+ /// </summary>
113
+ /// <returns>true if positive infinity</returns>
114
+ bool IsPositiveInfinity() const noexcept {
115
+ return val == kPositiveInfinityBits;
116
+ }
117
+
118
+ /// <summary>
119
+ /// Tests if the value represents negative infinity
120
+ /// </summary>
121
+ /// <returns>true if negative infinity</returns>
122
+ bool IsNegativeInfinity() const noexcept {
123
+ return val == kNegativeInfinityBits;
124
+ }
125
+
126
+ /// <summary>
127
+ /// Tests if the value is either positive or negative infinity.
128
+ /// </summary>
129
+ /// <returns>True if absolute value is infinity</returns>
130
+ bool IsInfinity() const noexcept {
131
+ return AbsImpl() == kPositiveInfinityBits;
132
+ }
133
+
134
+ /// <summary>
135
+ /// Tests if the value is NaN or zero. Useful for comparisons.
136
+ /// </summary>
137
+ /// <returns>True if NaN or zero.</returns>
138
+ bool IsNaNOrZero() const noexcept {
139
+ auto abs = AbsImpl();
140
+ return (abs == 0 || abs > kPositiveInfinityBits);
141
+ }
142
+
143
+ /// <summary>
144
+ /// Tests if the value is normal (not zero, subnormal, infinite, or NaN).
145
+ /// </summary>
146
+ /// <returns>True if so</returns>
147
+ bool IsNormal() const noexcept {
148
+ auto abs = AbsImpl();
149
+ return (abs < kPositiveInfinityBits) // is finite
150
+ && (abs != 0) // is not zero
151
+ && ((abs & kBiasedExponentMask) != 0); // is not subnormal (has a non-zero exponent)
152
+ }
153
+
154
+ /// <summary>
155
+ /// Tests if the value is subnormal (denormal).
156
+ /// </summary>
157
+ /// <returns>True if so</returns>
158
+ bool IsSubnormal() const noexcept {
159
+ auto abs = AbsImpl();
160
+ return (abs < kPositiveInfinityBits) // is finite
161
+ && (abs != 0) // is not zero
162
+ && ((abs & kBiasedExponentMask) == 0); // is subnormal (has a zero exponent)
163
+ }
164
+
165
+ /// <summary>
166
+ /// Creates an instance that represents absolute value.
167
+ /// </summary>
168
+ /// <returns>Absolute value</returns>
169
+ Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); }
170
+
171
+ /// <summary>
172
+ /// Creates a new instance with the sign flipped.
173
+ /// </summary>
174
+ /// <returns>Flipped sign instance</returns>
175
+ Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); }
176
+
177
+ /// <summary>
178
+ /// IEEE defines that positive and negative zero are equal; this gives us a quick equality check
179
+ /// for two values by or'ing the private bits together and stripping the sign. They are both zero,
180
+ /// and therefore equivalent, if the resulting value is still zero.
181
+ /// </summary>
182
+ /// <param name="lhs">first value</param>
183
+ /// <param name="rhs">second value</param>
184
+ /// <returns>True if both arguments represent zero</returns>
185
+ static bool AreZero(const Float16Impl& lhs, const Float16Impl& rhs) noexcept {
186
+ return static_cast<uint16_t>((lhs.val | rhs.val) & ~kSignMask) == 0;
187
+ }
188
+
189
+ bool operator==(const Float16Impl& rhs) const noexcept {
190
+ if (IsNaN() || rhs.IsNaN()) {
191
+ // IEEE defines that NaN is not equal to anything, including itself.
192
+ return false;
193
+ }
194
+ return val == rhs.val;
195
+ }
196
+
197
+ bool operator!=(const Float16Impl& rhs) const noexcept { return !(*this == rhs); }
198
+
199
+ bool operator<(const Float16Impl& rhs) const noexcept {
200
+ if (IsNaN() || rhs.IsNaN()) {
201
+ // IEEE defines that NaN is unordered with respect to everything, including itself.
202
+ return false;
203
+ }
204
+
205
+ const bool left_is_negative = IsNegative();
206
+ if (left_is_negative != rhs.IsNegative()) {
207
+ // When the signs of left and right differ, we know that left is less than right if it is
208
+ // the negative value. The exception to this is if both values are zero, in which case IEEE
209
+ // says they should be equal, even if the signs differ.
210
+ return left_is_negative && !AreZero(*this, rhs);
211
+ }
212
+ return (val != rhs.val) && ((val < rhs.val) ^ left_is_negative);
213
+ }
214
+ };
215
+
216
+ // The following Float16_t conversions are based on the code from
217
+ // the Eigen library.
218
+
219
+ // The conversion routines are Copyright (c) Fabian Giesen, 2016.
220
+ // The original license follows:
221
+ //
222
+ // Copyright (c) Fabian Giesen, 2016
223
+ // All rights reserved.
224
+ // Redistribution and use in source and binary forms, with or without
225
+ // modification, are permitted.
226
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
227
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
228
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
229
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
230
+ // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
231
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
232
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
233
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
234
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
235
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
236
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
237
+
238
+ namespace detail {
239
+ union float32_bits {
240
+ unsigned int u;
241
+ float f;
242
+ };
243
+ } // namespace detail
244
+
245
+ template <class Derived>
246
+ inline constexpr uint16_t Float16Impl<Derived>::ToUint16Impl(float v) noexcept {
247
+ detail::float32_bits f{};
248
+ f.f = v;
249
+
250
+ constexpr detail::float32_bits f32infty = {255 << 23};
251
+ constexpr detail::float32_bits f16max = {(127 + 16) << 23};
252
+ constexpr detail::float32_bits denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23};
253
+ constexpr unsigned int sign_mask = 0x80000000u;
254
+ uint16_t val = static_cast<uint16_t>(0x0u);
255
+
256
+ unsigned int sign = f.u & sign_mask;
257
+ f.u ^= sign;
258
+
259
+ // NOTE all the integer compares in this function can be safely
260
+ // compiled into signed compares since all operands are below
261
+ // 0x80000000. Important if you want fast straight SSE2 code
262
+ // (since there's no unsigned PCMPGTD).
263
+
264
+ if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
265
+ val = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
266
+ } else { // (De)normalized number or zero
267
+ if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero
268
+ // use a magic value to align our 10 mantissa bits at the bottom of
269
+ // the float. as long as FP addition is round-to-nearest-even this
270
+ // just works.
271
+ f.f += denorm_magic.f;
272
+
273
+ // and one integer subtract of the bias later, we have our final half-precision bits!
274
+ val = static_cast<uint16_t>(f.u - denorm_magic.u);
275
+ } else {
276
+ unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
277
+
278
+ // update exponent, rounding bias part 1
279
+ // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but
280
+ // without arithmetic overflow.
281
+ f.u += 0xc8000fffU;
282
+ // rounding bias part 2
283
+ f.u += mant_odd;
284
+ // take the bits!
285
+ val = static_cast<uint16_t>(f.u >> 13);
286
+ }
287
+ }
288
+
289
+ val |= static_cast<uint16_t>(sign >> 16);
290
+ return val;
291
+ }
292
+
293
+ template <class Derived>
294
+ inline float Float16Impl<Derived>::ToFloatImpl() const noexcept {
295
+ constexpr detail::float32_bits magic = {113 << 23};
296
+ constexpr unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
297
+ detail::float32_bits o{};
298
+
299
+ o.u = (val & 0x7fff) << 13; // exponent/mantissa bits
300
+ unsigned int exp = shifted_exp & o.u; // just the exponent
301
+ o.u += (127 - 15) << 23; // exponent adjust
302
+
303
+ // handle exponent special cases
304
+ if (exp == shifted_exp) { // Inf/NaN?
305
+ o.u += (128 - 16) << 23; // extra exp adjust
306
+ } else if (exp == 0) { // Zero/Denormal?
307
+ o.u += 1 << 23; // extra exp adjust
308
+ o.f -= magic.f; // re-normalize
309
+ }
310
+
311
+ // Attempt to work around the Internal Compiler Error on ARM64
312
+ // for bitwise | operator, including std::bitset
313
+ #if (defined _MSC_VER) && (defined _M_ARM || defined _M_ARM64 || defined _M_ARM64EC)
314
+ if (IsNegative()) {
315
+ return -o.f;
316
+ }
317
+ #else
318
+ // original code:
319
+ o.u |= (val & 0x8000U) << 16U; // sign bit
320
+ #endif
321
+ return o.f;
322
+ }
323
+
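
A minimal sketch (not part of the diff) of how the conversion routines above behave when driven through a concrete CRTP derivative. MyFloat16 and its FromFloat/ToFloat shims are hypothetical; in the real API, Ort::Float16_t from onnxruntime_cxx_api.h plays this role.

#include <cassert>
#include <cstdint>
#include "onnxruntime_float16.h"

// Hypothetical CRTP derivative; Float16Impl expects the derived type to
// provide FromBits (it is used by Abs() and Negate() above).
struct MyFloat16 : onnxruntime_float16::Float16Impl<MyFloat16> {
  static MyFloat16 FromBits(uint16_t bits) noexcept {
    MyFloat16 f;
    f.val = bits;
    return f;
  }
  // The conversion helpers are protected in the base, so re-expose them here.
  static MyFloat16 FromFloat(float v) noexcept { return FromBits(ToUint16Impl(v)); }
  float ToFloat() const noexcept { return ToFloatImpl(); }
};

int main() {
  MyFloat16 h = MyFloat16::FromFloat(1.5f);
  assert(h.val == 0x3E00);                           // 0 01111 1000000000
  assert(h.ToFloat() == 1.5f);                       // 1.5 is exactly representable
  assert(MyFloat16::FromBits(0x7E00).IsNaN());       // kPositiveQNaNBits
  assert(MyFloat16::FromBits(0x7C00).IsInfinity());  // kPositiveInfinityBits
  return 0;
}
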
324
+ /// Shared implementation between public and internal classes. CRTP pattern.
325
+ template <class Derived>
326
+ struct BFloat16Impl {
327
+ protected:
328
+ /// <summary>
329
+ /// Converts from float to uint16_t bfloat16 representation
330
+ /// </summary>
331
+ /// <param name="v">Single-precision value to convert.</param>
332
+ /// <returns>The bfloat16 bits of the rounded value.</returns>
333
+ static uint16_t ToUint16Impl(float v) noexcept;
334
+
335
+ /// <summary>
336
+ /// Converts bfloat16 to float
337
+ /// </summary>
338
+ /// <returns>float representation of bfloat16 value</returns>
339
+ float ToFloatImpl() const noexcept;
340
+
341
+ /// <summary>
342
+ /// Creates an instance that represents absolute value.
343
+ /// </summary>
344
+ /// <returns>Absolute value</returns>
345
+ uint16_t AbsImpl() const noexcept {
346
+ return static_cast<uint16_t>(val & ~kSignMask);
347
+ }
348
+
349
+ /// <summary>
350
+ /// Creates a new instance with the sign flipped.
351
+ /// </summary>
352
+ /// <returns>Flipped sign instance</returns>
353
+ uint16_t NegateImpl() const noexcept {
354
+ return IsNaN() ? val : static_cast<uint16_t>(val ^ kSignMask);
355
+ }
356
+
357
+ public:
358
+ // uint16_t special values
359
+ static constexpr uint16_t kSignMask = 0x8000U;
360
+ static constexpr uint16_t kBiasedExponentMask = 0x7F80U;
361
+ static constexpr uint16_t kPositiveInfinityBits = 0x7F80U;
362
+ static constexpr uint16_t kNegativeInfinityBits = 0xFF80U;
363
+ static constexpr uint16_t kPositiveQNaNBits = 0x7FC1U;
364
+ static constexpr uint16_t kNegativeQNaNBits = 0xFFC1U;
365
+ static constexpr uint16_t kMaxValueBits = 0x7F7FU;
366
+ static constexpr uint16_t kRoundToNearest = 0x7FFFU;
367
+ static constexpr uint16_t kOneBits = 0x3F80U;
368
+ static constexpr uint16_t kMinusOneBits = 0xBF80U;
369
+
370
+ uint16_t val{0};
371
+
372
+ BFloat16Impl() = default;
373
+
374
+ /// <summary>
375
+ /// Checks if the value is negative
376
+ /// </summary>
377
+ /// <returns>true if negative</returns>
378
+ bool IsNegative() const noexcept {
379
+ return static_cast<int16_t>(val) < 0;
380
+ }
381
+
382
+ /// <summary>
383
+ /// Tests if the value is NaN
384
+ /// </summary>
385
+ /// <returns>true if NaN</returns>
386
+ bool IsNaN() const noexcept {
387
+ return AbsImpl() > kPositiveInfinityBits;
388
+ }
389
+
390
+ /// <summary>
391
+ /// Tests if the value is finite
392
+ /// </summary>
393
+ /// <returns>true if finite</returns>
394
+ bool IsFinite() const noexcept {
395
+ return AbsImpl() < kPositiveInfinityBits;
396
+ }
397
+
398
+ /// <summary>
399
+ /// Tests if the value represents positive infinity.
400
+ /// </summary>
401
+ /// <returns>true if positive infinity</returns>
402
+ bool IsPositiveInfinity() const noexcept {
403
+ return val == kPositiveInfinityBits;
404
+ }
405
+
406
+ /// <summary>
407
+ /// Tests if the value represents negative infinity
408
+ /// </summary>
409
+ /// <returns>true if negative infinity</returns>
410
+ bool IsNegativeInfinity() const noexcept {
411
+ return val == kNegativeInfinityBits;
412
+ }
413
+
414
+ /// <summary>
415
+ /// Tests if the value is either positive or negative infinity.
416
+ /// </summary>
417
+ /// <returns>True if absolute value is infinity</returns>
418
+ bool IsInfinity() const noexcept {
419
+ return AbsImpl() == kPositiveInfinityBits;
420
+ }
421
+
422
+ /// <summary>
423
+ /// Tests if the value is NaN or zero. Useful for comparisons.
424
+ /// </summary>
425
+ /// <returns>True if NaN or zero.</returns>
426
+ bool IsNaNOrZero() const noexcept {
427
+ auto abs = AbsImpl();
428
+ return (abs == 0 || abs > kPositiveInfinityBits);
429
+ }
430
+
431
+ /// <summary>
432
+ /// Tests if the value is normal (not zero, subnormal, infinite, or NaN).
433
+ /// </summary>
434
+ /// <returns>True if so</returns>
435
+ bool IsNormal() const noexcept {
436
+ auto abs = AbsImpl();
437
+ return (abs < kPositiveInfinityBits) // is finite
438
+ && (abs != 0) // is not zero
439
+ && ((abs & kBiasedExponentMask) != 0); // is not subnormal (has a non-zero exponent)
440
+ }
441
+
442
+ /// <summary>
443
+ /// Tests if the value is subnormal (denormal).
444
+ /// </summary>
445
+ /// <returns>True if so</returns>
446
+ bool IsSubnormal() const noexcept {
447
+ auto abs = AbsImpl();
448
+ return (abs < kPositiveInfinityBits) // is finite
449
+ && (abs != 0) // is not zero
450
+ && ((abs & kBiasedExponentMask) == 0); // is subnormal (has a zero exponent)
451
+ }
452
+
453
+ /// <summary>
454
+ /// Creates an instance that represents absolute value.
455
+ /// </summary>
456
+ /// <returns>Absolute value</returns>
457
+ Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); }
458
+
459
+ /// <summary>
460
+ /// Creates a new instance with the sign flipped.
461
+ /// </summary>
462
+ /// <returns>Flipped sign instance</returns>
463
+ Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); }
464
+
465
+ /// <summary>
466
+ /// IEEE defines that positive and negative zero are equal; this gives us a quick equality check
467
+ /// for two values by or'ing the private bits together and stripping the sign. They are both zero,
468
+ /// and therefore equivalent, if the resulting value is still zero.
469
+ /// </summary>
470
+ /// <param name="lhs">first value</param>
471
+ /// <param name="rhs">second value</param>
472
+ /// <returns>True if both arguments represent zero</returns>
473
+ static bool AreZero(const BFloat16Impl& lhs, const BFloat16Impl& rhs) noexcept {
474
+ // IEEE defines that positive and negative zero are equal; this gives us a quick equality check
475
+ // for two values by or'ing the private bits together and stripping the sign. They are both zero,
476
+ // and therefore equivalent, if the resulting value is still zero.
477
+ return static_cast<uint16_t>((lhs.val | rhs.val) & ~kSignMask) == 0;
478
+ }
479
+ };
480
+
481
+ template <class Derived>
482
+ inline uint16_t BFloat16Impl<Derived>::ToUint16Impl(float v) noexcept {
483
+ uint16_t result;
484
+ if (std::isnan(v)) {
485
+ result = kPositiveQNaNBits;
486
+ } else {
487
+ auto get_msb_half = [](float fl) {
488
+ uint16_t result;
489
+ #ifdef __cpp_if_constexpr
490
+ if constexpr (detail::endian::native == detail::endian::little) {
491
+ #else
492
+ if (detail::endian::native == detail::endian::little) {
493
+ #endif
494
+ std::memcpy(&result, reinterpret_cast<char*>(&fl) + sizeof(uint16_t), sizeof(uint16_t));
495
+ } else {
496
+ std::memcpy(&result, &fl, sizeof(uint16_t));
497
+ }
498
+ return result;
499
+ };
500
+
501
+ uint16_t upper_bits = get_msb_half(v);
502
+ union {
503
+ uint32_t U32;
504
+ float F32;
505
+ };
506
+ F32 = v;
507
+ U32 += (upper_bits & 1) + kRoundToNearest;
508
+ result = get_msb_half(F32);
509
+ }
510
+ return result;
511
+ }
512
+
513
+ template <class Derived>
514
+ inline float BFloat16Impl<Derived>::ToFloatImpl() const noexcept {
515
+ if (IsNaN()) {
516
+ return std::numeric_limits<float>::quiet_NaN();
517
+ }
518
+ float result;
519
+ char* const first = reinterpret_cast<char*>(&result);
520
+ char* const second = first + sizeof(uint16_t);
521
+ #ifdef __cpp_if_constexpr
522
+ if constexpr (detail::endian::native == detail::endian::little) {
523
+ #else
524
+ if (detail::endian::native == detail::endian::little) {
525
+ #endif
526
+ std::memset(first, 0, sizeof(uint16_t));
527
+ std::memcpy(second, &val, sizeof(uint16_t));
528
+ } else {
529
+ std::memcpy(first, &val, sizeof(uint16_t));
530
+ std::memset(second, 0, sizeof(uint16_t));
531
+ }
532
+ return result;
533
+ }
534
+
535
+ } // namespace onnxruntime_float16
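
A companion sketch for the bfloat16 path above: ToUint16Impl keeps the most significant half of the float and applies round-to-nearest-even via kRoundToNearest. MyBFloat16 below is hypothetical; Ort::BFloat16_t is the real counterpart.

// Hypothetical CRTP derivative, mirroring the MyFloat16 sketch earlier.
struct MyBFloat16 : onnxruntime_float16::BFloat16Impl<MyBFloat16> {
  static MyBFloat16 FromBits(uint16_t b) noexcept {
    MyBFloat16 r;
    r.val = b;
    return r;
  }
  static MyBFloat16 FromFloat(float v) noexcept { return FromBits(ToUint16Impl(v)); }
};

// 1.0f is 0x3F800000; its top half 0x3F80 survives unchanged.
// 1.00390625f is 0x3F808000, exactly halfway between 0x3F80 and 0x3F81;
// round-to-nearest-even sends it down to 0x3F80 (even mantissa).
// 1.01171875f is 0x3F818000, halfway with an odd lower neighbor (0x3F81);
// it rounds up to 0x3F82.
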
v1.23.1/headers/onnxruntime_lite_custom_op.h ADDED
@@ -0,0 +1,1119 @@
1
+ // Copyright (c) Microsoft Corporation. All rights reserved.
2
+ // Licensed under the MIT License.
3
+
4
+ // Summary
5
+ // The header has APIs to save custom op authors the trouble of defining schemas,
6
+ // which will be inferred from the function's signature, as long as the argument list uses types supported here.
7
+ // Input could be:
8
+ // 1. Tensor of onnx data types.
9
+ // 2. Span of onnx data types.
10
+ // 3. Scalar of onnx data types.
11
+ // An input can be marked optional by declaring it as std::optional<...>.
12
+ // An output must be a tensor of onnx data types.
13
+ // Further, the header has utilities for registering a simple custom struct (which may hold resources) as a custom op.
14
+ // For concrete examples, please search for the keyword "LiteCustomOpTest" under "<cloned_src_dir>/onnxruntime/test/".
15
+ // Note - all APIs in this header are ABI-safe.
16
+
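
For illustration (a sketch, not part of the header): a hypothetical element-wise multiply whose schema follows entirely from its parameter list. CreateLiteCustomOp, defined later in this header, infers two float tensor inputs and one float tensor output from the signature.

#include "onnxruntime_lite_custom_op.h"

void MyMul(const Ort::Custom::Tensor<float>& a,
           const Ort::Custom::Tensor<float>& b,
           Ort::Custom::Tensor<float>& out) {
  const float* pa = a.Data();
  const float* pb = b.Data();
  float* po = out.Allocate(a.Shape());  // shape the output like the first input
  for (int64_t i = 0, n = a.NumberOfElement(); i < n; ++i) {
    po[i] = pa[i] * pb[i];
  }
}

// Registration sketch (CustomOpDomain comes from onnxruntime_cxx_api.h):
//   std::unique_ptr<Ort::Custom::OrtLiteCustomOp> op{
//       Ort::Custom::CreateLiteCustomOp("MyMul", "CPUExecutionProvider", MyMul)};
//   Ort::CustomOpDomain domain{"my.domain"};
//   domain.Add(op.get());
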
17
+ #pragma once
18
+ #include "onnxruntime_cxx_api.h"
19
+ #include <optional>
20
+ #include <numeric>
21
+ #include <functional>
22
+ #include <unordered_set>
23
+
24
+ namespace Ort {
25
+ namespace Custom {
26
+
27
+ class ArgBase {
28
+ public:
29
+ ArgBase(OrtKernelContext* ctx,
30
+ size_t indice,
31
+ bool is_input) : ctx_(ctx), indice_(indice), is_input_(is_input) {}
32
+ virtual ~ArgBase() {};
33
+
34
+ protected:
35
+ struct KernelContext ctx_;
36
+ size_t indice_;
37
+ bool is_input_;
38
+ };
39
+
40
+ using ArgPtr = std::unique_ptr<Custom::ArgBase>;
41
+ using ArgPtrs = std::vector<ArgPtr>;
42
+
43
+ class TensorBase : public ArgBase {
44
+ public:
45
+ TensorBase(OrtKernelContext* ctx,
46
+ size_t indice,
47
+ bool is_input) : ArgBase(ctx, indice, is_input) {}
48
+
49
+ operator bool() const {
50
+ return shape_.has_value();
51
+ }
52
+
53
+ const std::vector<int64_t>& Shape() const {
54
+ if (!shape_.has_value()) {
55
+ ORT_CXX_API_THROW("tensor shape is not yet initialized", OrtErrorCode::ORT_RUNTIME_EXCEPTION);
56
+ }
57
+ return shape_.value();
58
+ }
59
+
60
+ ONNXTensorElementDataType Type() const {
61
+ return type_;
62
+ }
63
+
64
+ int64_t NumberOfElement() const {
65
+ if (shape_.has_value()) {
66
+ return std::accumulate(shape_->begin(), shape_->end(), 1LL, std::multiplies<int64_t>());
67
+ } else {
68
+ return 0;
69
+ }
70
+ }
71
+
72
+ std::string Shape2Str() const {
73
+ if (shape_.has_value()) {
74
+ std::string shape_str;
75
+ for (const auto& dim : *shape_) {
76
+ shape_str.append(std::to_string(dim));
77
+ shape_str.append(", ");
78
+ }
79
+ return shape_str;
80
+ } else {
81
+ return "empty";
82
+ }
83
+ }
84
+
85
+ bool IsCpuTensor() const {
86
+ return strcmp("Cpu", mem_type_) == 0;
87
+ }
88
+
89
+ virtual const void* DataRaw() const = 0;
90
+ virtual size_t SizeInBytes() const = 0;
91
+
92
+ protected:
93
+ std::optional<std::vector<int64_t>> shape_;
94
+ ONNXTensorElementDataType type_ = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED;
95
+ const char* mem_type_ = "Cpu";
96
+ };
97
+
98
+ template <typename T>
99
+ struct Span {
100
+ const T* data_ = {};
101
+ size_t size_ = {};
102
+ void Assign(const T* data, size_t size) {
103
+ data_ = data;
104
+ size_ = size;
105
+ }
106
+ size_t size() const { return size_; }
107
+ T operator[](size_t indice) const {
108
+ return data_[indice];
109
+ }
110
+ const T* data() const { return data_; }
111
+ };
112
+
113
+ template <typename T>
114
+ class Tensor : public TensorBase {
115
+ public:
116
+ using TT = typename std::remove_reference<T>::type;
117
+ Tensor(OrtKernelContext* ctx, size_t indice, bool is_input) : TensorBase(ctx, indice, is_input) {
118
+ if (is_input_) {
119
+ if (indice >= ctx_.GetInputCount()) {
120
+ ORT_CXX_API_THROW("invalid indice for Ort::Custom::Tensor", OrtErrorCode::ORT_INVALID_ARGUMENT);
121
+ }
122
+ const_value_ = ctx_.GetInput(indice);
123
+ auto type_shape_info = const_value_.GetTensorTypeAndShapeInfo();
124
+ shape_ = type_shape_info.GetShape();
125
+ }
126
+ }
127
+ const TT* Data() const {
128
+ return reinterpret_cast<const TT*>(const_value_.GetTensorRawData());
129
+ }
130
+ TT* Allocate(const std::vector<int64_t>& shape) {
131
+ shape_ = shape;
132
+ if (!data_) {
133
+ shape_ = shape;
134
+ data_ = ctx_.GetOutput(indice_, shape).template GetTensorMutableData<TT>();
135
+ }
136
+ return data_;
137
+ }
138
+ static TT GetT() { return (TT)0; }
139
+ const Span<T>& AsSpan() {
140
+ if (!shape_.has_value() || shape_->size() != 1) {
141
+ ORT_CXX_API_THROW("invalid shape while trying to get a span out of Ort::Custom::Tensor",
142
+ OrtErrorCode::ORT_RUNTIME_EXCEPTION);
143
+ }
144
+ span_.Assign(Data(), static_cast<size_t>((*shape_)[0]));
145
+ return span_;
146
+ }
147
+ const T& AsScalar() {
148
+ if (!shape_.has_value() || shape_->size() != 1 || (*shape_)[0] != 1) {
149
+ ORT_CXX_API_THROW("invalid shape while trying to get a scalar from Ort::Custom::Tensor",
150
+ OrtErrorCode::ORT_RUNTIME_EXCEPTION);
151
+ }
152
+ return *Data();
153
+ }
154
+ const void* DataRaw() const override {
155
+ return reinterpret_cast<const void*>(Data());
156
+ }
157
+
158
+ size_t SizeInBytes() const override {
159
+ return sizeof(TT) * static_cast<size_t>(NumberOfElement());
160
+ }
161
+
162
+ private:
163
+ ConstValue const_value_; // for input
164
+ TT* data_{}; // for output
165
+ Span<T> span_;
166
+ };
167
+
168
+ template <>
169
+ class Tensor<std::string> : public TensorBase {
170
+ public:
171
+ using strings = std::vector<std::string>;
172
+
173
+ Tensor(OrtKernelContext* ctx, size_t indice, bool is_input) : TensorBase(ctx, indice, is_input) {
174
+ if (is_input_) {
175
+ if (indice >= ctx_.GetInputCount()) {
176
+ ORT_CXX_API_THROW("invalid indice for Ort::Custom::Tensor", OrtErrorCode::ORT_INVALID_ARGUMENT);
177
+ }
178
+ auto const_value = ctx_.GetInput(indice);
179
+ auto type_shape_info = const_value.GetTensorTypeAndShapeInfo();
180
+ shape_ = type_shape_info.GetShape();
181
+ auto num_chars = const_value.GetStringTensorDataLength();
182
+ // note - the string data is copied out of the tensor; the backward loop below null-terminates each entry in place before reading it.
183
+ auto num_strings = static_cast<size_t>(NumberOfElement());
184
+ if (num_strings) {
185
+ std::vector<char> chars(num_chars + 1, '\0');
186
+ std::vector<size_t> offsets(num_strings);
187
+ const_value.GetStringTensorContent(static_cast<void*>(chars.data()), num_chars, offsets.data(), offsets.size());
188
+ auto upper_bound = num_strings - 1;
189
+ input_strings_.resize(num_strings);
190
+ for (size_t i = upper_bound;; --i) {
191
+ if (i < upper_bound) {
192
+ chars[offsets[i + 1]] = '\0';
193
+ }
194
+ input_strings_[i] = chars.data() + offsets[i];
195
+ if (0 == i) {
196
+ break;
197
+ }
198
+ }
199
+ }
200
+ }
201
+ }
202
+ const strings& Data() const {
203
+ return input_strings_;
204
+ }
205
+ const void* DataRaw() const override {
206
+ if (input_strings_.size() != 1) {
207
+ ORT_CXX_API_THROW("DataRaw() only applies to string scalar", ORT_RUNTIME_EXCEPTION);
208
+ }
209
+ return reinterpret_cast<const void*>(input_strings_[0].c_str());
210
+ }
211
+ size_t SizeInBytes() const override {
212
+ if (input_strings_.size() != 1) {
213
+ ORT_CXX_API_THROW("SizeInBytes() only applies to string scalar", ORT_RUNTIME_EXCEPTION);
214
+ }
215
+ return input_strings_[0].size();
216
+ }
217
+ void SetStringOutput(const strings& ss, const std::vector<int64_t>& dims) {
218
+ shape_ = dims;
219
+ std::vector<const char*> raw;
220
+ for (const auto& s : ss) {
221
+ raw.push_back(s.data());
222
+ }
223
+ auto output = ctx_.GetOutput(indice_, dims.data(), dims.size());
224
+ // note - FillStringTensor copies the strings into the output tensor.
225
+ output.FillStringTensor(raw.data(), raw.size());
226
+ }
227
+ const Span<std::string>& AsSpan() {
228
+ ORT_CXX_API_THROW("span for TensorT of string not implemented", OrtErrorCode::ORT_RUNTIME_EXCEPTION);
229
+ }
230
+ const std::string& AsScalar() {
231
+ if (input_strings_.size() != 1) {
232
+ ORT_CXX_API_THROW("invalid shape while trying to get a scalar string from Ort::Custom::Tensor",
233
+ OrtErrorCode::ORT_RUNTIME_EXCEPTION);
234
+ }
235
+ return input_strings_[0];
236
+ }
237
+
238
+ private:
239
+ std::vector<std::string> input_strings_; // for input
240
+ };
241
+
242
+ template <>
243
+ class Tensor<std::string_view> : public TensorBase {
244
+ public:
245
+ using strings = std::vector<std::string>;
246
+ using string_views = std::vector<std::string_view>;
247
+
248
+ Tensor(OrtKernelContext* ctx, size_t indice, bool is_input) : TensorBase(ctx, indice, is_input) {
249
+ if (is_input_) {
250
+ if (indice >= ctx_.GetInputCount()) {
251
+ ORT_CXX_API_THROW("invalid indice for Ort::Custom::Tensor", OrtErrorCode::ORT_INVALID_ARGUMENT);
252
+ }
253
+ auto const_value = ctx_.GetInput(indice);
254
+ auto type_shape_info = const_value.GetTensorTypeAndShapeInfo();
255
+ shape_ = type_shape_info.GetShape();
256
+ auto num_chars = const_value.GetStringTensorDataLength();
257
+ chars_.resize(num_chars + 1, '\0');
258
+ auto num_strings = static_cast<size_t>(NumberOfElement());
259
+ if (num_strings) {
260
+ std::vector<size_t> offsets(num_strings);
261
+ const_value.GetStringTensorContent(static_cast<void*>(chars_.data()), num_chars, offsets.data(), offsets.size());
262
+ offsets.push_back(num_chars);
263
+ for (size_t i = 0; i < num_strings; ++i) {
264
+ input_string_views_.emplace_back(chars_.data() + offsets[i], offsets[i + 1] - offsets[i]);
265
+ }
266
+ }
267
+ }
268
+ }
269
+ const string_views& Data() const {
270
+ return input_string_views_;
271
+ }
272
+ const void* DataRaw() const override {
273
+ if (input_string_views_.size() != 1) {
274
+ ORT_CXX_API_THROW("DataRaw() only applies to string scalar", ORT_RUNTIME_EXCEPTION);
275
+ }
276
+ return reinterpret_cast<const void*>(input_string_views_[0].data());
277
+ }
278
+ size_t SizeInBytes() const override {
279
+ if (input_string_views_.size() != 1) {
280
+ ORT_CXX_API_THROW("SizeInBytes() only applies to string scalar", ORT_RUNTIME_EXCEPTION);
281
+ }
282
+ return input_string_views_[0].size();
283
+ }
284
+ void SetStringOutput(const strings& ss, const std::vector<int64_t>& dims) {
285
+ shape_ = dims;
286
+ std::vector<const char*> raw;
287
+ for (const auto& s : ss) {
288
+ raw.push_back(s.data());
289
+ }
290
+ auto output = ctx_.GetOutput(indice_, dims.data(), dims.size());
291
+ // note - FillStringTensor copies the strings into the output tensor.
292
+ output.FillStringTensor(raw.data(), raw.size());
293
+ }
294
+ const Span<std::string_view>& AsSpan() {
295
+ ORT_CXX_API_THROW("span for TensorT of string view not implemented", OrtErrorCode::ORT_RUNTIME_EXCEPTION);
296
+ }
297
+ std::string_view AsScalar() {
298
+ if (input_string_views_.size() != 1) {
299
+ ORT_CXX_API_THROW("invalid shape while trying to get a scalar string view from Ort::Custom::Tensor",
300
+ OrtErrorCode::ORT_RUNTIME_EXCEPTION);
301
+ }
302
+ return input_string_views_[0];
303
+ }
304
+
305
+ private:
306
+ std::vector<char> chars_; // for input
307
+ std::vector<std::string_view> input_string_views_; // for input
308
+ };
309
+
310
+ using TensorPtr = std::unique_ptr<Custom::TensorBase>;
311
+ using TensorPtrs = std::vector<TensorPtr>;
312
+
313
+ struct TensorArray : public ArgBase {
314
+ TensorArray(OrtKernelContext* ctx,
315
+ size_t start_indice,
316
+ bool is_input) : ArgBase(ctx,
317
+ start_indice,
318
+ is_input) {
319
+ if (is_input) {
320
+ auto input_count = ctx_.GetInputCount();
321
+ for (size_t ith_input = start_indice; ith_input < input_count; ++ith_input) {
322
+ auto const_value = ctx_.GetInput(ith_input);
323
+ auto type_shape_info = const_value.GetTensorTypeAndShapeInfo();
324
+ auto type = type_shape_info.GetElementType();
325
+ TensorPtr tensor;
326
+ switch (type) {
327
+ case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL:
328
+ tensor = std::make_unique<Custom::Tensor<bool>>(ctx, ith_input, true);
329
+ break;
330
+ case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT:
331
+ tensor = std::make_unique<Custom::Tensor<float>>(ctx, ith_input, true);
332
+ break;
333
+ case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE:
334
+ tensor = std::make_unique<Custom::Tensor<double>>(ctx, ith_input, true);
335
+ break;
336
+ case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8:
337
+ tensor = std::make_unique<Custom::Tensor<uint8_t>>(ctx, ith_input, true);
338
+ break;
339
+ case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8:
340
+ tensor = std::make_unique<Custom::Tensor<int8_t>>(ctx, ith_input, true);
341
+ break;
342
+ case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16:
343
+ tensor = std::make_unique<Custom::Tensor<uint16_t>>(ctx, ith_input, true);
344
+ break;
345
+ case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16:
346
+ tensor = std::make_unique<Custom::Tensor<int16_t>>(ctx, ith_input, true);
347
+ break;
348
+ case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32:
349
+ tensor = std::make_unique<Custom::Tensor<uint32_t>>(ctx, ith_input, true);
350
+ break;
351
+ case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32:
352
+ tensor = std::make_unique<Custom::Tensor<int32_t>>(ctx, ith_input, true);
353
+ break;
354
+ case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64:
355
+ tensor = std::make_unique<Custom::Tensor<uint64_t>>(ctx, ith_input, true);
356
+ break;
357
+ case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64:
358
+ tensor = std::make_unique<Custom::Tensor<int64_t>>(ctx, ith_input, true);
359
+ break;
360
+ case ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING:
361
+ tensor = std::make_unique<Custom::Tensor<std::string>>(ctx, ith_input, true);
362
+ break;
363
+ default:
364
+ ORT_CXX_API_THROW("unknown input type", ORT_RUNTIME_EXCEPTION);
365
+ break;
366
+ }
367
+ tensors_.emplace_back(tensor.release());
368
+ } // for
369
+ }
370
+ }
371
+ template <typename T>
372
+ T* AllocateOutput(size_t ith_output, const std::vector<int64_t>& shape) {
373
+ // ith_output is the index of the output relative to the tensor array;
374
+ // indice_ + ith_output is the index relative to the kernel context
375
+ auto tensor = std::make_unique<Tensor<T>>(ctx_.GetOrtKernelContext(), indice_ + ith_output, false);
376
+ auto raw_output = tensor.get()->Allocate(shape);
377
+ tensors_.emplace_back(tensor.release());
378
+ return raw_output;
379
+ }
380
+ Tensor<std::string>& AllocateStringTensor(size_t ith_output) {
381
+ // ith_output is the index of the output relative to the tensor array;
383
+ // indice_ + ith_output is the index relative to the kernel context
383
+ auto tensor = std::make_unique<Tensor<std::string>>(ctx_.GetOrtKernelContext(), indice_ + ith_output, false);
384
+ Tensor<std::string>& output = *tensor;
385
+ tensors_.emplace_back(tensor.release());
386
+ return output;
387
+ }
388
+ size_t Size() const {
389
+ return tensors_.size();
390
+ }
391
+ const TensorPtr& operator[](size_t ith_input) const {
392
+ // ith_input is the index of the input relative to the tensor array
393
+ return tensors_.at(ith_input);
394
+ }
395
+
396
+ private:
397
+ TensorPtrs tensors_;
398
+ };
399
+
400
+ using Variadic = TensorArray;
401
+
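
A sketch of a variadic kernel built on the Variadic alias above (hypothetical, and assuming every input is a float tensor):

#include <cstring>

// Copies each float input tensor to an identically shaped output.
void MyPassThrough(const Ort::Custom::Variadic& inputs,
                   Ort::Custom::Variadic& outputs) {
  for (size_t i = 0; i < inputs.Size(); ++i) {
    const auto& in = inputs[i];  // a TensorPtr
    float* out = outputs.AllocateOutput<float>(i, in->Shape());
    std::memcpy(out, in->DataRaw(), in->SizeInBytes());
  }
}
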
402
+ /*
403
+ Note:
404
+ OrtLiteCustomOp inherits from OrtCustomOp to bridge between a custom func/struct and the ort core.
405
+ The lifetime of an OrtLiteCustomOp instance is managed by customer code, not ort, so:
406
+ 1. DO NOT cast OrtLiteCustomOp to OrtCustomOp and release it, since there is no virtual destructor in the hierarchy.
407
+ 2. OrtLiteCustomFunc and OrtLiteCustomStruct, as two sub-structs, can be released in the form of OrtLiteCustomOp since all members are kept in the OrtLiteCustomOp,
408
+ hence memory can still be recycled properly.
409
+ Further, OrtCustomOp is a C struct bearing no v-table, so derived structs are by design required to have zero virtual functions to maintain cast safety.
410
+ */
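
A one-line usage sketch of the ownership rule above (hypothetical names, reusing the MyMul example from the header summary):

// Hold and release through OrtLiteCustomOp (or a sub-struct), never through a
// plain OrtCustomOp*, because the hierarchy has no virtual destructor:
//   std::unique_ptr<Ort::Custom::OrtLiteCustomOp> op{
//       Ort::Custom::CreateLiteCustomOp("MyMul", "CPUExecutionProvider", MyMul)};
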
411
+ struct OrtLiteCustomOp : public OrtCustomOp {
412
+ using ConstOptionalFloatTensor = std::optional<const Custom::Tensor<float>&>;
413
+ using OptionalFloatTensor = std::optional<Custom::Tensor<float>>;
414
+
415
+ // CreateTuple
416
+ template <size_t ith_input, size_t ith_output, typename... Ts>
417
+ static typename std::enable_if<sizeof...(Ts) == 0, std::tuple<>>::type
418
+ CreateTuple(OrtKernelContext*, ArgPtrs&, size_t, size_t, const std::string&) {
419
+ return std::make_tuple();
420
+ }
421
+
422
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts>
423
+ static typename std::enable_if<std::is_same<T, OrtKernelContext*>::value, std::tuple<T, Ts...>>::type
424
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) {
425
+ std::tuple<T> current = std::tuple<OrtKernelContext*>{context};
426
+ auto next = CreateTuple<ith_input, ith_output, Ts...>(context, args, num_input, num_output, ep);
427
+ return std::tuple_cat(current, next);
428
+ }
429
+
430
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts>
431
+ static typename std::enable_if<std::is_same<T, OrtKernelContext&>::value, std::tuple<T, Ts...>>::type
432
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) {
433
+ std::tuple<T> current = std::tuple<OrtKernelContext&>{*context};
434
+ auto next = CreateTuple<ith_input, ith_output, Ts...>(context, args, num_input, num_output, ep);
435
+ return std::tuple_cat(current, next);
436
+ }
437
+
438
+ #ifdef ORT_CUDA_CTX
439
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts>
440
+ static typename std::enable_if<std::is_same<T, const CudaContext&>::value, std::tuple<T, Ts...>>::type
441
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) {
442
+ thread_local CudaContext cuda_context;
443
+ cuda_context.Init(*context);
444
+ std::tuple<T> current = std::tuple<const CudaContext&>{cuda_context};
445
+ auto next = CreateTuple<ith_input, ith_output, Ts...>(context, args, num_input, num_output, ep);
446
+ return std::tuple_cat(current, next);
447
+ }
448
+ #endif
449
+
450
+ #ifdef ORT_ROCM_CTX
451
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts>
452
+ static typename std::enable_if<std::is_same<T, const RocmContext&>::value, std::tuple<T, Ts...>>::type
453
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) {
454
+ thread_local RocmContext rocm_context;
455
+ rocm_context.Init(*context);
456
+ std::tuple<T> current = std::tuple<const RocmContext&>{rocm_context};
457
+ auto next = CreateTuple<ith_input, ith_output, Ts...>(context, args, num_input, num_output, ep);
458
+ return std::tuple_cat(current, next);
459
+ }
460
+ #endif
461
+
462
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts>
463
+ static typename std::enable_if<std::is_same<T, const TensorArray*>::value, std::tuple<T, Ts...>>::type
464
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) {
465
+ args.push_back(std::make_unique<TensorArray>(context, ith_input, true));
466
+ std::tuple<T> current = std::tuple<T>{reinterpret_cast<T>(args.back().get())};
467
+ auto next = CreateTuple<ith_input + 1, ith_output, Ts...>(context, args, num_input, num_output, ep);
468
+ return std::tuple_cat(current, next);
469
+ }
470
+
471
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts>
472
+ static typename std::enable_if<std::is_same<T, const TensorArray&>::value, std::tuple<T, Ts...>>::type
473
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) {
474
+ args.push_back(std::make_unique<TensorArray>(context, ith_input, true));
475
+ std::tuple<T> current = std::tuple<T>{reinterpret_cast<T>(*args.back().get())};
476
+ auto next = CreateTuple<ith_input + 1, ith_output, Ts...>(context, args, num_input, num_output, ep);
477
+ return std::tuple_cat(current, next);
478
+ }
479
+
480
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts>
481
+ static typename std::enable_if<std::is_same<T, TensorArray*>::value, std::tuple<T, Ts...>>::type
482
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) {
483
+ args.push_back(std::make_unique<TensorArray>(context, ith_output, false));
484
+ std::tuple<T> current = std::tuple<T>{reinterpret_cast<T>(args.back().get())};
485
+ auto next = CreateTuple<ith_input, ith_output + 1, Ts...>(context, args, num_input, num_output, ep);
486
+ return std::tuple_cat(current, next);
487
+ }
488
+
489
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts>
490
+ static typename std::enable_if<std::is_same<T, TensorArray&>::value, std::tuple<T, Ts...>>::type
491
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) {
492
+ args.push_back(std::make_unique<TensorArray>(context, ith_output, false));
493
+ std::tuple<T> current = std::tuple<T>{reinterpret_cast<T>(*args.back().get())};
494
+ auto next = CreateTuple<ith_input, ith_output + 1, Ts...>(context, args, num_input, num_output, ep);
495
+ return std::tuple_cat(current, next);
496
+ }
497
+
498
+ #define CREATE_TUPLE_INPUT(data_type) \
499
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts> \
500
+ static typename std::enable_if<std::is_same<T, const Custom::Tensor<data_type>*>::value, std::tuple<T, Ts...>>::type \
501
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) { \
502
+ args.push_back(std::make_unique<Custom::Tensor<data_type>>(context, ith_input, true)); \
503
+ std::tuple<T> current = std::tuple<T>{reinterpret_cast<T>(args.back().get())}; \
504
+ auto next = CreateTuple<ith_input + 1, ith_output, Ts...>(context, args, num_input, num_output, ep); \
505
+ return std::tuple_cat(current, next); \
506
+ } \
507
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts> \
508
+ static typename std::enable_if<std::is_same<T, const Custom::Tensor<data_type>&>::value, std::tuple<T, Ts...>>::type \
509
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) { \
510
+ args.push_back(std::make_unique<Custom::Tensor<data_type>>(context, ith_input, true)); \
511
+ std::tuple<T> current = std::tuple<T>{reinterpret_cast<T>(*args.back().get())}; \
512
+ auto next = CreateTuple<ith_input + 1, ith_output, Ts...>(context, args, num_input, num_output, ep); \
513
+ return std::tuple_cat(current, next); \
514
+ } \
515
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts> \
516
+ static typename std::enable_if<std::is_same<T, std::optional<const Custom::Tensor<data_type>*>>::value, std::tuple<T, Ts...>>::type \
517
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) { \
518
+ if (ith_input < num_input) { \
519
+ args.push_back(std::make_unique<Custom::Tensor<data_type>>(context, ith_input, true)); \
520
+ std::tuple<T> current = std::tuple<T>{reinterpret_cast<Custom::Tensor<data_type>*>(args.back().get())}; \
521
+ auto next = CreateTuple<ith_input + 1, ith_output, Ts...>(context, args, num_input, num_output, ep); \
522
+ return std::tuple_cat(current, next); \
523
+ } else { \
524
+ std::tuple<T> current = std::tuple<T>{}; \
525
+ auto next = CreateTuple<ith_input + 1, ith_output, Ts...>(context, args, num_input, num_output, ep); \
526
+ return std::tuple_cat(current, next); \
527
+ } \
528
+ } \
529
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts> \
530
+ static typename std::enable_if<std::is_same<T, const Custom::Span<data_type>*>::value, std::tuple<T, Ts...>>::type \
531
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) { \
532
+ if ("CPUExecutionProvider" != ep) { \
533
+ ORT_CXX_API_THROW("span input could only be applied to CPU EP", OrtErrorCode::ORT_RUNTIME_EXCEPTION); \
534
+ } \
535
+ args.push_back(std::make_unique<Custom::Tensor<data_type>>(context, ith_input, true)); \
536
+ std::tuple<T> current = std::tuple<T>{&reinterpret_cast<Custom::Tensor<data_type>*>(args.back().get())->AsSpan()}; \
537
+ auto next = CreateTuple<ith_input + 1, ith_output, Ts...>(context, args, num_input, num_output, ep); \
538
+ return std::tuple_cat(current, next); \
539
+ } \
540
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts> \
541
+ static typename std::enable_if<std::is_same<T, const Custom::Span<data_type>&>::value, std::tuple<T, Ts...>>::type \
542
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) { \
543
+ if ("CPUExecutionProvider" != ep) { \
544
+ ORT_CXX_API_THROW("span input could only be applied to CPU EP", OrtErrorCode::ORT_RUNTIME_EXCEPTION); \
545
+ } \
546
+ args.push_back(std::make_unique<Custom::Tensor<data_type>>(context, ith_input, true)); \
547
+ std::tuple<T> current = std::tuple<T>{reinterpret_cast<Custom::Tensor<data_type>*>(args.back().get())->AsSpan()}; \
548
+ auto next = CreateTuple<ith_input + 1, ith_output, Ts...>(context, args, num_input, num_output, ep); \
549
+ return std::tuple_cat(current, next); \
550
+ } \
551
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts> \
552
+ static typename std::enable_if<std::is_same<T, std::optional<const Custom::Span<data_type>*>>::value, std::tuple<T, Ts...>>::type \
553
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) { \
554
+ if (ith_input < num_input) { \
555
+ if ("CPUExecutionProvider" != ep) { \
556
+ ORT_CXX_API_THROW("span input could only be applied to CPU EP", OrtErrorCode::ORT_RUNTIME_EXCEPTION); \
557
+ } \
558
+ args.push_back(std::make_unique<Custom::Tensor<data_type>>(context, ith_input, true)); \
559
+ std::tuple<T> current = std::tuple<T>{&reinterpret_cast<Custom::Tensor<data_type>*>(args.back().get())->AsSpan()}; \
560
+ auto next = CreateTuple<ith_input + 1, ith_output, Ts...>(context, args, num_input, num_output, ep); \
561
+ return std::tuple_cat(current, next); \
562
+ } else { \
563
+ std::tuple<T> current = std::tuple<T>{}; \
564
+ auto next = CreateTuple<ith_input + 1, ith_output, Ts...>(context, args, num_input, num_output, ep); \
565
+ return std::tuple_cat(current, next); \
566
+ } \
567
+ } \
568
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts> \
569
+ static typename std::enable_if<std::is_same<T, data_type>::value, std::tuple<T, Ts...>>::type \
570
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) { \
571
+ if ("CPUExecutionProvider" != ep) { \
572
+ ORT_CXX_API_THROW("scalar input could only be applied to CPU EP", OrtErrorCode::ORT_RUNTIME_EXCEPTION); \
573
+ } \
574
+ args.push_back(std::make_unique<Custom::Tensor<data_type>>(context, ith_input, true)); \
575
+ std::tuple<T> current = std::tuple<T>{reinterpret_cast<Custom::Tensor<data_type>*>(args.back().get())->AsScalar()}; \
576
+ auto next = CreateTuple<ith_input + 1, ith_output, Ts...>(context, args, num_input, num_output, ep); \
577
+ return std::tuple_cat(current, next); \
578
+ } \
579
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts> \
580
+ static typename std::enable_if<std::is_same<T, std::optional<data_type>>::value, std::tuple<T, Ts...>>::type \
581
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) { \
582
+ if (ith_input < num_input) { \
583
+ if ("CPUExecutionProvider" != ep) { \
584
+ ORT_CXX_API_THROW("scalar input could only be applied to CPU EP", OrtErrorCode::ORT_RUNTIME_EXCEPTION); \
585
+ } \
586
+ args.push_back(std::make_unique<Custom::Tensor<data_type>>(context, ith_input, true)); \
587
+ std::tuple<T> current = std::tuple<T>{reinterpret_cast<Custom::Tensor<data_type>*>(args.back().get())->AsScalar()}; \
588
+ auto next = CreateTuple<ith_input + 1, ith_output, Ts...>(context, args, num_input, num_output, ep); \
589
+ return std::tuple_cat(current, next); \
590
+ } else { \
591
+ std::tuple<T> current = std::tuple<T>{}; \
592
+ auto next = CreateTuple<ith_input + 1, ith_output, Ts...>(context, args, num_input, num_output, ep); \
593
+ return std::tuple_cat(current, next); \
594
+ } \
595
+ }
596
+ #define CREATE_TUPLE_OUTPUT(data_type) \
597
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts> \
598
+ static typename std::enable_if<std::is_same<T, Custom::Tensor<data_type>*>::value, std::tuple<T, Ts...>>::type \
599
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) { \
600
+ args.push_back(std::make_unique<Custom::Tensor<data_type>>(context, ith_output, false)); \
601
+ std::tuple<T> current = std::tuple<T>{reinterpret_cast<T>(args.back().get())}; \
602
+ auto next = CreateTuple<ith_input, ith_output + 1, Ts...>(context, args, num_input, num_output, ep); \
603
+ return std::tuple_cat(current, next); \
604
+ } \
605
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts> \
606
+ static typename std::enable_if<std::is_same<T, Custom::Tensor<data_type>&>::value, std::tuple<T, Ts...>>::type \
607
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) { \
608
+ args.push_back(std::make_unique<Custom::Tensor<data_type>>(context, ith_output, false)); \
609
+ std::tuple<T> current = std::tuple<T>{reinterpret_cast<T>(*args.back().get())}; \
610
+ auto next = CreateTuple<ith_input, ith_output + 1, Ts...>(context, args, num_input, num_output, ep); \
611
+ return std::tuple_cat(current, next); \
612
+ } \
613
+ template <size_t ith_input, size_t ith_output, typename T, typename... Ts> \
614
+ static typename std::enable_if<std::is_same<T, std::optional<Custom::Tensor<data_type>*>>::value, std::tuple<T, Ts...>>::type \
615
+ CreateTuple(OrtKernelContext* context, ArgPtrs& args, size_t num_input, size_t num_output, const std::string& ep) { \
616
+ if (ith_output < num_output) { \
617
+ args.push_back(std::make_unique<Custom::Tensor<data_type>>(context, ith_output, false)); \
618
+ std::tuple<T> current = std::tuple<T>{reinterpret_cast<Custom::Tensor<data_type>*>(args.back().get())}; \
619
+ auto next = CreateTuple<ith_input, ith_output + 1, Ts...>(context, args, num_input, num_output, ep); \
620
+ return std::tuple_cat(current, next); \
621
+ } else { \
622
+ std::tuple<T> current = std::tuple<T>{}; \
623
+ auto next = CreateTuple<ith_input, ith_output + 1, Ts...>(context, args, num_input, num_output, ep); \
624
+ return std::tuple_cat(current, next); \
625
+ } \
626
+ }
627
+ #define CREATE_TUPLE(data_type) \
628
+ CREATE_TUPLE_INPUT(data_type) \
629
+ CREATE_TUPLE_OUTPUT(data_type)
630
+
631
+ CREATE_TUPLE(bool)
632
+ CREATE_TUPLE(float)
633
+ CREATE_TUPLE(Ort::Float16_t)
634
+ CREATE_TUPLE(Ort::BFloat16_t)
635
+ CREATE_TUPLE(double)
636
+ CREATE_TUPLE(int8_t)
637
+ CREATE_TUPLE(int16_t)
638
+ CREATE_TUPLE(int32_t)
639
+ CREATE_TUPLE(int64_t)
640
+ CREATE_TUPLE(uint8_t)
641
+ CREATE_TUPLE(uint16_t)
642
+ CREATE_TUPLE(uint32_t)
643
+ CREATE_TUPLE(uint64_t)
644
+ CREATE_TUPLE(std::string)
645
+ CREATE_TUPLE_INPUT(std::string_view)
646
+ CREATE_TUPLE(Ort::Float8E4M3FN_t)
647
+ CREATE_TUPLE(Ort::Float8E4M3FNUZ_t)
648
+ CREATE_TUPLE(Ort::Float8E5M2_t)
649
+ CREATE_TUPLE(Ort::Float8E5M2FNUZ_t)
650
+
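
Reading aid for the CreateTuple overloads above: each specialization peels one parameter off the pack, materializes the matching argument in `args`, and concatenates it onto the tuple built for the remaining parameters. Conceptually (an illustrative expansion with casts omitted, not literal header code):

// For void F(const Ort::Custom::Tensor<float>& x, Ort::Custom::Tensor<float>& y),
// the recursion effectively performs:
//   args.push_back(std::make_unique<Tensor<float>>(ctx, /*ith_input=*/0, true));
//   auto t_in  = std::tuple<const Tensor<float>&>{*args.back()};
//   args.push_back(std::make_unique<Tensor<float>>(ctx, /*ith_output=*/0, false));
//   auto t_out = std::tuple<Tensor<float>&>{*args.back()};
//   return std::tuple_cat(t_in, t_out);  // later unpacked onto F via std::apply
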
651
+ // ParseArgs ...
652
+ template <typename... Ts>
653
+ static typename std::enable_if<0 == sizeof...(Ts)>::type
654
+ ParseArgs(std::vector<ONNXTensorElementDataType>&, std::vector<ONNXTensorElementDataType>&) {
655
+ }
656
+
657
+ template <typename T, typename... Ts>
658
+ static typename std::enable_if<0 <= sizeof...(Ts) && std::is_same<T, OrtKernelContext*>::value>::type
659
+ ParseArgs(std::vector<ONNXTensorElementDataType>& input_types, std::vector<ONNXTensorElementDataType>& output_types) {
660
+ ParseArgs<Ts...>(input_types, output_types);
661
+ }
662
+
663
+ template <typename T, typename... Ts>
664
+ static typename std::enable_if<0 <= sizeof...(Ts) && std::is_same<T, OrtKernelContext&>::value>::type
665
+ ParseArgs(std::vector<ONNXTensorElementDataType>& input_types, std::vector<ONNXTensorElementDataType>& output_types) {
666
+ ParseArgs<Ts...>(input_types, output_types);
667
+ }
668
+
669
+ #ifdef ORT_CUDA_CTX
670
+ template <typename T, typename... Ts>
671
+ static typename std::enable_if<0 <= sizeof...(Ts) && std::is_same<T, const CudaContext&>::value>::type
672
+ ParseArgs(std::vector<ONNXTensorElementDataType>& input_types, std::vector<ONNXTensorElementDataType>& output_types) {
673
+ ParseArgs<Ts...>(input_types, output_types);
674
+ }
675
+ #endif
676
+
677
+ #ifdef ORT_ROCM_CTX
678
+ template <typename T, typename... Ts>
679
+ static typename std::enable_if<0 <= sizeof...(Ts) && std::is_same<T, const RocmContext&>::value>::type
680
+ ParseArgs(std::vector<ONNXTensorElementDataType>& input_types, std::vector<ONNXTensorElementDataType>& output_types) {
681
+ ParseArgs<Ts...>(input_types, output_types);
682
+ }
683
+ #endif
684
+
685
+ template <typename T, typename... Ts>
686
+ static typename std::enable_if<0 <= sizeof...(Ts) && std::is_same<T, const TensorArray&>::value>::type
687
+ ParseArgs(std::vector<ONNXTensorElementDataType>& input_types, std::vector<ONNXTensorElementDataType>& output_types) {
688
+ input_types.push_back(ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED);
689
+ ParseArgs<Ts...>(input_types, output_types);
690
+ }
691
+
692
+ template <typename T, typename... Ts>
693
+ static typename std::enable_if<0 <= sizeof...(Ts) && std::is_same<T, const TensorArray*>::value>::type
694
+ ParseArgs(std::vector<ONNXTensorElementDataType>& input_types, std::vector<ONNXTensorElementDataType>& output_types) {
695
+ input_types.push_back(ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED);
696
+ ParseArgs<Ts...>(input_types, output_types);
697
+ }
698
+
699
+ template <typename T, typename... Ts>
700
+ static typename std::enable_if<0 <= sizeof...(Ts) && std::is_same<T, TensorArray&>::value>::type
701
+ ParseArgs(std::vector<ONNXTensorElementDataType>& input_types, std::vector<ONNXTensorElementDataType>& output_types) {
702
+ output_types.push_back(ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED);
703
+ ParseArgs<Ts...>(input_types, output_types);
704
+ }
705
+
706
+ template <typename T, typename... Ts>
707
+ static typename std::enable_if<0 <= sizeof...(Ts) && std::is_same<T, TensorArray*>::value>::type
708
+ ParseArgs(std::vector<ONNXTensorElementDataType>& input_types, std::vector<ONNXTensorElementDataType>& output_types) {
709
+ output_types.push_back(ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED);
710
+ ParseArgs<Ts...>(input_types, output_types);
711
+ }
712
+
713
+ #define PARSE_INPUT_BASE(pack_type, onnx_type) \
714
+ template <typename T, typename... Ts> \
715
+ static typename std::enable_if<0 <= sizeof...(Ts) && std::is_same<T, pack_type>::value>::type \
716
+ ParseArgs(std::vector<ONNXTensorElementDataType>& input_types, std::vector<ONNXTensorElementDataType>& output_types) { \
717
+ input_types.push_back(onnx_type); \
718
+ ParseArgs<Ts...>(input_types, output_types); \
719
+ } \
720
+ template <typename T, typename... Ts> \
721
+ static typename std::enable_if<0 <= sizeof...(Ts) && std::is_same<T, const std::optional<pack_type>>::value>::type \
722
+ ParseArgs(std::vector<ONNXTensorElementDataType>& input_types, std::vector<ONNXTensorElementDataType>& output_types) { \
723
+ input_types.push_back(onnx_type); \
724
+ ParseArgs<Ts...>(input_types, output_types); \
725
+ } \
726
+ template <typename T, typename... Ts> \
727
+ static typename std::enable_if<0 <= sizeof...(Ts) && std::is_same<T, std::optional<pack_type>>::value>::type \
728
+ ParseArgs(std::vector<ONNXTensorElementDataType>& input_types, std::vector<ONNXTensorElementDataType>& output_types) { \
729
+ input_types.push_back(onnx_type); \
730
+ ParseArgs<Ts...>(input_types, output_types); \
731
+ }
732
+
733
+ #define PARSE_INPUT(data_type, onnx_type) \
734
+ PARSE_INPUT_BASE(const Custom::Tensor<data_type>*, onnx_type) \
735
+ PARSE_INPUT_BASE(const Custom::Tensor<data_type>&, onnx_type) \
736
+ PARSE_INPUT_BASE(const Custom::Span<data_type>*, onnx_type) \
737
+ PARSE_INPUT_BASE(const Custom::Span<data_type>&, onnx_type) \
738
+ PARSE_INPUT_BASE(data_type, onnx_type)
739
+
740
+ #define PARSE_OUTPUT(data_type, onnx_type) \
741
+ template <typename T, typename... Ts> \
742
+ static typename std::enable_if<0 <= sizeof...(Ts) && std::is_same<T, Custom::Tensor<data_type>*>::value>::type \
743
+ ParseArgs(std::vector<ONNXTensorElementDataType>& input_types, std::vector<ONNXTensorElementDataType>& output_types) { \
744
+ output_types.push_back(onnx_type); \
745
+ ParseArgs<Ts...>(input_types, output_types); \
746
+ } \
747
+ template <typename T, typename... Ts> \
748
+ static typename std::enable_if<0 <= sizeof...(Ts) && std::is_same<T, Custom::Tensor<data_type>&>::value>::type \
749
+ ParseArgs(std::vector<ONNXTensorElementDataType>& input_types, std::vector<ONNXTensorElementDataType>& output_types) { \
750
+ output_types.push_back(onnx_type); \
751
+ ParseArgs<Ts...>(input_types, output_types); \
752
+ } \
753
+ template <typename T, typename... Ts> \
754
+ static typename std::enable_if<0 <= sizeof...(Ts) && std::is_same<T, std::optional<Custom::Tensor<data_type>*>>::value>::type \
755
+ ParseArgs(std::vector<ONNXTensorElementDataType>& input_types, std::vector<ONNXTensorElementDataType>& output_types) { \
756
+ output_types.push_back(onnx_type); \
757
+ ParseArgs<Ts...>(input_types, output_types); \
758
+ }
759
+
760
+ #define PARSE_ARGS(data_type, onnx_type) \
761
+ PARSE_INPUT(data_type, onnx_type) \
762
+ PARSE_OUTPUT(data_type, onnx_type)
763
+
764
+ PARSE_ARGS(bool, ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL)
765
+ PARSE_ARGS(float, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT)
766
+ PARSE_ARGS(Ort::Float16_t, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16)
767
+ PARSE_ARGS(Ort::BFloat16_t, ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16)
768
+ PARSE_ARGS(double, ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE)
769
+ PARSE_ARGS(int8_t, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8)
770
+ PARSE_ARGS(int16_t, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16)
771
+ PARSE_ARGS(int32_t, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32)
772
+ PARSE_ARGS(int64_t, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64)
773
+ PARSE_ARGS(uint8_t, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8)
774
+ PARSE_ARGS(uint16_t, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16)
775
+ PARSE_ARGS(uint32_t, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32)
776
+ PARSE_ARGS(uint64_t, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64)
777
+ PARSE_ARGS(std::string, ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING)
778
+ PARSE_ARGS(std::string_view, ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING) // todo - remove string_view output
779
+ PARSE_ARGS(Ort::Float8E4M3FN_t, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FN)
780
+ PARSE_ARGS(Ort::Float8E4M3FNUZ_t, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FNUZ)
781
+ PARSE_ARGS(Ort::Float8E5M2_t, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2)
782
+ PARSE_ARGS(Ort::Float8E5M2FNUZ_t, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2FNUZ)
783
+
784
+ OrtLiteCustomOp(const char* op_name,
785
+ const char* execution_provider,
786
+ ShapeInferFn shape_infer_fn,
787
+ int start_ver = 1,
788
+ int end_ver = MAX_CUSTOM_OP_END_VER) : op_name_(op_name),
789
+ execution_provider_(execution_provider),
790
+ shape_infer_fn_(shape_infer_fn),
791
+ start_ver_(start_ver),
792
+ end_ver_(end_ver) {
793
+ OrtCustomOp::version = ORT_API_VERSION;
794
+
795
+ OrtCustomOp::GetName = [](const OrtCustomOp* op) { return static_cast<const OrtLiteCustomOp*>(op)->op_name_.c_str(); };
796
+ OrtCustomOp::GetExecutionProviderType = [](const OrtCustomOp* op) { return static_cast<const OrtLiteCustomOp*>(op)->execution_provider_.c_str(); };
797
+ OrtCustomOp::GetInputMemoryType = [](const OrtCustomOp*, size_t) { return OrtMemTypeDefault; };
798
+
799
+ OrtCustomOp::GetInputTypeCount = [](const OrtCustomOp* op) {
800
+ auto self = reinterpret_cast<const OrtLiteCustomOp*>(op);
801
+ return self->input_types_.size();
802
+ };
803
+
804
+ OrtCustomOp::GetInputType = [](const OrtCustomOp* op, size_t indice) {
805
+ auto self = reinterpret_cast<const OrtLiteCustomOp*>(op);
806
+ return self->input_types_[indice];
807
+ };
808
+
809
+ OrtCustomOp::GetOutputTypeCount = [](const OrtCustomOp* op) {
810
+ auto self = reinterpret_cast<const OrtLiteCustomOp*>(op);
811
+ return self->output_types_.size();
812
+ };
813
+
814
+ OrtCustomOp::GetOutputType = [](const OrtCustomOp* op, size_t indice) {
815
+ auto self = reinterpret_cast<const OrtLiteCustomOp*>(op);
816
+ return self->output_types_[indice];
817
+ };
818
+
819
+ OrtCustomOp::GetInputCharacteristic = [](const OrtCustomOp* op, size_t indice) {
820
+ auto self = reinterpret_cast<const OrtLiteCustomOp*>(op);
821
+ return self->input_types_[indice] == ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED ? INPUT_OUTPUT_VARIADIC : INPUT_OUTPUT_OPTIONAL;
822
+ };
823
+
824
+ OrtCustomOp::GetOutputCharacteristic = [](const OrtCustomOp* op, size_t indice) {
825
+ auto self = reinterpret_cast<const OrtLiteCustomOp*>(op);
826
+ return self->output_types_[indice] == ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED ? INPUT_OUTPUT_VARIADIC : INPUT_OUTPUT_OPTIONAL;
827
+ };
828
+
829
+ // Variadic inputs and outputs have no minimum arity, and their element types need not be homogeneous.
830
+ OrtCustomOp::GetVariadicInputMinArity = [](const OrtCustomOp*) { return 0; };
846
+ OrtCustomOp::GetVariadicInputHomogeneity = [](const OrtCustomOp*) { return 0; };
847
+ OrtCustomOp::GetVariadicOutputMinArity = [](const OrtCustomOp*) { return 0; };
848
+ OrtCustomOp::GetVariadicOutputHomogeneity = [](const OrtCustomOp*) { return 0; };
849
+
850
+ OrtCustomOp::CreateKernelV2 = {};
851
+ OrtCustomOp::KernelComputeV2 = {};
852
+ OrtCustomOp::KernelCompute = {};
853
+
854
+ OrtCustomOp::InferOutputShapeFn = {};
855
+
856
+ OrtCustomOp::GetStartVersion = [](const OrtCustomOp* op) {
857
+ auto self = reinterpret_cast<const OrtLiteCustomOp*>(op);
858
+ return self->start_ver_;
859
+ };
860
+
861
+ OrtCustomOp::GetEndVersion = [](const OrtCustomOp* op) {
862
+ auto self = reinterpret_cast<const OrtLiteCustomOp*>(op);
863
+ return self->end_ver_;
864
+ };
865
+
866
+ OrtCustomOp::GetMayInplace = {};
867
+ OrtCustomOp::ReleaseMayInplace = {};
868
+ OrtCustomOp::GetAliasMap = {};
869
+ OrtCustomOp::ReleaseAliasMap = {};
870
+ }
871
+
872
+ const std::string op_name_;
873
+ const std::string execution_provider_;
874
+
875
+ std::vector<ONNXTensorElementDataType> input_types_;
876
+ std::vector<ONNXTensorElementDataType> output_types_;
877
+
878
+ ShapeInferFn shape_infer_fn_ = {};
879
+
880
+ int start_ver_ = 1;
881
+ int end_ver_ = MAX_CUSTOM_OP_END_VER;
882
+
883
+ void* compute_fn_ = {};
884
+ void* compute_fn_return_status_ = {};
885
+ };
886
+
887
+ //////////////////////////// OrtLiteCustomFunc ////////////////////////////////
888
+ // This struct implements "function as op": a standalone compute function is wrapped as a custom op.
889
+ // E.g. a function might be defined as:
890
+ // void Filter(const Ort::Custom::Tensor<float>& floats_in, Ort::Custom::Tensor<float>& floats_out) { ... }
891
+ // It could be registered this way:
892
+ // Ort::CustomOpDomain v2_domain{"v2"};
893
+ // std::unique_ptr<OrtLiteCustomOp> fil_op_ptr{Ort::Custom::CreateLiteCustomOp("Filter", "CPUExecutionProvider", Filter)};
894
+ // v2_domain.Add(fil_op_ptr.get());
895
+ // session_options.Add(v2_domain);
896
+ // For the complete example, please search keyword "LiteCustomOpTest" under "<cloned_src_dir>/onnxruntime/test/".
897
+ template <typename... Args>
898
+ struct OrtLiteCustomFunc : public OrtLiteCustomOp {
899
+ using ComputeFn = void (*)(Args...);
900
+ using ComputeFnReturnStatus = Status (*)(Args...);
901
+ using MyType = OrtLiteCustomFunc<Args...>;
902
+
903
+ struct Kernel {
904
+ size_t num_input_{};
905
+ size_t num_output_{};
906
+ ComputeFn compute_fn_{};
907
+ ComputeFnReturnStatus compute_fn_return_status_{};
908
+ std::string ep_{};
909
+ };
910
+
911
+ OrtLiteCustomFunc(const char* op_name,
912
+ const char* execution_provider,
913
+ ComputeFn compute_fn,
914
+ ShapeInferFn shape_infer_fn = {},
915
+ int start_ver = 1,
916
+ int end_ver = MAX_CUSTOM_OP_END_VER) : OrtLiteCustomOp(op_name, execution_provider, shape_infer_fn, start_ver, end_ver) {
917
+ compute_fn_ = reinterpret_cast<void*>(compute_fn);
918
+ ParseArgs<Args...>(input_types_, output_types_);
919
+
920
+ OrtCustomOp::KernelCompute = [](void* op_kernel, OrtKernelContext* context) {
921
+ auto kernel = reinterpret_cast<Kernel*>(op_kernel);
922
+ std::vector<ArgPtr> args;
923
+ auto t = CreateTuple<0, 0, Args...>(context, args, kernel->num_input_, kernel->num_output_, kernel->ep_);
924
+ std::apply([kernel](Args const&... t_args) { kernel->compute_fn_(t_args...); }, t);
925
+ };
926
+
927
+ OrtCustomOp::CreateKernel = [](const OrtCustomOp* this_, const OrtApi* ort_api, const OrtKernelInfo* info) {
928
+ auto kernel = std::make_unique<Kernel>();
929
+ auto me = static_cast<const MyType*>(this_);
930
+ kernel->compute_fn_ = reinterpret_cast<ComputeFn>(me->compute_fn_);
931
+ Ort::ThrowOnError(ort_api->KernelInfo_GetInputCount(info, &kernel->num_input_));
932
+ Ort::ThrowOnError(ort_api->KernelInfo_GetOutputCount(info, &kernel->num_output_));
933
+ auto self = static_cast<const OrtLiteCustomFunc*>(this_);
934
+ kernel->ep_ = self->execution_provider_;
935
+ return reinterpret_cast<void*>(kernel.release());
936
+ };
937
+
938
+ OrtCustomOp::KernelDestroy = [](void* op_kernel) {
939
+ delete reinterpret_cast<Kernel*>(op_kernel);
940
+ };
941
+
942
+ if (shape_infer_fn_) {
943
+ OrtCustomOp::InferOutputShapeFn = [](const OrtCustomOp* op, OrtShapeInferContext* ort_ctx) -> OrtStatusPtr {
944
+ auto shape_info_fn = static_cast<const MyType*>(op)->shape_infer_fn_;
945
+ ShapeInferContext ctx(&GetApi(), ort_ctx);
946
+ return shape_info_fn(ctx);
947
+ };
948
+ }
949
+ }
950
+
951
+ OrtLiteCustomFunc(const char* op_name,
952
+ const char* execution_provider,
953
+ ComputeFnReturnStatus compute_fn_return_status,
954
+ ShapeInferFn shape_infer_fn = {},
955
+ int start_ver = 1,
956
+ int end_ver = MAX_CUSTOM_OP_END_VER) : OrtLiteCustomOp(op_name, execution_provider, shape_infer_fn, start_ver, end_ver) {
957
+ compute_fn_return_status_ = reinterpret_cast<void*>(compute_fn_return_status);
958
+ ParseArgs<Args...>(input_types_, output_types_);
959
+
960
+ OrtCustomOp::KernelComputeV2 = [](void* op_kernel, OrtKernelContext* context) -> OrtStatusPtr {
961
+ auto kernel = reinterpret_cast<Kernel*>(op_kernel);
962
+ std::vector<ArgPtr> args;
963
+ auto t = CreateTuple<0, 0, Args...>(context, args, kernel->num_input_, kernel->num_output_, kernel->ep_);
964
+ return std::apply([kernel](Args const&... t_args) { Status status = kernel->compute_fn_return_status_(t_args...); return status.release(); }, t);
965
+ };
966
+
967
+ OrtCustomOp::CreateKernel = [](const OrtCustomOp* this_, const OrtApi* ort_api, const OrtKernelInfo* info) {
968
+ auto kernel = std::make_unique<Kernel>();
969
+ auto me = static_cast<const MyType*>(this_);
970
+ kernel->compute_fn_return_status_ = reinterpret_cast<ComputeFnReturnStatus>(me->compute_fn_return_status_);
971
+ Ort::ThrowOnError(ort_api->KernelInfo_GetInputCount(info, &kernel->num_input_));
972
+ Ort::ThrowOnError(ort_api->KernelInfo_GetOutputCount(info, &kernel->num_output_));
973
+ auto self = static_cast<const OrtLiteCustomFunc*>(this_);
974
+ kernel->ep_ = self->execution_provider_;
975
+ return reinterpret_cast<void*>(kernel.release());
976
+ };
977
+
978
+ OrtCustomOp::KernelDestroy = [](void* op_kernel) {
979
+ delete reinterpret_cast<Kernel*>(op_kernel);
980
+ };
981
+
982
+ if (shape_infer_fn_) {
983
+ OrtCustomOp::InferOutputShapeFn = [](const OrtCustomOp* op, OrtShapeInferContext* ort_ctx) -> OrtStatusPtr {
984
+ auto shape_info_fn = static_cast<const MyType*>(op)->shape_infer_fn_;
985
+ ShapeInferContext ctx(&GetApi(), ort_ctx);
986
+ return shape_info_fn(ctx);
987
+ };
988
+ }
989
+ }
990
+ }; // struct OrtLiteCustomFunc
991
+
992
+ /////////////////////////// OrtLiteCustomStruct ///////////////////////////
993
+ // This struct implements "struct as op": a user-defined struct with a Compute method is wrapped as a custom op.
994
+ // E.g. a struct might be defined as:
995
+ // struct Merge {
996
+ // Merge(const OrtApi* ort_api, const OrtKernelInfo* info) {...}
997
+ // void Compute(const Ort::Custom::Tensor<std::string_view>& strings_in,
998
+ // std::string_view string_in,
999
+ // Ort::Custom::Tensor<std::string>* strings_out) {...}
1000
+ // bool reverse_ = false;
1001
+ // };
1002
+ // It could be registered this way:
1003
+ // Ort::CustomOpDomain v2_domain{"v2"};
1004
+ // std::unique_ptr<OrtLiteCustomOp> mrg_op_ptr{Ort::Custom::CreateLiteCustomOp<Merge>("Merge", "CPUExecutionProvider")};
1005
+ // v2_domain.Add(mrg_op_ptr.get());
1006
+ // session_options.Add(v2_domain);
1007
+ // For the complete example, please search keyword "LiteCustomOpTest" under "<cloned_src_dir>/onnxruntime/test/".
1008
+ template <typename CustomOp>
1009
+ struct OrtLiteCustomStruct : public OrtLiteCustomOp {
1010
+ template <typename... Args>
1011
+ using CustomComputeFn = void (CustomOp::*)(Args...);
1012
+
1013
+ template <typename... Args>
1014
+ using CustomComputeFnReturnStatus = Status (CustomOp::*)(Args...);
1015
+
1016
+ using MyType = OrtLiteCustomStruct<CustomOp>;
1017
+
1018
+ struct Kernel {
1019
+ size_t num_input_{};
1020
+ size_t num_output_{};
1021
+ std::unique_ptr<CustomOp> custom_op_;
1022
+ std::string ep_{};
1023
+ };
1024
+
1025
+ OrtLiteCustomStruct(const char* op_name,
1026
+ const char* execution_provider,
1027
+ int start_ver = 1,
1028
+ int end_ver = MAX_CUSTOM_OP_END_VER) : OrtLiteCustomOp(op_name, execution_provider, {}, start_ver, end_ver) {
1029
+ SetCompute(&CustomOp::Compute);
1030
+
1031
+ OrtCustomOp::CreateKernel = [](const OrtCustomOp* this_, const OrtApi* ort_api, const OrtKernelInfo* info) {
1032
+ auto kernel = std::make_unique<Kernel>();
1033
+ Ort::ThrowOnError(ort_api->KernelInfo_GetInputCount(info, &kernel->num_input_));
1034
+ Ort::ThrowOnError(ort_api->KernelInfo_GetOutputCount(info, &kernel->num_output_));
1035
+ kernel->custom_op_ = std::make_unique<CustomOp>(ort_api, info);
1036
+ auto self = static_cast<const OrtLiteCustomStruct*>(this_);
1037
+ kernel->ep_ = self->execution_provider_;
1038
+ return reinterpret_cast<void*>(kernel.release());
1039
+ };
1040
+
1041
+ OrtCustomOp::KernelDestroy = [](void* op_kernel) {
1042
+ delete reinterpret_cast<Kernel*>(op_kernel);
1043
+ };
1044
+
1045
+ SetShapeInfer<CustomOp>(0);
1046
+ }
1047
+
1048
+ template <typename... Args>
1049
+ void SetCompute(CustomComputeFn<Args...>) {
1050
+ ParseArgs<Args...>(input_types_, output_types_);
1051
+ OrtCustomOp::KernelCompute = [](void* op_kernel, OrtKernelContext* context) {
1052
+ auto kernel = reinterpret_cast<Kernel*>(op_kernel);
1053
+ ArgPtrs args;
1054
+ auto t = CreateTuple<0, 0, Args...>(context, args, kernel->num_input_, kernel->num_output_, kernel->ep_);
1055
+ std::apply([kernel](Args const&... t_args) { kernel->custom_op_->Compute(t_args...); }, t);
1056
+ };
1057
+ }
1058
+
1059
+ template <typename... Args>
1060
+ void SetCompute(CustomComputeFnReturnStatus<Args...>) {
1061
+ ParseArgs<Args...>(input_types_, output_types_);
1062
+ OrtCustomOp::KernelComputeV2 = [](void* op_kernel, OrtKernelContext* context) -> OrtStatusPtr {
1063
+ auto kernel = reinterpret_cast<Kernel*>(op_kernel);
1064
+ ArgPtrs args;
1065
+ auto t = CreateTuple<0, 0, Args...>(context, args, kernel->num_input_, kernel->num_output_, kernel->ep_);
1066
+ return std::apply([kernel](Args const&... t_args) { Status status = kernel->custom_op_->Compute(t_args...); return status.release(); }, t);
1067
+ };
1068
+ }
1069
+
1070
+ template <typename C>
1071
+ decltype(&C::InferOutputShape) SetShapeInfer(decltype(&C::InferOutputShape)) {
1072
+ OrtCustomOp::InferOutputShapeFn = [](const OrtCustomOp*, OrtShapeInferContext* ort_ctx) -> OrtStatusPtr {
1073
+ ShapeInferContext ctx(&GetApi(), ort_ctx);
1074
+ return C::InferOutputShape(ctx);
1075
+ };
1076
+ return {};
1077
+ }
1078
+
1079
+ template <typename C>
1080
+ void SetShapeInfer(...) {
1081
+ OrtCustomOp::InferOutputShapeFn = {};
1082
+ }
1083
+ }; // struct OrtLiteCustomStruct
1084
+
1085
+ /////////////////////////// CreateLiteCustomOp ////////////////////////////
1086
+
1087
+ template <typename... Args>
1088
+ OrtLiteCustomOp* CreateLiteCustomOp(const char* op_name,
1089
+ const char* execution_provider,
1090
+ void (*custom_compute_fn)(Args...),
1091
+ Status (*shape_infer_fn)(ShapeInferContext&) = {},
1092
+ int start_ver = 1,
1093
+ int end_ver = MAX_CUSTOM_OP_END_VER) {
1094
+ using LiteOp = OrtLiteCustomFunc<Args...>;
1095
+ return std::make_unique<LiteOp>(op_name, execution_provider, custom_compute_fn, shape_infer_fn, start_ver, end_ver).release();
1096
+ }
1097
+
1098
+ template <typename... Args>
1099
+ OrtLiteCustomOp* CreateLiteCustomOp(const char* op_name,
1100
+ const char* execution_provider,
1101
+ Status (*custom_compute_fn_v2)(Args...),
1102
+ Status (*shape_infer_fn)(ShapeInferContext&) = {},
1103
+ int start_ver = 1,
1104
+ int end_ver = MAX_CUSTOM_OP_END_VER) {
1105
+ using LiteOp = OrtLiteCustomFunc<Args...>;
1106
+ return std::make_unique<LiteOp>(op_name, execution_provider, custom_compute_fn_v2, shape_infer_fn, start_ver, end_ver).release();
1107
+ }
1108
+
1109
+ template <typename CustomOp>
1110
+ OrtLiteCustomOp* CreateLiteCustomOp(const char* op_name,
1111
+ const char* execution_provider,
1112
+ int start_ver = 1,
1113
+ int end_ver = MAX_CUSTOM_OP_END_VER) {
1114
+ using LiteOp = OrtLiteCustomStruct<CustomOp>;
1115
+ return std::make_unique<LiteOp>(op_name, execution_provider, start_ver, end_ver).release();
1116
+ }
1117
+
1118
+ } // namespace Custom
1119
+ } // namespace Ort
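
For reference, a minimal end-to-end sketch of the "function as op" flow described in the comments above. The Filter signature, the CreateLiteCustomOp call, and the domain registration come straight from those comments; the filtering rule (keep positive values) and the model file name are illustrative assumptions, not part of the header.

#include <algorithm>
#include <memory>
#include <vector>

#include "onnxruntime_lite_custom_op.h"

// Keep only the positive elements of the input; the rule itself is hypothetical,
// the header only fixes the signature.
void Filter(const Ort::Custom::Tensor<float>& floats_in,
            Ort::Custom::Tensor<float>& floats_out) {
  const float* in = floats_in.Data();
  std::vector<float> kept;
  for (int64_t i = 0; i < floats_in.NumberOfElement(); ++i) {
    if (in[i] > 0.f) kept.push_back(in[i]);
  }
  float* out = floats_out.Allocate({static_cast<int64_t>(kept.size())});
  std::copy(kept.begin(), kept.end(), out);
}

int main() {
  Ort::Env env;
  Ort::CustomOpDomain v2_domain{"v2"};
  std::unique_ptr<OrtLiteCustomOp> fil_op_ptr{
      Ort::Custom::CreateLiteCustomOp("Filter", "CPUExecutionProvider", Filter)};
  v2_domain.Add(fil_op_ptr.get());

  Ort::SessionOptions session_options;
  session_options.Add(v2_domain);
  // "filter_model.onnx" is a placeholder; the model must contain a node of type
  // "Filter" in the custom domain "v2".
  Ort::Session session(env, ORT_TSTR("filter_model.onnx"), session_options);
  // ... build input tensors and call session.Run(...) as usual ...
  return 0;
}

Since ORT keeps raw pointers to the registered op and domain, fil_op_ptr and v2_domain must outlive the session.
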
v1.23.1/headers/onnxruntime_run_options_config_keys.h ADDED
@@ -0,0 +1,54 @@
1
+ // Copyright (c) Microsoft Corporation. All rights reserved.
2
+ // Licensed under the MIT License.
3
+
4
+ #pragma once
5
+
6
+ /*
7
+ * This file defines RunOptions Config Keys and format of the Config Values.
8
+ *
9
+ * The Naming Convention for a RunOptions Config Key,
10
+ * "[Area][.[SubArea1].[SubArea2]...].[Keyname]"
11
+ * Such as "ep.cuda.use_arena"
12
+ * The Config Key cannot be empty
13
+ * The maximum length of the Config Key is 128
14
+ *
15
+ * The string format of a RunOptions Config Value is defined individually for each Config.
16
+ * The maximum length of the Config Value is 1024
17
+ */
18
+
19
+ // Key for enabling shrinkage of user-listed device memory arenas.
20
+ // Expects a semicolon-separated list of device:device_id pairs in the following format:
21
+ // "device_0:device_id_0;device_1:device_id_1"
22
+ // No whitespace is allowed in the list string.
23
+ // Currently, the only supported devices are "cpu" and "gpu" (case sensitive).
24
+ // If "cpu" is included in the list, the DisableCpuMemArena() API must not be called (i.e., the arena for cpu must be enabled).
25
+ // Example usage: "cpu:0;gpu:0" or "gpu:0"
26
+ // By default, the value for this key is empty (i.e., no memory arenas are shrunk).
27
+ static const char* const kOrtRunOptionsConfigEnableMemoryArenaShrinkage = "memory.enable_memory_arena_shrinkage";
28
+
29
+ // Set to '1' to skip synchronizing execution providers with the CPU at the end of a session run.
30
+ // By default it is set to '0'.
31
+ // Taking the CUDA EP as an example, this omits triggering cudaStreamSynchronize on the compute stream.
32
+ static const char* const kOrtRunOptionsConfigDisableSynchronizeExecutionProviders = "disable_synchronize_execution_providers";
33
+
34
+ // Set the HTP performance mode for the QNN HTP backend before session run.
35
+ // Options for HTP performance mode: "burst", "balanced", "default", "high_performance",
36
+ // "high_power_saver", "low_balanced", "extreme_power_saver", "low_power_saver", "power_saver",
37
+ // "sustained_high_performance". Defaults to "default".
38
+ static const char* const kOrtRunOptionsConfigQnnPerfMode = "qnn.htp_perf_mode";
39
+
40
+ // Set the HTP performance mode for the QNN HTP backend after session run.
41
+ static const char* const kOrtRunOptionsConfigQnnPerfModePostRun = "qnn.htp_perf_mode_post_run";
42
+
43
+ // Set RPC control latency for QNN HTP backend
44
+ static const char* const kOrtRunOptionsConfigQnnRpcControlLatency = "qnn.rpc_control_latency";
45
+
46
+ // Set the QNN LoRA config file used to apply LoRA within a QNN context binary.
47
+ static const char* const kOrtRunOptionsConfigQnnLoraConfig = "qnn.lora_config";
48
+
49
+ // Set the graph annotation id for the CUDA EP. Use with enable_cuda_graph=true.
50
+ // The value should be an integer. If the value is not set, the default value is 0 and
51
+ // the ORT session captures only one CUDA graph before another capture is requested.
52
+ // If the value is set to -1, CUDA graph capture/replay is disabled in that run.
53
+ // Users are not expected to set the value to 0, as it is reserved for internal use.
54
+ static const char* const kOrtRunOptionsConfigCudaGraphAnnotation = "gpu_graph_id";
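
As a usage sketch for the keys above (assuming the C++ API from onnxruntime_cxx_api.h, which wraps the AddRunConfigEntry C API), a config entry is attached to an Ort::RunOptions object that is then passed to Session::Run. The "cpu:0" value follows the shrinkage format documented for kOrtRunOptionsConfigEnableMemoryArenaShrinkage; the helper name is ours.

#include "onnxruntime_cxx_api.h"
#include "onnxruntime_run_options_config_keys.h"

// Request shrinkage of the CPU arena (device id 0) at the end of the run.
// Valid only if the CPU arena is enabled, i.e. DisableCpuMemArena() was not called.
Ort::RunOptions MakeRunOptionsWithArenaShrinkage() {
  Ort::RunOptions run_options;
  run_options.AddConfigEntry(kOrtRunOptionsConfigEnableMemoryArenaShrinkage, "cpu:0");
  return run_options;
}

The returned object is passed as the first argument of Ort::Session::Run, so the shrinkage applies to that run only.
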
v1.23.1/headers/onnxruntime_session_options_config_keys.h ADDED
@@ -0,0 +1,417 @@
1
+ // Copyright (c) Microsoft Corporation. All rights reserved.
2
+ // Licensed under the MIT License.
3
+
4
+ #pragma once
5
+
6
+ /*
7
+ * This file defines SessionOptions Config Keys and format of the Config Values.
8
+ *
9
+ * The Naming Convention for a SessionOptions Config Key,
10
+ * "[Area][.[SubArea1].[SubArea2]...].[Keyname]"
11
+ * Such as "ep.cuda.use_arena"
12
+ * The Config Key cannot be empty
13
+ * The maximum length of the Config Key is 1024
14
+ *
15
+ * The string format of a SessionOptions Config Value is defined individually for each Config.
16
+ * The maximum length of the Config Value is 2048
17
+ */
18
+
19
+ // Key for disable PrePacking,
20
+ // If the config value is set to "1" then the prepacking is disabled, otherwise prepacking is enabled (default value)
21
+ static const char* const kOrtSessionOptionsConfigDisablePrepacking = "session.disable_prepacking";
22
+
23
+ // A value of "1" means allocators registered in the env will be used. "0" means the allocators created in the session
24
+ // will be used. Use this to override the usage of env allocators on a per session level.
25
+ static const char* const kOrtSessionOptionsConfigUseEnvAllocators = "session.use_env_allocators";
26
+
27
+ // Set to 'ORT' (case sensitive) to load an ORT format model.
28
+ // If unset, the model type will default to ONNX unless it is inferred to be ORT from the filename ('.ort' == ORT format) or from the bytes.
29
+ static const char* const kOrtSessionOptionsConfigLoadModelFormat = "session.load_model_format";
30
+
31
+ // Set to 'ORT' (case sensitive) to save optimized model in ORT format when SessionOptions.optimized_model_path is set.
32
+ // If unset, format will default to ONNX unless optimized_model_filepath ends in '.ort'.
33
+ static const char* const kOrtSessionOptionsConfigSaveModelFormat = "session.save_model_format";
34
+
35
+ // If the value is "1", flush-to-zero and denormal-as-zero are applied. The default is "0".
36
+ // When multiple sessions are created, the main thread doesn't override changes from succeeding session options,
37
+ // but threads in session thread pools do follow option changes.
38
+ // When ORT runs with OpenMP, the same rule applies, i.e. the first session option to set flush-to-zero and
39
+ // denormal-as-zero is applied only to the global OpenMP thread pool, as there is no per-session thread pool.
40
+ // Note that an alternative that avoids this runtime option is to train and export the model without denormals;
41
+ // that is recommended, because turning this option on may hurt model accuracy.
42
+ static const char* const kOrtSessionOptionsConfigSetDenormalAsZero = "session.set_denormal_as_zero";
43
+
44
+ // It controls whether to run a quantized model in QDQ (QuantizeLinear/DequantizeLinear) format or not.
45
+ // "0": enable. ORT applies fusion logic for the QDQ format.
46
+ // "1": disable. ORT doesn't apply fusion logic for the QDQ format.
47
+ // Its default value is "0" unless the DirectML execution provider is registered, in which case it defaults to "1".
48
+ static const char* const kOrtSessionOptionsDisableQuantQDQ = "session.disable_quant_qdq";
49
+
50
+ // It controls whether to enable the Double QDQ remover and Identical Children Consolidation.
51
+ // "0": don't disable. ORT removes the middle two nodes from Q->(DQ->Q)->DQ pairs.
52
+ // "1": disable. ORT doesn't remove the middle two nodes from Q->(DQ->Q)->DQ pairs.
53
+ // Its default value is "0"
54
+ static const char* const kOrtSessionOptionsDisableDoubleQDQRemover = "session.disable_double_qdq_remover";
55
+
56
+ // If set to "1", enables the removal of QuantizeLinear/DequantizeLinear node pairs once all QDQ handling has been
57
+ // completed. e.g. If after all QDQ handling has completed and we have -> FloatOp -> Q -> DQ -> FloatOp -> the
58
+ // Q -> DQ could potentially be removed. This will provide a performance benefit by avoiding going from float to
59
+ // 8-bit and back to float, but could impact accuracy. The impact on accuracy will be model specific and depend on
60
+ // other factors like whether the model was created using Quantization Aware Training or Post Training Quantization.
61
+ // As such, it's best to test to determine if enabling this works well for your scenario.
62
+ // The default value is "0"
63
+ // Available since version 1.11.
64
+ static const char* const kOrtSessionOptionsEnableQuantQDQCleanup = "session.enable_quant_qdq_cleanup";
65
+
66
+ // Enable or disable gelu approximation in graph optimization. "0": disable; "1": enable. The default is "0".
67
+ // GeluApproximation has side effects which may change the inference results. It is disabled by default due to this.
68
+ static const char* const kOrtSessionOptionsEnableGeluApproximation = "optimization.enable_gelu_approximation";
69
+
70
+ // Enable or disable Cast chain elimination in graph optimization. "0": disable; "1": enable. The default is "0".
71
+ // CastElimination with chain elimination has side effects which may change the inference results. It is disabled by default due to this.
72
+ static const char* const kOrtSessionOptionsEnableCastChainElimination = "optimization.enable_cast_chain_elimination";
73
+
74
+ // This setting controls whether to enable AheadOfTime function inlining.
75
+ // AOT function inlining examines the graph and attempts to inline as many locally defined functions in the model
76
+ // as possible with the help of enabled execution providers.
77
+ // This can reduce the number of function calls and improve performance because it is done before
78
+ // Level1 optimizers and constant folding. However, under some circumstances, when the EPs are not available,
79
+ // one can disable the AOT inlining, produce an optimized model and postpone AOT until run time.
80
+ // "0": enable; "1": disable.
81
+ // Its default value is "0".
82
+ static const char* const kOrtSessionOptionsDisableAheadOfTimeFunctionInlining = "session.disable_aot_function_inlining";
83
+
84
+ #ifdef ENABLE_TRAINING
85
+ // Specifies a path of the file containing a list of memory optimization configurations.
86
+ // The value should be a string indicating the file path of the config file.
87
+ // The content of the config file is a JSON struct like this:
88
+ // [
89
+ // "Gelu+Cast+:1:0",
90
+ // "Dropout+:1:1"
91
+ // ]
92
+ // Taking the example of "Gelu+Cast+:1:0",
93
+ // > "Gelu+Cast+" is the subgraph string, a valid "subgraph string" should be one subgraph representation
94
+ // output by ORT graph transformations.
95
+ // > "1" is "optimization strategy", valid values: 0 - disabled, 1 - recompute.
96
+ // > "0" is "number of subgraph to apply" which is used to control how many subgraphs to apply optimization,
97
+ // to avoid "oversaving" the memory.
98
+ static const char* const kOrtSessionOptionsMemoryOptimizerApplyConfig = "optimization.memory_optimizer_config";
99
+
100
+ // Specifies the config for detecting subgraphs for memory footprint reduction.
101
+ // The value should be a string containing integers separated by commas. The default value is "0:0".
102
+ static const char* const kOrtSessionOptionsMemoryOptimizerProbeConfig = "optimization.enable_memory_probe_recompute_config";
103
+ #endif
104
+
105
+ // This setting, if set, should contain a comma-separated list of optimizer names that should be disabled.
106
+ // Optimizers may take time to execute and affect model loading time. If you feel that a specific optimizer
107
+ // does not provide runtime benefits but affects your model loading time, you may disable it using this config
108
+ // entry. This option is not enabled in an ORT_MINIMAL_BUILD build.
109
+ // A list of optimizers is available in onnxruntime/core/optimizer/graph_transformer_utils.cc
110
+ //
111
+ // Default is an empty string which means no optimizers are disabled.
112
+ static const char* const kOrtSessionOptionsDisableSpecifiedOptimizers = "optimization.disable_specified_optimizers";
113
+
114
+ // It controls whether to run graph optimizations in a loop or not.
115
+ //
116
+ // "0": disable. Graph Optimization Loop is disabled.
117
+ // ```
118
+ // Level 2 --> Level 3 --> InsertCastTransforms --> Level 4
119
+ // ^ |
120
+ // | "No Loop" |
121
+ // | |
122
+ // X xxxxxxxxxxx X
123
+ // ```
124
+ // "1": enable. Graph Optimization Loop is enabled, such that, if optimizations at Level 4 are applied then
125
+ // the loop will check for any other valid optimization that can happen.
126
+ // ```
127
+ // Level 2 --> Level 3 --> InsertCastTransforms --> Level 4
128
+ // ^ |
129
+ // | "Loop only depending on Level 4" |
130
+ // | |
131
+ // ---------------------------------------------------
132
+ // ```
133
+ // "2": enable. Graph Optimization Loop is enabled, such that, if optimizations at Level 2 or above are applied then
134
+ // the loop will check for any other valid optimization that can happen.
135
+ // ```
136
+ // Level 2 --> Level 3 --> InsertCastTransforms --> Level 4
137
+ // ^ |
138
+ // | "Loop" |
139
+ // | |
140
+ // ---------------------------------------------------
141
+ // ```
142
+ // Default value is set to "1".
143
+ static const char* const kOrtSessionOptionsGraphOptimizationsLoopLevel = "session.graph_optimizations_loop_level";
144
+
145
+ // Enable or disable using device allocator for allocating initialized tensor memory. "1": enable; "0": disable. The default is "0".
146
+ // Using device allocators means the memory allocation is made using malloc/new.
147
+ static const char* const kOrtSessionOptionsUseDeviceAllocatorForInitializers = "session.use_device_allocator_for_initializers";
148
+
149
+ // Configure whether to allow the inter_op/intra_op threads to spin a number of times before blocking.
150
+ // "0": thread will block if found no job to run
151
+ // "1": thread will spin a number of times before blocking
152
+ // The default is "0" when ORT is built with "ORT_CLIENT_PACKAGE_BUILD" and "1" otherwise.
153
+ // Thread spinning is disabled by default for client/on-device workloads to reduce cpu utilization and improve power efficiency.
154
+ static const char* const kOrtSessionOptionsConfigAllowInterOpSpinning = "session.inter_op.allow_spinning";
155
+ static const char* const kOrtSessionOptionsConfigAllowIntraOpSpinning = "session.intra_op.allow_spinning";
156
+
157
+ // Key for using model bytes directly for ORT format.
158
+ // If a session is created using an input byte array that contains the ORT format model data,
159
+ // by default we will copy the model bytes at the time of session creation to ensure the model bytes
160
+ // buffer is valid.
161
+ // Setting this option to "1" disables copying the model bytes and uses them directly. The caller
162
+ // has to guarantee that the model bytes are valid until the ORT session using the model bytes is destroyed.
163
+ static const char* const kOrtSessionOptionsConfigUseORTModelBytesDirectly = "session.use_ort_model_bytes_directly";
164
+
165
+ /// <summary>
166
+ /// Key for using the ORT format model flatbuffer bytes directly for initializers.
167
+ /// This avoids copying the bytes and reduces peak memory usage during model loading and initialization.
168
+ /// Requires `session.use_ort_model_bytes_directly` to be true.
169
+ /// If set, the flatbuffer bytes provided when creating the InferenceSession MUST remain valid for the entire
170
+ /// duration of the InferenceSession.
171
+ /// </summary>
172
+ static const char* const kOrtSessionOptionsConfigUseORTModelBytesForInitializers =
173
+ "session.use_ort_model_bytes_for_initializers";
174
+
175
+ // This should only be specified when exporting an ORT format model for use on a different platform.
176
+ // If the ORT format model will be used on ARM platforms set to "1". For other platforms set to "0"
177
+ // Available since version 1.11.
178
+ static const char* const kOrtSessionOptionsQDQIsInt8Allowed = "session.qdqisint8allowed";
179
+
180
+ // x64 SSE4.1/AVX2/AVX512 (with no VNNI) has an overflow problem with quantized matrix multiplication with U8S8.
181
+ // To avoid this we need to use the slower U8U8 matrix multiplication instead. This option, if
182
+ // turned on, uses the slower U8U8 matrix multiplications. Only effective on AVX2 or AVX512
183
+ // platforms.
184
+ static const char* const kOrtSessionOptionsAvx2PrecisionMode = "session.x64quantprecision";
185
+
186
+ // Specifies how minimal build graph optimizations are handled in a full build.
187
+ // These optimizations are at the extended level or higher.
188
+ // Possible values and their effects are:
189
+ // "save": Save runtime optimizations when saving an ORT format model.
190
+ // "apply": Only apply optimizations available in a minimal build.
191
+ // ""/<unspecified>: Apply optimizations available in a full build.
192
+ // Available since version 1.11.
193
+ static const char* const kOrtSessionOptionsConfigMinimalBuildOptimizations =
194
+ "optimization.minimal_build_optimizations";
195
+
196
+ // Note: The options specific to an EP should be specified prior to appending that EP to the session options object in
197
+ // order for them to take effect.
198
+
199
+ // Specifies a list of stop op types. Nodes of a type in the stop op types and nodes downstream from them will not be
200
+ // run by the NNAPI EP.
201
+ // The value should be a ","-delimited list of op types. For example, "Add,Sub".
202
+ // If not specified, the default set of stop ops is used. To specify an empty stop ops types list and disable stop op
203
+ // exclusion, set the value to "".
204
+ static const char* const kOrtSessionOptionsConfigNnapiEpPartitioningStopOps = "ep.nnapi.partitioning_stop_ops";
205
+
206
+ // Enables dynamic block-sizing for multithreading.
207
+ // With a positive value, the thread pool will split a task of N iterations into blocks with size starting from:
208
+ // N / (num_of_threads * dynamic_block_base)
209
+ // As execution progresses, the size will decrease according to the diminishing residual of N,
210
+ // meaning the task will be distributed in smaller granularity for better parallelism.
211
+ // For some models, it helps to reduce the variance of E2E inference latency and boost performance.
212
+ // The feature is off by default; specify any positive integer, e.g. "4", to enable it.
213
+ // Available since version 1.11.
214
+ static const char* const kOrtSessionOptionsConfigDynamicBlockBase = "session.dynamic_block_base";
215
+
216
+ // This option makes it possible to decrease CPU usage between infrequent
217
+ // requests: it forces any spinning thread-pool threads to stop immediately when the last
218
+ // concurrent Run() call returns.
219
+ // Spinning is restarted on the next Run() call.
220
+ // Applies only to internal thread pools.
221
+ static const char* const kOrtSessionOptionsConfigForceSpinningStop = "session.force_spinning_stop";
222
+
223
+ // "1": all inconsistencies encountered during shape and type inference
224
+ // will result in failures.
225
+ // "0": in some cases warnings will be logged but processing will continue. The default.
226
+ // May be useful to expose bugs in models.
227
+ static const char* const kOrtSessionOptionsConfigStrictShapeTypeInference = "session.strict_shape_type_inference";
228
+
229
+ // "1": every model using a more recent opset than the latest released one will fail
230
+ // "0": the model may or may not work if onnxruntime cannot find an implementation, this option
231
+ // is used for development purpose.
232
+ static const char* const kOrtSessionOptionsConfigStrictAllowReleasedOpsetsOnly = "session.allow_released_opsets_only";
233
+
234
+ // The file saves the configuration for partitioning nodes among logic streams.
235
+ static const char* const kNodePartitionConfigFile = "session.node_partition_config_file";
236
+
237
+ // This option allows setting affinities for intra-op threads.
238
+ // The affinity string follows the format:
239
+ // logical_processor_id,logical_processor_id;logical_processor_id,logical_processor_id
240
+ // Semicolons separate per-thread configurations, while commas separate the processors the i-th thread is expected to attach to.
241
+ // e.g. 1,2,3;4,5
242
+ // specifies affinities for two threads, with the 1st thread attached to the 1st, 2nd, and 3rd processors, and the 2nd thread to the 4th and 5th.
243
+ // To ease the configuration, an "interval" is also allowed:
244
+ // e.g. 1-8;8-16;17-24
245
+ // specifies that the 1st thread runs on the first eight processors, the 2nd thread runs on the next eight processors, and so forth.
246
+ // Note:
247
+ // 1. Once set, the number of thread affinities must equal intra_op_num_threads - 1; ort does not set an affinity on the main thread, which
248
+ // is started and managed by the calling app;
249
+ // 2. For Windows, ort will infer the group id from a logical processor id; for example, assuming there are two groups, each with 64 logical processors,
250
+ // an id of 64 will be inferred as the last processor of the 1st group, while 65 will be interpreted as the 1st processor of the second group.
251
+ // Hence 64-65 is an invalid configuration, because a Windows thread cannot be attached to processors across a group boundary.
252
+ static const char* const kOrtSessionOptionsConfigIntraOpThreadAffinities = "session.intra_op_thread_affinities";
253
+
254
+ // This option will dump out the model to assist debugging any issues with layout transformation,
255
+ // and is primarily intended for developer usage. It is only relevant if an execution provider that requests
256
+ // NHWC layout is enabled such as NNAPI, XNNPACK or QNN.
257
+ //
258
+ // Default is off. Set to "1" to enable.
259
+ //
260
+ // If modified by layout transformation, the model will be dumped after these steps:
261
+ // 1) insertion of the layout transformation Transpose nodes,
262
+ // 2) optimization of those Transpose nodes using the transpose optimizer,
263
+ // 3) application of the L1 transformers to the updated graph.
264
+ // The model will be saved to filename post_layout_transform_step_<step_number>.onnx.
265
+ static const char* const kDebugLayoutTransformation = "session.debug_layout_transformation";
266
+
267
+ // Graph nodes that are not supported by the execution providers (EPs) explicitly added to the session are
268
+ // assigned (i.e., "fallback") to the CPU EP by default.
269
+ //
270
+ // This option allows the user to disable the fallback of unsupported graph nodes to the CPU EP.
271
+ // If this option is set to "1", session creation will fail if the execution providers other than the CPU EP cannot
272
+ // fully support all of the nodes in the graph.
273
+ //
274
+ // It is invalid to set this option and explicitly add the CPU EP to the session. In this case, session creation
275
+ // will also fail with an error.
276
+ //
277
+ // Option values:
278
+ // - "0": CPU EP fallback is not disabled. [DEFAULT]
279
+ // - "1": CPU EP fallback is disabled.
280
+ static const char* const kOrtSessionOptionsDisableCPUEPFallback = "session.disable_cpu_ep_fallback";
281
+
282
+ // Use this config when serializing a large model after optimization to specify an external initializers file
283
+ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFileName =
284
+ "session.optimized_model_external_initializers_file_name";
285
+
286
+ // Use this config to control the minimum size of the initializer when externalizing it during serialization
287
+ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
288
+ "session.optimized_model_external_initializers_min_size_in_bytes";
289
+
290
+ // When loading a model from a memory buffer and the model has external initializers,
291
+ // use this config to set the external data file folder path.
292
+ // All external data files should be in the same folder.
293
+ static const char* const kOrtSessionOptionsModelExternalInitializersFileFolderPath =
294
+ "session.model_external_initializers_file_folder_path";
295
+
296
+ // Use this config when saving pre-packed constant initializers to an external data file.
297
+ // This allows you to memory-map pre-packed initializers on model load and leave it
298
+ // to the OS to decide how much memory is consumed by the pre-packed initializers. Otherwise,
299
+ // pre-packed data resides on the heap.
300
+ //
301
+ // - "0": Do not save pre-packed initializers to a data file. [DEFAULT]
302
+ // - "1": Save pre-packed constant initializers to an external data file.
303
+ // Sample usage: sess_options.add_session_config_entry(kOrtSessionOptionsSavePrePackedConstantInitializers, "1")
304
+ static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers =
305
+ "session.save_external_prepacked_constant_initializers";
306
+
307
+ // Use this config when you want to collect memory stats for each node in the graph.
309
+ // The file format is CSV with the following columns:
310
+ // node_name, initializers_memory, dynamic_outputs_sizes, temp_allocations_size
311
+ // The file will be created if it does not exist, and will be overwritten if it does.
312
+ //
313
+ // The content of the file can be used to estimate memory requirements at run time including
314
+ // the temporary allocations. This operation is preferably done on a CPU device, as the model may exceed
315
+ // device memory limits in constrained environments. When enabling this option, it is important to disable
316
+ // memory patterns, as they tend to allocate large blocks to avoid fragmentation and accommodate the needs of multiple
317
+ // kernels. Memory patterns may make it difficult to allocate on a device with limited memory.
318
+ //
319
+ // The collected stats can then be used to partition the graph among the devices in a way that only the
320
+ // required memory is allocated on each device.
321
+ //
322
+ // - "full path to file": there is no default for this option. If the file cannot be opened for writing, an error will be returned.
323
+ static const char* const kOrtSessionOptionsCollectNodeMemoryStatsToFile = "session.collect_node_memory_stats_to_file";
324
+
325
+ /// This is a composite CSV setting formatted as "memory limit in kb,file name for collected stats"
326
+ /// "limit > 0": enables Capacity Aware Partitioning for Cuda EP. `limit` is optional and when absent
327
+ /// the provider may attempt to figure out the memory available automatically.
328
+ /// The setting with no limit is expected to look like: ",file name for collected stats"
329
+ /// The EP will place nodes on the device according to "file name":
330
+ /// this file is expected to be found in the same folder as the model. The file contains
331
+ /// pre-recorded stats collected when running with kOrtSessionOptionsCollectNodeMemoryStatsToFile enabled (see above).
332
+ static const char* const kOrtSessionOptionsResourceCudaPartitioningSettings =
333
+ "session.resource_cuda_partitioning_settings";
334
+
335
+ // Enable EP context feature to dump the partitioned graph which includes the EP context into Onnx file.
336
+ // The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead.
337
+ // "0": disable. (default)
338
+ // "1": enable.
339
+ static const char* const kOrtSessionOptionEpContextEnable = "ep.context_enable";
340
+
341
+ // Specify the file path for the Onnx model which has EP context.
342
+ // Defaults to original_file_name_ctx.onnx if not specified.
343
+ // A folder is not a valid option.
344
+ static const char* const kOrtSessionOptionEpContextFilePath = "ep.context_file_path";
345
+
346
+ // Flag to specify whether to dump the EP context into the Onnx model.
347
+ // "0": dump the EP context into separate file, keep the file name in the Onnx model. (default).
348
+ // "1": dump the EP context into the Onnx model.
349
+ static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed_mode";
350
+
351
+ // Specify the EPContext node name prefix to make it unique
352
+ // in case the user needs to merge/connect multiple EPContext nodes in one model.
353
+ static const char* const kOrtSessionOptionEpContextNodeNamePrefix = "ep.context_node_name_prefix";
354
+
355
+ // Share EP related resources across sessions
356
+ static const char* const kOrtSessionOptionShareEpContexts = "ep.share_ep_contexts";
357
+
358
+ // Stop sharing EP-related resources across sessions from then on.
359
+ static const char* const kOrtSessionOptionStopShareEpContexts = "ep.stop_share_ep_contexts";
360
+
361
+ // Used only for context model generation.
362
+ // This configuration is used when some nodes are partitioned on the CPU EP and those nodes have external initializers.
363
+ // When generating the EP context model, the new model should not rely on the old external data file used by the source ONNX model.
364
+ // Use this setting when dumping the EP context model with an external initializers file.
365
+ // If specified, all initializers will be placed inside the external data file.
366
+ // Otherwise, all initializers will be embedded inside the generated ONNX file.
367
+ // By default, this option is not set, meaning all initializers will be included within the ONNX file.
368
+ static const char* const kOrtSessionOptionsEpContextModelExternalInitializersFileName =
369
+ "ep.context_model_external_initializers_file_name";
370
+
371
+ // Gemm fastmath mode provides fp32 gemm acceleration with bfloat16 based matmul.
372
+ // Option values:
373
+ // - "0": Gemm FastMath mode is not enabled. [DEFAULT]
374
+ // - "1": Gemm FastMath mode is enabled.
375
+ static const char* const kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 = "mlas.enable_gemm_fastmath_arm64_bfloat16";
376
+
377
+ // When converting DQ + MatMul -> MatMulNBits, the accuracy level of the MatMulNBits is controlled by this option.
378
+ // Refer to MatMulNBits op schema for more details.
379
+ // If not provided, default is 4.
380
+ static const char* const kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel = "session.qdq_matmulnbits_accuracy_level";
381
+
382
+ // THIS OPTION IS NOT A REGULAR SESSION OPTION SINCE IT CAN BE MODIFIED AT ANY TIME
383
+ // Meant to be used with SetEpDynamicOptions
384
+ // Specify the type of workload for this session.
385
+ // "Default": OS determines the scheduling priority and processor performance to service this workload. [Default]
386
+ // "Efficient": OS treats this workload is efficiency oriented with low scheduling priority and efficient processor performance.
387
+ static const char* const kOrtEpDynamicOptionsWorkloadType = "ep.dynamic.workload_type";
388
+
389
+ // Disables model compilation during session initialization.
390
+ //
391
+ // If this option is set to "1", inference session creation will fail with error code ORT_MODEL_REQUIRES_COMPILATION
392
+ // if compilation is required to run the model on any Execution Provider added to the session.
393
+ // Only the following kinds of models are valid when this option is set to "1":
394
+ // - Pre-compiled models that have EPContext nodes for the compiling Execution Providers in the session.
395
+ // - Non-compiled models that run only on non-compiling Execution Providers, like CPU EP.
396
+ //
397
+ // See https://onnxruntime.ai/docs/execution-providers/EP-Context-Design.html for details about
398
+ // compiled models with EPContext nodes.
399
+ //
400
+ // Option values:
401
+ // - "0": EP compile is not disabled. [DEFAULT]
402
+ // - "1": EP compile is disabled.
403
+ static const char* const kOrtSessionOptionsDisableModelCompile = "session.disable_model_compile";
404
+
405
+ // Controls behavior when compiled model compatibility is SUPPORTED_PREFER_RECOMPILATION.
406
+ // "0": Allow execution with suboptimal performance. [DEFAULT]
407
+ // "1": Fail session creation to require recompilation for optimal performance.
408
+ // Note: UNSUPPORTED models always fail regardless of this setting.
409
+ static const char* const kOrtSessionOptionsFailOnSuboptimalCompiledModel =
410
+ "session.fail_on_suboptimal_compiled_model";
411
+
412
+ // THIS OPTION IS NOT A REGULAR SESSION OPTION SINCE IT CAN BE MODIFIED AT ANY TIME
413
+ // Meant to be used with SetEpDynamicOptions
414
+ // options for HTP performance mode: "burst", "balanced", "default", "high_performance",
415
+ // "high_power_saver", "low_balanced", "extreme_power_saver", "low_power_saver", "power_saver",
416
+ // "sustained_high_performance". Default to "default".
417
+ static const char* const kOrtEpDynamicOptionsQnnHtpPerformanceMode = "ep.dynamic.qnn_htp_performance_mode";
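
A short sketch of how these keys are consumed (assuming the C++ API): each key is passed to Ort::SessionOptions::AddConfigEntry before the session is created. The thread-affinity string and thread count follow the format documented for kOrtSessionOptionsConfigIntraOpThreadAffinities above; the specific processor ids and the helper name are illustrative.

#include "onnxruntime_cxx_api.h"
#include "onnxruntime_session_options_config_keys.h"

Ort::SessionOptions MakeConfiguredSessionOptions() {
  Ort::SessionOptions so;
  // Fail session creation instead of silently assigning unsupported nodes to the
  // CPU EP (meaningful only when a non-CPU EP is also appended to `so`).
  so.AddConfigEntry(kOrtSessionOptionsDisableCPUEPFallback, "1");
  // Pin two worker threads (1st to processors 1,2,3; 2nd to 4,5). The number of
  // affinity entries must equal intra_op_num_threads - 1, hence three threads total.
  so.SetIntraOpNumThreads(3);
  so.AddConfigEntry(kOrtSessionOptionsConfigIntraOpThreadAffinities, "1,2,3;4,5");
  return so;
}

AddConfigEntry returns the SessionOptions reference, so the calls can also be chained.
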
v1.23.1/jni/arm64-v8a/libonnxruntime.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:629d677b15742620b008fdeafafcf825e6635de5e650f2728b98cc7c659fbbe5
3
+ size 19343456
v1.23.1/jni/arm64-v8a/libonnxruntime4j_jni.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ebb4db00b87e243a6e7f401e30cac6b4172a921a59ba44bb72a53eba55e6920
3
+ size 100648
v1.23.1/jni/armeabi-v7a/libonnxruntime.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c6a7a9d851346abaf9911e6fa1bbb4433ddeb0f5c9d17429cf59942707613bb
3
+ size 13988872
v1.23.1/jni/armeabi-v7a/libonnxruntime4j_jni.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0ce9f1da045c17666540e61d0bd0861d9a6fe9044486ea78c1c34d6a8f9ecc9
3
+ size 73680
v1.23.1/jni/x86/libonnxruntime.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a78e94433c3abb77a9c7ddf2b8b74e94f7af4e6073b3288a32d6d98a68c38ed
3
+ size 22757348
v1.23.1/jni/x86/libonnxruntime4j_jni.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55bd2fc2473495913e3be73b706b8e0bd4138b561c80cacda52e487b2f47e30e
3
+ size 84700
v1.23.1/jni/x86_64/libonnxruntime.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b830967d557bd5ecf90de8357ffac3d6ec8a27f70d48d2e954e1809b0612165
3
+ size 23176928
v1.23.1/jni/x86_64/libonnxruntime4j_jni.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfd2404d809a7339bd4ab7719af93c5786d003aff2ea1a1f312b6bc3b1d64366
3
+ size 90728