Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
666 changes: 666 additions & 0 deletions projects/rocr-runtime/rocrtst/suites/functional/counted_queues.cc

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Copyright © Advanced Micro Devices, Inc., or its affiliates.
*
* SPDX-License-Identifier: MIT
*/

#ifndef ROCRTST_SUITES_FUNCTIONAL_COUNTED_QUEUES_H
#define ROCRTST_SUITES_FUNCTIONAL_COUNTED_QUEUES_H


#include "suites/test_common/test_base.h"

// Functional test fixture for the counted-queue API. Each public *Test()
// method is one test case; main.cc invokes exactly one of them per gtest TEST,
// between RunCustomTestProlog() and RunCustomTestEpilog().
class CountedQueuesTest : public TestBase {
 public:
  explicit CountedQueuesTest();

  // @Brief: Destructor for test case of CountedQueuesTest
  virtual ~CountedQueuesTest();

  // @Brief: Set up the environment for the test
  virtual void SetUp();

  // @Brief: Core test execution
  virtual void Run();

  // @Brief: Clean up and retrieve the resource
  virtual void Close();

  // @Brief: Display results
  virtual void DisplayResults() const;

  // @Brief: Display information about what this test does
  virtual void DisplayTestInfo(void);

  // Individual test cases (called directly from the TEST bodies in main.cc).
  void CountedQueueBasicApiTest();
  void CountedQueues_SamePriority_MaxLimitTest();
  void InvalidArgsTest();
  void CountedQueuesAllPrioritiesLimitTest();
  void CountedQueuesSetPriorityNackTest();
  void CountedQueuesSetCUMaskNackTest();
  void CountedQueuesDispatchTest();
  void CountedQueuesMultithreadedDispatchTest();

 private:
  // Presumably the source/destination buffers used by the dispatch tests --
  // TODO confirm against counted_queues.cc. Defaulted to nullptr so they are
  // never read or freed while indeterminate, whatever the constructor does.
  void* src_buffer_ = nullptr;
  void* dst_buffer_ = nullptr;
};

#endif // ROCRTST_SUITES_FUNCTIONAL_COUNTED_QUEUES_H
57 changes: 57 additions & 0 deletions projects/rocr-runtime/rocrtst/suites/test_common/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
#include "suites/functional/cu_masking.h"
#include "amd_smi/amdsmi.h"
#include "common/common.h"
#include "suites/functional/counted_queues.h"

static RocrTstGlobals *sRocrtstGlvalues = nullptr;

Expand Down Expand Up @@ -480,6 +481,62 @@ TEST(rocrtstFunc, VirtMemory_Interprocess_Test) {
);
}

TEST(rocrtstFunc, Counted_Queue_Basic_Test) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do not need to fork.

TEST(rocrtstFunc, Counted_Queue_Basic_Test) {
CountedQueuesTest cq;
RunCustomTestProlog(&cq);
cq.testFunc();
RunCustomTestEpilog(&cq);
}

Same for the other tests below

CountedQueuesTest cq;
RunCustomTestProlog(&cq);
cq.CountedQueueBasicApiTest();
RunCustomTestEpilog(&cq);
}

// Runs CountedQueuesTest::CountedQueues_SamePriority_MaxLimitTest between the
// shared custom-test prolog and epilog.
TEST(rocrtstFunc, Counted_Queue_Same_Priority_Max_Limit_Test) {
  CountedQueuesTest counted_queues;
  RunCustomTestProlog(&counted_queues);
  counted_queues.CountedQueues_SamePriority_MaxLimitTest();
  RunCustomTestEpilog(&counted_queues);
}

// Runs CountedQueuesTest::InvalidArgsTest between the shared custom-test
// prolog and epilog.
TEST(rocrtstFunc, Counted_Queue_Invalid_Args_Test) {
  CountedQueuesTest counted_queues;
  RunCustomTestProlog(&counted_queues);
  counted_queues.InvalidArgsTest();
  RunCustomTestEpilog(&counted_queues);
}

// Runs CountedQueuesTest::CountedQueuesAllPrioritiesLimitTest between the
// shared custom-test prolog and epilog.
TEST(rocrtstFunc, Counted_Queue_Multiple_Priorities_Limit_Test) {
  CountedQueuesTest counted_queues;
  RunCustomTestProlog(&counted_queues);
  counted_queues.CountedQueuesAllPrioritiesLimitTest();
  RunCustomTestEpilog(&counted_queues);
}

// Runs CountedQueuesTest::CountedQueuesSetPriorityNackTest between the shared
// custom-test prolog and epilog.
TEST(rocrtstFunc, Counted_Queue_Set_Priority_Nack_Test) {
  CountedQueuesTest counted_queues;
  RunCustomTestProlog(&counted_queues);
  counted_queues.CountedQueuesSetPriorityNackTest();
  RunCustomTestEpilog(&counted_queues);
}

// Runs CountedQueuesTest::CountedQueuesSetCUMaskNackTest between the shared
// custom-test prolog and epilog.
TEST(rocrtstFunc, Counted_Queue_Set_CUMask_Nack_Test) {
  CountedQueuesTest counted_queues;
  RunCustomTestProlog(&counted_queues);
  counted_queues.CountedQueuesSetCUMaskNackTest();
  RunCustomTestEpilog(&counted_queues);
}

// Runs CountedQueuesTest::CountedQueuesDispatchTest between the shared
// custom-test prolog and epilog.
TEST(rocrtstFunc, Counted_Queue_Dispatch_Test) {
  CountedQueuesTest counted_queues;
  RunCustomTestProlog(&counted_queues);
  counted_queues.CountedQueuesDispatchTest();
  RunCustomTestEpilog(&counted_queues);
}

// Runs CountedQueuesTest::CountedQueuesMultithreadedDispatchTest between the
// shared custom-test prolog and epilog.
TEST(rocrtstFunc, Counted_Queue_Multithreaded_Dispatch_Test) {
  CountedQueuesTest counted_queues;
  RunCustomTestProlog(&counted_queues);
  counted_queues.CountedQueuesMultithreadedDispatchTest();
  RunCustomTestEpilog(&counted_queues);
}

TEST(rocrtstNeg, Memory_Negative_Tests) {
RUN_IF_NOT_EMU_MODE(
MemoryAllocateNegativeTest mt;
Expand Down
1 change: 0 additions & 1 deletion projects/rocr-runtime/rocrtst/suites/test_common/main.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,5 @@
test; \
}


#endif // ROCRTST_SUITES_TEST_COMMON_MAIN_H_

1 change: 1 addition & 0 deletions projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ set ( SRCS core/driver/driver.cpp
core/runtime/cache.cpp
core/runtime/svm_profiler.cpp
core/runtime/thunk_loader.cpp
core/runtime/counted_queue_manager.cpp
core/common/hsa_table_interface.cpp
loader/executable.cpp
libamdhsacode/amd_elf_image.cpp
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1350,6 +1350,18 @@ hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, voi
size_copied, status);
}

// Exported entry point for acquiring a counted queue. Performs no work of its
// own: every argument is handed, unchanged, to the implementation currently
// installed in the AMD extension dispatch table, and that status is returned.
hsa_status_t HSA_API hsa_amd_counted_queue_acquire(
    hsa_agent_t agent, hsa_queue_type_t type, hsa_amd_queue_priority_t priority,
    void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data,
    uint64_t flags, hsa_queue_t** queue) {
  const auto acquire_impl = amdExtTable->hsa_amd_counted_queue_acquire_fn;
  return acquire_impl(agent, type, priority, callback, data, flags, queue);
}

// Exported entry point for releasing a counted queue. Hands the queue handle
// to the implementation installed in the AMD extension dispatch table and
// returns that implementation's status.
hsa_status_t HSA_API hsa_amd_counted_queue_release(hsa_queue_t* queue) {
  const auto release_impl = amdExtTable->hsa_amd_counted_queue_release_fn;
  return release_impl(queue);
}

// Tools only table interfaces.
namespace rocr {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
#include "core/util/locks.h"
#include "core/util/small_heap.h"
#include "pcs/pcs_runtime.h"
#include "core/inc/counted_queue_manager.h"

namespace rocr {
namespace AMD {
Expand Down Expand Up @@ -342,6 +343,19 @@ class GpuAgent : public GpuAgentInt {
void AcquireQueueAltScratch(ScratchInfo& scratch) override;
void ReleaseQueueAltScratch(ScratchInfo& scratch) override;

// @brief Acquire a counted (shared) queue on this agent.
//
// Forwards all arguments to this agent's CountedQueuePoolManager
// (queue_pool_) and returns its status. The pool manager either reuses an
// existing queue or creates a new one.
//
// @param type Queue type requested.
// @param priority Queue priority requested.
// @param callback Callback stored with the acquired queue.
// @param data Opaque pointer stored alongside @p callback.
// @param flags Passed through to the pool manager unchanged.
// @param out_queue Receives the acquired queue handle -- presumably written
// only on success; confirm against the pool manager implementation.
hsa_status_t AcquireCountedQueue(hsa_queue_type_t type,
hsa_amd_queue_priority_t priority,
void (*callback)(hsa_status_t, hsa_queue_t*, void*),
void* data, uint64_t flags,
hsa_queue_t** out_queue);

// @brief Release a counted queue previously used by the application.
// Forwards to this agent's CountedQueuePoolManager (queue_pool_) and returns
// its status.
hsa_status_t ReleaseCountedQueue(hsa_queue_t* queue);

// @brief Get the use count or unique hardware ID of a shared queue.
// Forwards to this agent's CountedQueuePoolManager (queue_pool_); @p attribute
// selects what is queried and the result is written through @p value.
// NOTE(review): the type/size expected behind @p value per attribute is not
// visible here -- confirm against the pool manager implementation.
hsa_status_t GetCountedQueueInfo(hsa_queue_t* queue, hsa_queue_info_attribute_t attribute, void* value);

// @brief Override from AMD::GpuAgentInt.
void TranslateTime(core::Signal* signal, hsa_amd_profiling_dispatch_time_t& time) override;

Expand Down Expand Up @@ -735,6 +749,9 @@ class GpuAgent : public GpuAgentInt {
// @brief list of AQL queues owned by this agent. Indexed by queue pointer
std::vector<core::Queue*> aql_queues_;

// @brief Pool of shared queues owned by this agent
rocr::core::CountedQueuePoolManager queue_pool_;

// Sets and Tracks pending SDMA status check or request counts
void SetCopyRequestRefCount(bool set);
void SetCopyStatusCheckRefCount(bool set);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*
* Copyright © Advanced Micro Devices, Inc., or its affiliates.
*
* SPDX-License-Identifier: MIT
*/

#ifndef HSA_RUNTME_CORE_INC_COUNTED_QUEUE_MANAGER_H_
#define HSA_RUNTME_CORE_INC_COUNTED_QUEUE_MANAGER_H_


#include "hsa.h"
#include "hsa_ext_amd.h"
#include "core/inc/agent.h"
#include "core/inc/runtime.h"
#include <map>
#include <mutex>
#include <vector>
#include <memory>

namespace rocr {
namespace core {

// Wrapper for a logical counted queue (unique handle + callback)
struct CountedQueue {
core::Queue* hw_queue; // this will store the public handle of HW Queue (hsa_queue_t)
void (*callback)(hsa_status_t, hsa_queue_t*, void*);
void* callback_data;

CountedQueue(core::Queue* hw, void (*cb)(hsa_status_t, hsa_queue_t*, void*), void* data)
: hw_queue(hw), callback(cb), callback_data(data) {}
};

// Manages the pool of counted queues for a single GPU agent
class CountedQueuePoolManager {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would make the CountedQueuePoolManager a member of the GpuAgent class and initialize it in the GpuAgent constructor. That way we do not have to make it a singleton and check for IsInstanceCreated() each time.

Then inside hsa_amd_counted_queue_acquire implementation, we would call:

status = agent->CountedQueueCreate(....);

Which will invoke the CountedQueuePoolManager for the GpuAgent.

For CPU agents, hsa_amd_counted_queue_acquire can return error.
For AIE agents, you can leave these functions empty for now. We can figure out how to implement them as we get closer to finalizing this patch.

public:
explicit CountedQueuePoolManager(core::Agent*);
~CountedQueuePoolManager();

// Acquire a queue (either reuse or create new).
// GpuAgent::AcquireCountedQueue forwards its arguments here unchanged.
// callback/data: callback and opaque pointer associated with this acquire.
// out_queue: receives the resulting queue handle -- presumably written only
// on success; TODO confirm against counted_queue_manager.cpp.
hsa_status_t AcquireQueue(hsa_queue_type_t type, hsa_amd_queue_priority_t priority,
void (*callback)(hsa_status_t, hsa_queue_t*, void*), void* data,
uint64_t flags, hsa_queue_t** out_queue);

// Release a logical queue.
// GpuAgent::ReleaseCountedQueue forwards here. NOTE(review): presumably
// `queue` must be a handle previously produced by AcquireQueue and the
// backing hardware queue is kept while other users remain -- confirm against
// the implementation.
hsa_status_t ReleaseQueue(hsa_queue_t* queue);

// Query info (use count, hw id).
// `attribute` selects which item is queried; the result is written through
// `value`. NOTE(review): the pointee type expected for each attribute is not
// visible in this header -- confirm against the implementation.
hsa_status_t GetQueueInfo(hsa_queue_t* queue, hsa_queue_info_attribute_t attribute,
void* value);

private:
// Internal helper taking the same request parameters as AcquireQueue (minus
// the output handle) and returning the core::Queue to use for the request.
// NOTE(review): by its name this either finds an existing hardware queue or
// creates one; the failure return (nullptr?) and whether the caller must
// already hold mutex_ are not visible here -- confirm in the .cpp.
core::Queue* FindOrCreateHardwareQueue(hsa_queue_type_t type, hsa_amd_queue_priority_t priority,
void (*callback)(hsa_status_t, hsa_queue_t*, void*),
void* data, uint64_t flags);

core::Agent* agent_; // pointer to the gpu agent that owns this pool
uint32_t max_hw_queues_;
std::mutex mutex_;

// Pool of hw queues by priority on the agent
std::map<hsa_amd_queue_priority_t, std::vector<core::Queue*>> hw_queue_pools_;

// Map from unique handle to CountedQueue (hw queue, metadata per acquire request)
std::map<hsa_queue_t*, CountedQueue*> counted_queues_;
};

} // namespace core
} // namespace rocr

#endif // HSA_RUNTME_CORE_INC_COUNTED_QUEUE_MANAGER_H_
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,15 @@ hsa_status_t HSA_API hsa_amd_ais_file_read(hsa_amd_ais_file_handle_t handle, voi
uint64_t size, int64_t file_offset,
uint64_t *size_copied, int32_t *status);

// Mirrors Amd Extension Apis
// Acquire a counted queue on `agent` with the requested type and priority.
// `callback`/`data` are the callback and its opaque argument; the resulting
// queue handle is returned through `queue`.
hsa_status_t HSA_API hsa_amd_counted_queue_acquire(hsa_agent_t agent, hsa_queue_type_t type,
hsa_amd_queue_priority_t priority,
void (*callback)(hsa_status_t status,
hsa_queue_t* source, void* data),
void* data, uint64_t flags, hsa_queue_t** queue);
// Mirrors Amd Extension Apis
// Release a counted queue handle (counterpart of
// hsa_amd_counted_queue_acquire).
hsa_status_t HSA_API hsa_amd_counted_queue_release(hsa_queue_t* queue);

// Mirrors Amd Extension Apis
hsa_status_t HSA_API hsa_amd_enable_logging(uint8_t* flags, void* file);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ class QueueWrapper : public Queue {
explicit QueueWrapper(std::unique_ptr<Queue> queue)
: Queue(static_cast<core::SharedQueue*>(core::Runtime::runtime_singleton_->system_allocator()(
sizeof(core::SharedQueue), 4096, 0, 0)),
0),
0, nullptr),
wrapped(std::move(queue)) {
memcpy(&amd_queue_, &wrapped->amd_queue_, sizeof(amd_queue_));
wrapped->set_public_handle(wrapped.get(), public_handle_);
Expand Down
20 changes: 16 additions & 4 deletions projects/rocr-runtime/runtime/hsa-runtime/core/inc/queue.h
Original file line number Diff line number Diff line change
Expand Up @@ -168,14 +168,17 @@ All funtions other than Convert and public_handle must be virtual.
*/
class Queue : public Checked<0xFA3906A679F9DB49> {
public:
Queue(SharedQueue* shared_queue, uint64_t queue_flags)
: Queue(shared_queue, queue_flags, false) {}
Queue(SharedQueue* shared_queue, uint64_t queue_flags, core::Agent* agent)
: Queue(shared_queue, queue_flags, false, agent) {}

Queue(SharedQueue* shared_queue, uint64_t queue_flags, bool pcie_write_ordering)
Queue(SharedQueue* shared_queue, uint64_t queue_flags, bool pcie_write_ordering, core::Agent* agent)
: amd_queue_(shared_queue->amd_queue),
shared_queue_(shared_queue),
agent_(agent),
flags_(queue_flags),
pcie_write_ordering_(pcie_write_ordering) {
pcie_write_ordering_(pcie_write_ordering),
use_count(0),
is_counted_queue(false) {
public_handle_ = Convert(this);
shared_queue->core_queue = this;
}
Expand Down Expand Up @@ -372,6 +375,13 @@ class Queue : public Checked<0xFA3906A679F9DB49> {

hsa_queue_t* public_handle() const { return public_handle_; }

// Get a pointer to the agent that owns this queue, i.e. the agent pointer
// that was passed to the Queue constructor. Does not modify the queue, hence
// const (consistent with public_handle()).
// NOTE(review): this accessor is non-virtual and QueueWrapper constructs its
// Queue base with a null agent, so calling it on a wrapper returns nullptr
// rather than the wrapped queue's agent -- callers must handle that or this
// needs to become virtual with an override in the wrapper.
core::Agent* GetAgent() const { return agent_; }

// @brief Attributes specifically for counted queue types
uint32_t use_count;
bool is_counted_queue;

typedef void* rtti_t;

bool IsType(rtti_t id) { return _IsA(id); }
Expand Down Expand Up @@ -401,6 +411,8 @@ class Queue : public Checked<0xFA3906A679F9DB49> {

SharedQueue* shared_queue_;

core::Agent* agent_; // pointer to the agent that owns this queue

hsa_queue_t* public_handle_;

/// Next available queue id.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ namespace AMD {

AieAqlQueue::AieAqlQueue(core::SharedQueue* shared_queue, AieAgent* agent, size_t req_size_pkts,
uint32_t node_id, uint64_t flags)
: Queue(shared_queue, flags),
: Queue(shared_queue, flags, agent),
LocalSignal(0, false),
DoorbellSignal(signal()),
agent_(*agent),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ namespace AMD {
AqlQueue::AqlQueue(core::SharedQueue* shared_queue, GpuAgent* agent, size_t req_size_pkts,
HSAuint32 node_id, ScratchInfo& scratch, core::HsaEventCallback callback,
void* err_data, uint64_t flags)
: Queue(shared_queue, flags, !agent->is_xgmi_cpu_gpu()),
: Queue(shared_queue, flags, !agent->is_xgmi_cpu_gpu(), agent),
LocalSignal(0, false),
DoorbellSignal(signal()),
ring_buf_(nullptr),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna
current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
scratch_used_large_(0),
queues_(),
queue_pool_(this),
trap_code_buf_(NULL),
trap_code_buf_size_(0),
doorbell_queue_map_(NULL),
Expand Down Expand Up @@ -1811,6 +1812,7 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, u

auto aql_queue = new AqlQueue(shared_queue, this, size, node_id(), scratch, event_callback, data,
flags);

*queue = aql_queue;
aql_queues_.push_back(aql_queue);

Expand Down Expand Up @@ -3499,5 +3501,21 @@ hsa_status_t GpuAgent::PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& sessi
return HSA_STATUS_SUCCESS;
}

// Hands a counted-queue acquire request to this agent's queue pool and
// returns the pool's status unchanged.
hsa_status_t GpuAgent::AcquireCountedQueue(hsa_queue_type_t type,
                                           hsa_amd_queue_priority_t priority,
                                           void (*callback)(hsa_status_t, hsa_queue_t*, void*),
                                           void* data, uint64_t flags,
                                           hsa_queue_t** out_queue) {
  const hsa_status_t pool_status =
      queue_pool_.AcquireQueue(type, priority, callback, data, flags, out_queue);
  return pool_status;
}

// Hands a counted-queue release request to this agent's queue pool and
// returns the pool's status unchanged.
hsa_status_t GpuAgent::ReleaseCountedQueue(hsa_queue_t* queue) {
  const hsa_status_t pool_status = queue_pool_.ReleaseQueue(queue);
  return pool_status;
}

// Hands a counted-queue info query to this agent's queue pool and returns the
// pool's status unchanged; the queried value is written through `value`.
hsa_status_t GpuAgent::GetCountedQueueInfo(hsa_queue_t* queue,
                                           hsa_queue_info_attribute_t attribute,
                                           void* value) {
  const hsa_status_t pool_status = queue_pool_.GetQueueInfo(queue, attribute, value);
  return pool_status;
}

} // namespace amd
} // namespace rocr
Loading