Skip to content

Support rocprofiler-sdk registration #25

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ set (HEATMAP_SRC
)

find_package(Python COMPONENTS Development)
find_package(rocprofiler-sdk REQUIRED)

include_directories(${CMAKE_CURRENT_BINARY_DIR})

Expand Down Expand Up @@ -122,6 +123,7 @@ target_link_libraries(
${DH_COMMS_LIBRARIES}/libdh_comms.so
kernelDB64
amd_comgr
rocprofiler-sdk::rocprofiler-sdk
)

find_path(LIBDWARF_INCLUDE_DIR
Expand Down
140 changes: 120 additions & 20 deletions src/interceptor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ THE SOFTWARE.

#include "inc/interceptor.h"

#include <rocprofiler-sdk/intercept_table.h>
#include <rocprofiler-sdk/registration.h>
#include <rocprofiler-sdk/rocprofiler.h>

using namespace std;
namespace fs = std::filesystem;
Expand Down Expand Up @@ -140,9 +143,9 @@ void hsaInterceptor::cleanup()
}


hsaInterceptor::hsaInterceptor(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count, const char* const* failed_tool_names) :
signal_runner_(signal_runner), cache_watcher_(cache_watcher), kernel_cache_(table), allocator_(table, std::cerr),
comms_mgr_(table), comms_runner_(comms_runner, std::ref(comms_mgr_))
hsaInterceptor::hsaInterceptor(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count, const char* const* failed_tool_names) :
signal_runner_(signal_runner), cache_watcher_(cache_watcher), kernel_cache_(table), allocator_(table, std::cerr),
comms_mgr_(table), comms_runner_(comms_runner, std::ref(comms_mgr_))
{
apiTable_ = table;
getLogDurConfig(config_);
Expand Down Expand Up @@ -171,7 +174,7 @@ hsaInterceptor::hsaInterceptor(HsaApiTable* table, uint64_t runtime_version, uin
hsa_device_type_t type;
hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, static_cast<void *>(&type));
if (status == HSA_STATUS_SUCCESS && type == HSA_DEVICE_TYPE_GPU)
agents->emplace_back(agent);
agents->emplace_back(agent);
return HSA_STATUS_SUCCESS;
}, reinterpret_cast<void *>(&gpus))== HSA_STATUS_SUCCESS)
{
Expand Down Expand Up @@ -204,7 +207,7 @@ hsaInterceptor::hsaInterceptor(HsaApiTable* table, uint64_t runtime_version, uin
* these files will cause addFile() to throw a runtime exception.
* these exceptions are benign for our purposes because any shared lib
* we might be interested in (i.e. the ones that contain .hip_fatbin sections)
* will enumerate from getSharedLibraries with a full path to the file.
* will enumerate from getSharedLibraries with a full path to the file.
* so we catch this exception and continue */
continue;
}
Expand Down Expand Up @@ -267,7 +270,7 @@ bool hsaInterceptor::addCodeObject(const std::string& name)
if (it != kdbs_.end())
it->second.get()->addFile(name, agent, config_["LOGDUR_FILTER"]);
else
kdbs_[agent] = std::make_unique<kernelDB::kernelDB>(agent, name);
kdbs_[agent] = std::make_unique<kernelDB::kernelDB>(agent, name);
}
}
}
Expand Down Expand Up @@ -308,7 +311,7 @@ void hsaInterceptor::signalCompleted(const hsa_signal_t sig)
{
kernel_info_t ki = it->second;
pending_signals_.erase(sig);
// If the application originally provided a completion_signal
// If the application originally provided a completion_signal
// We need to decrement it to ensure application behavior isn't affected.
if (ki.signal_.handle)
{
Expand Down Expand Up @@ -372,9 +375,9 @@ void cache_watcher()
exit(EXIT_FAILURE);
}
auto files = util_get_directory_files(dir, true);
for (const auto& entry : files)
for (const auto& entry : files)
{
if (util_is_directory(entry))
if (util_is_directory(entry))
{
int wd = inotify_add_watch(fd, entry.c_str(), IN_CREATE | IN_DELETE | IN_MODIFY | IN_MOVED_FROM | IN_MOVED_TO);
if (wd != -1)
Expand Down Expand Up @@ -477,7 +480,7 @@ void cache_watcher()
if (strFileName.ends_with(".hsaco"))
{
// cerr << "I CAN SEE JITTED CODE OBJECT " << strFileName << std::endl;
me->addCodeObject(strFileName);
me->addCodeObject(strFileName);
}
//else
// cerr << "The file/directory " << event->name << " was moved to directory " << watch_map[event->wd] << std::endl;
Expand Down Expand Up @@ -640,7 +643,7 @@ void hsaInterceptor::fixupKernArgs(void *dst, void *src, void *comms, arg_descri
//void *hidden_args_src = &(((void **)src)[desc.explicit_args_count - 1]);
void *hidden_args_src = &(((char *)src)[desc.explicit_args_length - sizeof(void *)]);
// In Triton, for some reason we sometimes get non-instrumented kernsl with no hidden arguments
// So we only want to copy hidden arguments if there ARE some. If there are, the length to
// So we only want to copy hidden arguments if there ARE some. If there are, the length to
// copy is the original size of the kernarg data - the size of explicit arguments. But since its
// a kernarg segment from a non-instrumented clone, we subtract one from the arg count
if (desc.clone_hidden_args_length)
Expand All @@ -652,17 +655,17 @@ void hsaInterceptor::fixupKernArgs(void *dst, void *src, void *comms, arg_descri
/* The weird thing here is that, apparently, kernel arguments are 4-byteb aligned
* regardless of the actual argument size. This really bit me working on this code
* because the metadata on kernel objects that is retrievable from comgr shows argument lengths
* and at first I was using the argument length to repack the kernel arguments with the
* and at first I was using the argument length to repack the kernel arguments with the
* newly inserted void * created by the instrumentation code. But after staring
* at hex dumps, I realized that all of the kernel arguments (at least the explicit arguments,
* at hex dumps, I realized that all of the kernel arguments (at least the explicit arguments,
* I'm not sure about the hidden arguments) are 4-byte aligned regardless of the inherent argument
* size. I don't know how portable this is between code object versions. I'm assuming it is some
* aspect of code object first combined with the expecations of the GPU firmware.
* */
//void **comms_loc = &(((void **)dst)[desc.explicit_args_count - 1]);
// This computation using explicit_args_length is more adaptable to changes in the way the compiler
// and runtime pack kernel arguments. For example 2 four-byte args might be packed into a single
// 64 bit slot and the individual parms might not be 64-bit aligned. For any kernel where that
// 64 bit slot and the individual parms might not be 64-bit aligned. For any kernel where that
// turns out to be the case, this address calculation with be resilient whether the args
// are packed or not.
void **comms_loc = (void **)&(((char *)dst)[desc.explicit_args_length - sizeof(void *)]);
Expand All @@ -672,11 +675,11 @@ void hsaInterceptor::fixupKernArgs(void *dst, void *src, void *comms, arg_descri

/*
This function is the core of functionality for logDuration. It's where completion signals are set up for tracking so that
at kernel completion we can extract start/stop times from the signal. It's also where "alternative" kernels - those found
at kernel completion we can extract start/stop times from the signal. It's also where "alternative" kernels - those found
in the kernel cache pointed to by LOGDUR_KERNEL_CACHE - are used to replace the kernel_object in the dispatch packet with
the kernel cache alternative. Also, whenever replacing the original kernel_object with an alternative, this function
the kernel cache alternative. Also, whenever replacing the original kernel_object with an alternative, this function
allocates a new kernarg structure, initializes it to zeros, and copies the original kernarg buffer into the new one.
Pending signals and the alternative kernarg buffers are stored and processed later when the kernel completes and
Pending signals and the alternative kernarg buffers are stored and processed later when the kernel completes and
hsaInterceptor::signalCompleted is called.
*/
hsa_kernel_dispatch_packet_t * hsaInterceptor::fixupPacket(const hsa_kernel_dispatch_packet_t *packet, hsa_queue_t *queue, uint64_t dispatch_id)
Expand Down Expand Up @@ -803,10 +806,10 @@ void hsaInterceptor::doPackets(hsa_queue_t *queue, const packet_t *packet, uint6

void hsaInterceptor::addQueue(hsa_queue_t *queue, hsa_agent_t agent)
{
// This call results in completion signals having start and stop timestamps on dispatches
// This call results in completion signals having start and stop timestamps on dispatches
auto result = (*(apiTable_->amd_ext_->hsa_amd_profiling_set_profiler_enabled_fn))(queue, true);
assert(result == HSA_STATUS_SUCCESS && "Couldn't enable queue for profiling");

lock_guard<mutex> lock(mutex_);

queues_[queue] = agent;
Expand Down Expand Up @@ -875,7 +878,7 @@ void hsaInterceptor::addKernel(uint64_t kernelObject, std::string& name, hsa_exe
kernel_objects_[kernelObject] = {thisName.length() ? thisName : name, symbol, agent, kernarg_size};
}
}

hsa_status_t hsaInterceptor::hsa_executable_symbol_get_info(hsa_executable_symbol_t symbol, hsa_executable_symbol_info_t attribute, void *data)
{
hsa_status_t result = HSA_STATUS_SUCCESS;
Expand Down Expand Up @@ -970,3 +973,100 @@ extern "C" {
cerr << "hsaInterceptor: Application elapsed usecs: " << globalTime.getElapsedNanos() / 1000 << "us\n";
}*/
}

namespace
{
// Client identifier handed to rocprofiler_configure(); stored there and reset
// to nullptr by rocp_sdk_tool_fini().
rocprofiler_client_id_t* client_id = nullptr;
// Finalizer received in rocp_sdk_tool_init(); used by the atexit handler
// registered there and reset to nullptr by rocp_sdk_tool_fini().
rocprofiler_client_finalize_t client_fini = nullptr;

/**
 * Tool initialization hook referenced from the configure result returned by
 * rocprofiler_configure().
 *
 * @param fini_func  Finalizer for this client; stored in client_fini so it
 *                   can be invoked explicitly later.
 * @return Always 0 (no error).
 */
int
rocp_sdk_tool_init(rocprofiler_client_finalize_t fini_func, void* /*tool_data*/)
{
    // Keep the finalizer around so finalization can be triggered explicitly.
    client_fini = fini_func;

    // Optional: force finalization of the rocprofiler-sdk client at process
    // exit. The handler does nothing once rocp_sdk_tool_fini() has cleared
    // either global.
    const auto finalize_at_exit = []() {
        if(client_id == nullptr || client_fini == nullptr) return;
        client_fini(*client_id);
    };
    std::atexit(finalize_at_exit);

    // Report success.
    return 0;
}

/**
 * Tool finalization hook referenced from the configure result returned by
 * rocprofiler_configure().
 *
 * Clears the saved client state, runs hsaInterceptor::cleanup(), and prints
 * the elapsed application time (in microseconds) to stderr.
 *
 * @param tool_data  Unused.
 */
void
rocp_sdk_tool_fini(void* tool_data)
{
    (void) tool_data;

    // Clear both globals so the atexit handler registered in
    // rocp_sdk_tool_init() becomes a no-op when rocprofiler-sdk has already
    // invoked this function on its own.
    client_fini = nullptr;
    client_id   = nullptr;

    hsaInterceptor::cleanup();

    // std::dec makes sure the number is printed in decimal.
    std::cerr << "hsaInterceptor: Application elapsed usecs: " << std::dec;
    std::cerr << globalTime.getElapsedNanos() / 1000 << "us\n";
}

void
rocp_sdk_api_registration_callback(rocprofiler_intercept_table_t type,
uint64_t lib_version, uint64_t lib_instance,
void** tables, uint64_t num_tables, void* user_data)
{
if(type != ROCPROFILER_HSA_TABLE) {
std::cerr << "Error: unexpected library type: "
<< static_cast<int>(type) << std::endl;
std::abort();
}

uint32_t major = lib_version / 10000;
uint32_t minor = (lib_version % 10000) / 100;
uint32_t patch = lib_version % 100;

const char* table_name = nullptr;
rocprofiler_query_intercept_table_name(type, &table_name, nullptr);

clog << client_id->name << " is using " << table_name << " v" << major << "." << minor
<< "." << patch << '\n'
<< std::flush;

auto* table = static_cast<HsaApiTable*>(tables[0]);
hsaInterceptor* hook = hsaInterceptor::getInstance(table, lib_version, 0, nullptr);
}
} // namespace

extern "C"
{
/**
 * rocprofiler-sdk tool entry point.
 *
 * Names the client "Omniprobe", remembers the client id, logs the SDK
 * version, and registers rocp_sdk_api_registration_callback for the HSA
 * intercept table.
 *
 * @param version          SDK version, decoded here as
 *                         major * 10000 + minor * 100 + patch.
 * @param runtime_version  Version string, echoed in the log line.
 * @param priority         Priority value, echoed in the log line.
 * @param id               Client identifier; its name is set and the pointer
 *                         is saved in client_id.
 * @return Pointer to a static configure result wiring up
 *         rocp_sdk_tool_init / rocp_sdk_tool_fini, or nullptr when the
 *         intercept-table registration does not succeed.
 */
rocprofiler_tool_configure_result_t* rocprofiler_configure(
    uint32_t version, const char* runtime_version, uint32_t priority,
    rocprofiler_client_id_t* id)
{
    // Label the client and keep its handle for later finalization.
    id->name  = "Omniprobe";
    client_id = id;

    // Split the packed version number into its three components.
    const uint32_t sdk_major = version / 10000;
    const uint32_t sdk_minor = (version / 100) % 100;
    const uint32_t sdk_patch = version % 100;

    // Announce which SDK version this tool is running against.
    std::clog << id->name << " (priority=" << priority
              << ") is using rocprofiler-sdk v" << sdk_major << "." << sdk_minor
              << "." << sdk_patch << " (" << runtime_version << ")" << std::endl;

    // Ask to be called back when the HSA API table is registered.
    const auto registration = rocprofiler_at_intercept_table_registration(
        rocp_sdk_api_registration_callback, ROCPROFILER_HSA_TABLE, nullptr);
    if(registration != ROCPROFILER_STATUS_SUCCESS) {
        return nullptr;
    }

    // Static storage so the returned pointer stays valid after this returns.
    static rocprofiler_tool_configure_result_t configure_result{
        sizeof(rocprofiler_tool_configure_result_t), &rocp_sdk_tool_init,
        &rocp_sdk_tool_fini, nullptr
    };

    return &configure_result;
}
}
Loading