From cb5847eb31fd2ac611968ee93ce58c8e503d05cc Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Thu, 3 Mar 2022 15:47:52 -0500 Subject: [PATCH 1/9] set -eE in slurm.sim template --- util/job_launching/slurm.sim | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/util/job_launching/slurm.sim b/util/job_launching/slurm.sim index 341706c19..6b1b8e94a 100644 --- a/util/job_launching/slurm.sim +++ b/util/job_launching/slurm.sim @@ -18,7 +18,9 @@ copy_output() { trap copy_output ERR -set -e +#citing https://stackoverflow.com/questions/35800082/how-to-trap-err-when-using-set-e-in-bash +#Setting -E alongside -e makes any trap on ERR inherited by shell funcs, command substitutions and commands executed in a subshell environment +set -eE if [ "$GPGPUSIM_SETUP_ENVIRONMENT_WAS_RUN" != "1" ]; then export GPGPUSIM_ROOT=REPLACE_GPGPUSIM_ROOT From 1cb3db16de6c7f83f04156298546bbbae796d2ee Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Thu, 3 Mar 2022 15:49:34 -0500 Subject: [PATCH 2/9] make main.cc return 0 instead of 1 --- gpu-simulator/main.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu-simulator/main.cc b/gpu-simulator/main.cc index aed4c9113..99d1b308c 100644 --- a/gpu-simulator/main.cc +++ b/gpu-simulator/main.cc @@ -184,7 +184,7 @@ int main(int argc, const char **argv) { printf("GPGPU-Sim: *** exit detected ***\n"); fflush(stdout); - return 1; + return 0; } From 0c1c57addff386fb83f28353bc3b8c194dab15ae Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Sat, 26 Mar 2022 16:17:38 -0400 Subject: [PATCH 3/9] fix for issue 107 --- gpu-simulator/main.cc | 63 ++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/gpu-simulator/main.cc b/gpu-simulator/main.cc index 99d1b308c..472600673 100644 --- a/gpu-simulator/main.cc +++ b/gpu-simulator/main.cc @@ -81,41 +81,42 @@ int main(int argc, const char **argv) { unsigned i = 0; while (i < commandlist.size() || !kernels_info.empty()) { - 
trace_kernel_info_t *kernel_info = NULL; - if (commandlist[i].m_type == command_type::cpu_gpu_mem_copy) { - size_t addre, Bcount; - tracer.parse_memcpy_info(commandlist[i].command_string, addre, Bcount); - std::cout << "launching memcpy command : " << commandlist[i].command_string << std::endl; - m_gpgpu_sim->perf_memcpy_to_gpu(addre, Bcount); - i++; - continue; - } else if (commandlist[i].m_type == command_type::kernel_launch) { - // Read trace header info for window_size number of kernels - while (kernels_info.size() < window_size && i < commandlist.size()) { - kernel_trace_t* kernel_trace_info = tracer.parse_kernel_info(commandlist[i].command_string); - kernel_info = create_kernel_info(kernel_trace_info, m_gpgpu_context, &tconfig, &tracer); - kernels_info.push_back(kernel_info); - std::cout << "Header info loaded for kernel command : " << commandlist[i].command_string << std::endl; + //gulp up as many commands as possible - either cpu_gpu_mem_copy + //or kernel_launch - until the vector "kernels_info" has exceeded + //the window_size or no command is left in commandlist + while (kernels_info.size() < window_size && i < commandlist.size()) { + trace_kernel_info_t *kernel_info = NULL; + if (commandlist[i].m_type == command_type::cpu_gpu_mem_copy) { + size_t addre, Bcount; + tracer.parse_memcpy_info(commandlist[i].command_string, addre, Bcount); + std::cout << "launching memcpy command : " << commandlist[i].command_string << std::endl; + m_gpgpu_sim->perf_memcpy_to_gpu(addre, Bcount); i++; + continue; + } else if (commandlist[i].m_type == command_type::kernel_launch) { + // Read trace header info for window_size number of kernels + kernel_trace_t* kernel_trace_info = tracer.parse_kernel_info(commandlist[i].command_string); + kernel_info = create_kernel_info(kernel_trace_info, m_gpgpu_context, &tconfig, &tracer); + kernels_info.push_back(kernel_info); + std::cout << "Header info loaded for kernel command : " << commandlist[i].command_string << std::endl; + i++; } - 
- // Launch all kernels within window that are on a stream that isn't already running - for (auto k : kernels_info) { - bool stream_busy = false; - for (auto s: busy_streams) { - if (s == k->get_cuda_stream_id()) - stream_busy = true; - } - if (!stream_busy && m_gpgpu_sim->can_start_kernel() && !k->was_launched()) { - std::cout << "launching kernel name: " << k->get_name() << " uid: " << k->get_uid() << std::endl; - m_gpgpu_sim->launch(k); - k->set_launched(); - busy_streams.push_back(k->get_cuda_stream_id()); - } + } + + // Launch all kernels within window that are on a stream that isn't already running + for (auto k : kernels_info) { + bool stream_busy = false; + for (auto s: busy_streams) { + if (s == k->get_cuda_stream_id()) + stream_busy = true; + } + if (!stream_busy && m_gpgpu_sim->can_start_kernel() && !k->was_launched()) { + std::cout << "launching kernel name: " << k->get_name() << " uid: " << k->get_uid() << std::endl; + m_gpgpu_sim->launch(k); + k->set_launched(); + busy_streams.push_back(k->get_cuda_stream_id()); } } - else if (kernels_info.empty()) - assert(0 && "Undefined Command"); bool active = false; bool sim_cycles = false; From 11f9ab5e6e0ce5e31a2f4d5f988fd6b04fb4b162 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Sat, 26 Mar 2022 16:59:45 -0400 Subject: [PATCH 4/9] remove redundant instruction --- gpu-simulator/main.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/gpu-simulator/main.cc b/gpu-simulator/main.cc index 472600673..35fec0782 100644 --- a/gpu-simulator/main.cc +++ b/gpu-simulator/main.cc @@ -92,7 +92,6 @@ int main(int argc, const char **argv) { std::cout << "launching memcpy command : " << commandlist[i].command_string << std::endl; m_gpgpu_sim->perf_memcpy_to_gpu(addre, Bcount); i++; - continue; } else if (commandlist[i].m_type == command_type::kernel_launch) { // Read trace header info for window_size number of kernels kernel_trace_t* kernel_trace_info = tracer.parse_kernel_info(commandlist[i].command_string); From 
d353b1369d0bfae6cafb25433c1dfab2ad24471b Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Mon, 28 Mar 2022 14:36:33 -0400 Subject: [PATCH 5/9] created my own app for debugging pusposes --- util/job_launching/apps/define-all-apps.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/util/job_launching/apps/define-all-apps.yml b/util/job_launching/apps/define-all-apps.yml index ae8174e47..663c9774f 100644 --- a/util/job_launching/apps/define-all-apps.yml +++ b/util/job_launching/apps/define-all-apps.yml @@ -105,6 +105,14 @@ GPU_Microbenchmark: - args: accel-sim-mem: 1G +Deepbench_subcore_test: + exec_dir: "$GPUAPPS_ROOT/bin/$CUDA_VERSION/release/" + data_dirs: "$GPUAPPS_ROOT/data_dirs/" + execs: + - gemm_bench-tencore: + - args: inference half 35 1500 2560 0 0 + accel-sim-mem: 2G + Deepbench_nvidia_tencore: exec_dir: "$GPUAPPS_ROOT/bin/$CUDA_VERSION/release/" data_dirs: "$GPUAPPS_ROOT/data_dirs/" From 2ec2aeea6758cdccb11b50f94bcb2d2da97b275b Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Wed, 30 Mar 2022 13:45:08 -0400 Subject: [PATCH 6/9] use range-based for loop to handle wrap-arounds in hwtid and warp-id --- gpu-simulator/trace-driven/trace_driven.cc | 31 ++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/gpu-simulator/trace-driven/trace_driven.cc b/gpu-simulator/trace-driven/trace_driven.cc index ff0035167..73253baff 100644 --- a/gpu-simulator/trace-driven/trace_driven.cc +++ b/gpu-simulator/trace-driven/trace_driven.cc @@ -537,6 +537,14 @@ unsigned trace_shader_core_ctx::sim_init_thread( return 1; } +/** + * @brief Must be called once for each CTA. + * + * In the event of hwtid wrap-around due to subcore scheduling, + * end_thread is smaller than start_thread. This case is acceptable + * and will be resolved by callee methods. 
+ * + */ void trace_shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread, unsigned end_thread, unsigned ctaid, int cta_size, kernel_info_t &kernel) { @@ -565,20 +573,39 @@ void trace_shader_core_ctx::updateSIMTStack(unsigned warpId, // No SIMT-stack in trace-driven mode } +/** + * @brief Calls trace_parser::get_next_threadblock_traces to parse an entire + * CTA section in the *.traceg file. This means the range of warp ids covered + * by [start_warp, end_warp) must cover all warps of the CTA. + * + * With the subcore model, special case is given since wrap-arounds can + * happen, which means end_warp < start_warp. This is done by generating + * a const vec of warp ids to iterate over in a range-based for loop, instead + * of looping from start_warp to end_warp. + * + * @param start_warp Warp id calculated from hwtid by dividing it by warp-size + * @param end_warp Calculated from hwtid in the same way + * @param kernel + */ void trace_shader_core_ctx::init_traces(unsigned start_warp, unsigned end_warp, kernel_info_t &kernel) { std::vector *> threadblock_traces; - for (unsigned i = start_warp; i < end_warp; ++i) { + + auto wrap_ids = get_index_vector_from_range_with_wrap_around + (start_warp, end_warp, m_config->max_warps_per_shader); + + for (unsigned i : wrap_ids) { trace_shd_warp_t *m_trace_warp = static_cast(m_warp[i]); m_trace_warp->clear(); threadblock_traces.push_back(&(m_trace_warp->warp_traces)); } + trace_kernel_info_t &trace_kernel = static_cast(kernel); trace_kernel.get_next_threadblock_traces(threadblock_traces); // set the pc from the traces and ignore the functional model - for (unsigned i = start_warp; i < end_warp; ++i) { + for (unsigned i : wrap_ids) { trace_shd_warp_t *m_trace_warp = static_cast(m_warp[i]); m_trace_warp->set_next_pc(m_trace_warp->get_start_trace_pc()); m_trace_warp->set_kernel(&trace_kernel); From b9baf3939d803629ea4490c1b3cc187222303d88 Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Wed, 30 Mar 2022 14:02:37 -0400 
Subject: [PATCH 7/9] fixed typo --- gpu-simulator/trace-driven/trace_driven.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gpu-simulator/trace-driven/trace_driven.cc b/gpu-simulator/trace-driven/trace_driven.cc index 73253baff..dc9dca269 100644 --- a/gpu-simulator/trace-driven/trace_driven.cc +++ b/gpu-simulator/trace-driven/trace_driven.cc @@ -591,10 +591,10 @@ void trace_shader_core_ctx::init_traces(unsigned start_warp, unsigned end_warp, kernel_info_t &kernel) { std::vector *> threadblock_traces; - auto wrap_ids = get_index_vector_from_range_with_wrap_around + auto warp_ids = get_index_vector_from_range_with_wrap_around (start_warp, end_warp, m_config->max_warps_per_shader); - for (unsigned i : wrap_ids) { + for (unsigned i : warp_ids) { trace_shd_warp_t *m_trace_warp = static_cast(m_warp[i]); m_trace_warp->clear(); threadblock_traces.push_back(&(m_trace_warp->warp_traces)); @@ -605,7 +605,7 @@ void trace_shader_core_ctx::init_traces(unsigned start_warp, unsigned end_warp, trace_kernel.get_next_threadblock_traces(threadblock_traces); // set the pc from the traces and ignore the functional model - for (unsigned i : wrap_ids) { + for (unsigned i : warp_ids) { trace_shd_warp_t *m_trace_warp = static_cast(m_warp[i]); m_trace_warp->set_next_pc(m_trace_warp->get_start_trace_pc()); m_trace_warp->set_kernel(&trace_kernel); From 872000d01a3228fd857d93fe5018db353539aabd Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Wed, 30 Mar 2022 23:26:45 -0400 Subject: [PATCH 8/9] use functional programming for hwtid that can wrap-around --- gpu-simulator/trace-driven/trace_driven.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/gpu-simulator/trace-driven/trace_driven.cc b/gpu-simulator/trace-driven/trace_driven.cc index dc9dca269..f4f24ccea 100644 --- a/gpu-simulator/trace-driven/trace_driven.cc +++ b/gpu-simulator/trace-driven/trace_driven.cc @@ -591,25 +591,25 @@ void trace_shader_core_ctx::init_traces(unsigned 
start_warp, unsigned end_warp, kernel_info_t &kernel) { std::vector *> threadblock_traces; - auto warp_ids = get_index_vector_from_range_with_wrap_around - (start_warp, end_warp, m_config->max_warps_per_shader); + //WrappableUnsignedRange is defined in gpgpu-sim/gpu-sim.h + WrappableUnsignedRange warp_id_range(start_warp, end_warp, m_config->max_warps_per_shader); - for (unsigned i : warp_ids) { + warp_id_range.loop([&](const unsigned i){ trace_shd_warp_t *m_trace_warp = static_cast(m_warp[i]); m_trace_warp->clear(); - threadblock_traces.push_back(&(m_trace_warp->warp_traces)); - } + threadblock_traces.push_back(&(m_trace_warp->warp_traces)); + }); trace_kernel_info_t &trace_kernel = static_cast(kernel); trace_kernel.get_next_threadblock_traces(threadblock_traces); // set the pc from the traces and ignore the functional model - for (unsigned i : warp_ids) { + warp_id_range.loop([&](const unsigned i){ trace_shd_warp_t *m_trace_warp = static_cast(m_warp[i]); m_trace_warp->set_next_pc(m_trace_warp->get_start_trace_pc()); m_trace_warp->set_kernel(&trace_kernel); - } + }); } void trace_shader_core_ctx::checkExecutionStatusAndUpdate(warp_inst_t &inst, From 6f3e442f05e64fba533d48a76d834d2a1e1bc6aa Mon Sep 17 00:00:00 2001 From: Fangjia Shen Date: Sun, 3 Apr 2022 13:11:09 -0400 Subject: [PATCH 9/9] Revert "created my own app for debugging pusposes" ...So the branch can be merged to dev branch This reverts commit d353b1369d0bfae6cafb25433c1dfab2ad24471b. 
--- util/job_launching/apps/define-all-apps.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/util/job_launching/apps/define-all-apps.yml b/util/job_launching/apps/define-all-apps.yml index 663c9774f..ae8174e47 100644 --- a/util/job_launching/apps/define-all-apps.yml +++ b/util/job_launching/apps/define-all-apps.yml @@ -105,14 +105,6 @@ GPU_Microbenchmark: - args: accel-sim-mem: 1G -Deepbench_subcore_test: - exec_dir: "$GPUAPPS_ROOT/bin/$CUDA_VERSION/release/" - data_dirs: "$GPUAPPS_ROOT/data_dirs/" - execs: - - gemm_bench-tencore: - - args: inference half 35 1500 2560 0 0 - accel-sim-mem: 2G - Deepbench_nvidia_tencore: exec_dir: "$GPUAPPS_ROOT/bin/$CUDA_VERSION/release/" data_dirs: "$GPUAPPS_ROOT/data_dirs/"