- 
                Notifications
    You must be signed in to change notification settings 
- Fork 69
AddingPasses
A compiler works like a stream system: we have a chain of filters, and the program, which we can see as a stream of instructions, is fed to each of these filters sequentially. A pass can analyze the program, or it can transform the program. The divergence analysis that we have is just an analysis: it does not change the program at all. On the other hand, the register allocators do change the program, inserting load and store instructions into the original code. In this page I will show you how to add a new pass to Ocelot. This is a very simple pass that only prints the instructions in the Dataflow Graph that describes a program.
Writing the pass
The first step is, naturally, to write the pass itself. The pass is composed of two files, a .cpp and a .h. The .h file contains the declarations of the constructors, destructors and functions of the classes of your pass, and the .cpp file contains their implementations, in standard C++ fashion. It is important to notice that the classes you create have to extend the class KernelPass, as in the example below.
#include "ocelot/transforms/interface/PrettyPrinterPass.h"
#include "ocelot/analysis/interface/DivergenceAnalysis.h"
#include <cassert>
#include <iostream>
#include <list>

namespace transforms {

/*! \brief Registers the pass under the name "PrettyPrinterPass" and
    requests Analysis::DivergenceAnalysis from the pass manager.

    NOTE(review): runOnKernel() below asks for the DataflowGraphAnalysis;
    presumably it is made available as a dependency of the divergence
    analysis -- confirm against the pass manager.
 */
PrettyPrinterPass::PrettyPrinterPass()
  : KernelPass(Analysis::DivergenceAnalysis, "PrettyPrinterPass") {
}

/*! \brief Prints every instruction of the kernel's dataflow graph to
    standard output, one instruction per line, with a "New Basic Block:"
    header in front of each block.

    \param k The kernel being processed; it is not read directly here, the
             blocks are taken from the dataflow graph analysis instead.
 */
void PrettyPrinterPass::runOnKernel( ir::IRKernel& k ) {
  Analysis* dfg_structure = getAnalysis(Analysis::DataflowGraphAnalysis);
  assert(dfg_structure != 0);

  analysis::DataflowGraph& dfg =
    *static_cast<analysis::DataflowGraph*>(dfg_structure);

  // The first and the last block of the graph are deliberately skipped
  // (presumably the synthetic entry and exit blocks -- confirm).
  analysis::DataflowGraph::iterator block = ++dfg.begin();
  analysis::DataflowGraph::iterator blockEnd = --dfg.end();

  for (; block != blockEnd; ++block) {
    std::cout << "New Basic Block:\n";

    // NOTE(review): assumes the block stores its instructions in a
    // std::list<ir::Instruction*> -- confirm against ir::BasicBlock.
    std::list<ir::Instruction*>::iterator i =
      block->block()->instructions.begin();
    std::list<ir::Instruction*>::iterator e =
      block->block()->instructions.end();

    for (; i != e; ++i) {
      // The instructions are stored through a base pointer; the pass
      // treats every one of them as a PTX instruction.
      ir::PTXInstruction* inst = static_cast<ir::PTXInstruction*>(*i);
      std::cout << inst->toString() << std::endl;
    }
  }
}

}
#ifndef PRETTY_PRINTER_PASS_H_
#define PRETTY_PRINTER_PASS_H_

#include "ocelot/transforms/interface/Pass.h"

namespace transforms {

/*! \brief This pass prints the instructions in the Dataflow Graph

    It is a KernelPass: runOnKernel() is the only hook with a real
    implementation (defined in the .cpp file); initialize() and finalize()
    are intentionally empty.
 */
class PrettyPrinterPass: public KernelPass
{
public:
  /*! \brief Default constructor, defined in the .cpp file */
  PrettyPrinterPass();
  virtual ~PrettyPrinterPass() {}

  /*! \brief No per-module set-up is needed by this pass */
  virtual void initialize( const ir::Module& m ) {}
  /*! \brief Prints the instructions of kernel k */
  virtual void runOnKernel( ir::IRKernel& k );
  /*! \brief No clean-up is needed by this pass */
  virtual void finalize() {}
};

}

#endif /* PRETTY_PRINTER_PASS_H_ */
To add the pass to the Ocelot toolchain, a few small additions have to be made to certain key parts of Ocelot. First, changes have to be made to ocelot/tools/PTXOptimizer.cpp and ocelot/tools/PTXOptimizer.h. In the .cpp file, your header file has to be included, as in the example below:
/*! \file PTXOptimzer.cpp
  \date Thursday December 31, 2009
  \author Gregory Diamos
  \brief The source file for the Ocelot PTX optimizer
*/
#ifndef PTX_OPTIMIZER_CPP_INCLUDED
#define PTX_OPTIMIZER_CPP_INCLUDED
// Ocelot Includes
#include "ocelot/tools/PTXOptimizer.h"
#include "ocelot/transforms/interface/PassManager.h"
#include "ocelot/transforms/interface/RemoveBarrierPass.h"
#include "ocelot/transforms/interface/StructuralTransform.h"
#include "ocelot/transforms/interface/ConvertPredicationToSelectPass.h"
//Include your header here.
Also, you have to add your pass to the if chains in the functions PTXOptimizer::optimize() and parsePassTypes( const std::string& passList ). The first function creates your pass and adds it to the pass manager when its flag is set, and the second maps a name given on the command line to that flag, so that you can run your pass through a new, customized shortcut.
   //PTXOptimizer::optimize() function.
   if( passes & SplitBasicBlocks )
    {
      transforms::Pass* pass = new transforms::SplitBasicBlockPass(
        basicBlockSize );
      manager.addPass( *pass );
    }
    if( passes & SyncElimination )
    {
      transforms::Pass* pass = new transforms::SyncEliminationPass;
      manager.addPass( *pass );
    }
    if( passes & PrettyPrint )
    {
      transforms::Pass* pass = new transforms::PrettyPrinterPass;
      manager.addPass( *pass );
    }
    //Your if should go here, with a similar format.
    //parsePassTypes( const std::string& passList ) function
    else if( *pass == "split-blocks" )
    {
      report( "  Matched split-blocks." );
      types |= tools::PTXOptimizer::SplitBasicBlocks;
    }
    else if( *pass == "sync-elimination" )
    {
      report( "  Matched sync-elimination." );
      types |= tools::PTXOptimizer::SyncElimination;
    }
    else if( *pass == "pretty-print" )
    {
      report( "  Matched pretty-print." );
      types |= tools::PTXOptimizer::PrettyPrint;
    }
    //Your pass should go here, above that if( !pass->empty() ).
    else if( !pass->empty() )
    {
      std::cout << "==Ocelot== Warning: Unknown pass name - '" << *pass
        << "'\n";
    }
Besides that, there is an addition to be made to ocelot/tools/PTXOptimizer.h: in the enum PassType, you have to add an enumerator representing your pass and assign it a hexadecimal value that uses a single bit not taken by any other pass (the values are combined as bit flags):
     /*! \brief Bit flags naming the available PTX to PTX passes; every
         enumerator owns a single bit, so a set of passes is the bitwise
         OR of its members */
      enum PassType
      {
        InvalidPassType      = 0x000,
        RemoveBarriers       = 0x001,
        ReverseIfConversion  = 0x002,
        SubkernelFormation   = 0x004,
        StructuralTransform  = 0x008,
        MIMDThreadScheduling = 0x010,
        DeadCodeElimination  = 0x020,
        SplitBasicBlocks     = 0x040,
        SyncElimination      = 0x080,
        PrettyPrint          = 0x100
        // Add the enumerator for your own pass here, using the next free
        // bit (0x200), and put a comma after the enumerator above it.
      };
The last change that needs to be made is in ocelot/api/interface/OcelotRuntime.h, where you should include the header of your pass and add a member variable of your pass type to the OcelotRuntime class:
#include "ocelot/api/interface/OcelotConfiguration.h"
#include "ocelot/trace/interface/MemoryChecker.h"
#include "ocelot/trace/interface/MemoryRaceDetector.h"
#include "ocelot/trace/interface/InteractiveDebugger.h"
#include "ocelot/transforms/interface/StructuralTransform.h"
#include "ocelot/transforms/interface/ConvertPredicationToSelectPass.h"
#include "ocelot/transforms/interface/LinearScanRegisterAllocationPass.h"
#include "ocelot/transforms/interface/MIMDThreadSchedulingPass.h"
#include "ocelot/transforms/interface/SyncEliminationPass.h"
#include "ocelot/transforms/interface/PrettyPrinterPass.h"
namespace ocelot
{
  /*! \brief This is an interface for managing state associated with Ocelot

      The class holds one instance of each trace generator and of each
      transformation pass as a private member, so a new pass is made
      available to the runtime by adding one more member below.
   */
  class OcelotRuntime {
  private:
    // Trace generators owned by the runtime.
    trace::MemoryChecker _memoryChecker;
    trace::MemoryRaceDetector _raceDetector;
    trace::InteractiveDebugger _debugger;
    // Transformation passes owned by the runtime, one member per pass.
    transforms::StructuralTransform _structuralTransform;
    transforms::ConvertPredicationToSelectPass _predicationToSelect;
    transforms::LinearScanRegisterAllocationPass _linearScanAllocation;
    transforms::MIMDThreadSchedulingPass _mimdThreadScheduling;
    transforms::SyncEliminationPass _syncElimination;
    // The pass written in this tutorial.
    transforms::PrettyPrinterPass _prettyPrinterPass;
    // Declare the member for your own pass here.
    // NOTE(review): presumably records whether configure() has already
    // run -- confirm in OcelotRuntime.cpp.
    bool _initialized;
  public:
    //! \brief initializes Ocelot runtime state
    OcelotRuntime();
    //! \brief initializes the Ocelot runtime object with the
    //         Ocelot configuration object
    void configure( const api::OcelotConfiguration &c );
  };
}
#endif
The last step is to test your pass. To do that, you can run it on the benchmark of your choice, for example compute_20_ORI_babda9cd76164acf. With that benchmark chosen, execute PTXOptimizer -i compute_20_ORI_babda9cd76164acf -p pass-shortcut-here -o out.ptx in the /home/ocelot/tests/ok/sdk3.1/ directory, replacing pass-shortcut-here with the name you registered in parsePassTypes (for the pass above, pretty-print). Below we present a small benchmark example:
.version 2.2
   .target sm_20
   .file 1 "<command-line>"
   .file 2 "/tmp/tmpxft_000013cc_00000000-6_vectorAdd.cudafe2.gpu"
   .file 3 "/usr/lib/gcc/i486-linux-gnu/4.4.3/include/stddef.h"
   .file 4 "/usr/local/cuda/include/crt/device_runtime.h"
   .file 5 "/usr/local/cuda/include/host_defines.h"
   .file 6 "/usr/local/cuda/include/builtin_types.h"
   .file 7 "/usr/local/cuda/include/device_types.h"
   .file 8 "/usr/local/cuda/include/driver_types.h"
   .file 9 "/usr/local/cuda/include/surface_types.h"
   .file 10  "/usr/local/cuda/include/texture_types.h"
   .file 11  "/usr/local/cuda/include/vector_types.h"
   .file 12  "/usr/local/cuda/include/device_launch_parameters.h"
   .file 13  "/usr/local/cuda/include/crt/storage_class.h"
   .file 14  "/usr/include/bits/types.h"
   .file 15  "/usr/include/time.h"
   .file 16  "/usr/local/cuda/include/texture_fetch_functions.h"
   .file 17  "/usr/local/cuda/include/common_functions.h"
   .file 18  "/usr/local/cuda/include/math_functions.h"
   .file 19  "/usr/local/cuda/include/math_constants.h"
   .file 20  "/usr/local/cuda/include/device_functions.h"
   .file 21  "/usr/local/cuda/include/sm_11_atomic_functions.h"
   .file 22  "/usr/local/cuda/include/sm_12_atomic_functions.h"
   .file 23  "/usr/local/cuda/include/sm_13_double_functions.h"
   .file 24  "/usr/local/cuda/include/sm_20_atomic_functions.h"
   .file 25  "/usr/local/cuda/include/sm_20_intrinsics.h"
   .file 26  "/usr/local/cuda/include/surface_functions.h"
   .file 27  "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
   .file 28  "vectorAdd.cu"
   // Kernel taking three 32-bit parameters A, B, C and a signed length N.
   .entry _Z6VecAddPKfS0_Pfi (
     .param .u32 __cudaparm__Z6VecAddPKfS0_Pfi_A,
     .param .u32 __cudaparm__Z6VecAddPKfS0_Pfi_B,
     .param .u32 __cudaparm__Z6VecAddPKfS0_Pfi_C,
     .param .s32 __cudaparm__Z6VecAddPKfS0_Pfi_N)
   {
   // Register ranges are sized to cover the highest register used below
   // (%r13, %f3, %p1).
   .reg .u32 %r<15>;
   .reg .f32 %f<5>;
   .reg .pred %p<3>;
   .loc  28  43  0
 $LDWbegin__Z6VecAddPKfS0_Pfi:
   // %r5 = %ctaid.x * %ntid.x + %tid.x
   mov.u32   %r1, %ctaid.x;
   mov.u32   %r2, %ntid.x;
   mul.lo.u32  %r3, %r1, %r2;
   mov.u32   %r4, %tid.x;
   add.u32   %r5, %r4, %r3;
   // Skip the body when N <= %r5.
   ld.param.s32  %r6, [__cudaparm__Z6VecAddPKfS0_Pfi_N];
   setp.le.s32   %p1, %r6, %r5;
   @%p1 bra  $Lt_0_1026;
   .loc  28  47  0
   // %r7 = %r5 * 4 is the byte offset added to each of A, B and C.
   mul.lo.u32  %r7, %r5, 4;
   ld.param.u32  %r8, [__cudaparm__Z6VecAddPKfS0_Pfi_A];
   add.u32   %r9, %r8, %r7;
   ld.global.f32   %f1, [%r9+0];
   ld.param.u32  %r10, [__cudaparm__Z6VecAddPKfS0_Pfi_B];
   add.u32   %r11, %r10, %r7;
   ld.global.f32   %f2, [%r11+0];
   // Store the sum of the two loaded values at C + offset.
   add.f32   %f3, %f1, %f2;
   ld.param.u32  %r12, [__cudaparm__Z6VecAddPKfS0_Pfi_C];
   add.u32   %r13, %r12, %r7;
   st.global.f32   [%r13+0], %f3;
 $Lt_0_1026:
   .loc  28  48  0
   exit;
 $LDWend__Z6VecAddPKfS0_Pfi:
   } // _Z6VecAddPKfS0_Pfi
Thanks to Alberto de Sá Cavalcanti de Albuquerque for providing this tutorial; the original version may be found here