Skip to content

Commit f4bc8a7

Browse files
jaladreipsigcbot
authored and committed
Reduce rayquery state by only keeping track of the slot used instead of the whole pointer to the global buffer
Reduce rayquery state by only keeping track of the slot used instead of the whole pointer to the global buffer. We can rematerialize (remat) the pointer at the point of use; this should help a little bit with register pressure.
1 parent bfd5810 commit f4bc8a7

File tree

4 files changed

+98
-82
lines changed

4 files changed

+98
-82
lines changed

IGC/AdaptorCommon/RayTracing/NewTraceRayInlineLoweringPass.cpp

Lines changed: 64 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ bool InlineRaytracing::LowerAllocations(Function &F) {
3535
/*
3636
* struct RQObjectType
3737
* {
38-
* void* addrspace(2) globalBufferPtr;
38+
* uint32_t slot;
3939
* uint32_t rayQueryPackedData;
4040
* }
4141
*/
@@ -51,21 +51,21 @@ bool InlineRaytracing::LowerAllocations(Function &F) {
5151
if (AllocateRQInstructions.empty())
5252
return false;
5353

54+
auto *rtstackTy = IRB.getRTStack2PtrTy(false);
5455
auto *globalBufferPtrTy =
5556
IRB.getRayDispatchGlobalDataPtrTy(*F.getParent(), ADDRESS_SPACE_CONSTANT);
5657

5758
if (!m_RQObjectType)
58-
m_RQObjectType = StructType::create(
59-
*m_pCGCtx->getLLVMContext(),
60-
{globalBufferPtrTy, IRB.getInt32Ty()}, name);
59+
m_RQObjectType =
60+
StructType::create(*m_pCGCtx->getLLVMContext(),
61+
{IRB.getInt32Ty(), IRB.getInt32Ty()}, name);
6162

62-
auto *getGlobalBufferPtrFnTy =
63-
FunctionType::get(globalBufferPtrTy, IRB.getInt32Ty(), false);
64-
auto *getGlobalBufferPtrFn = m_Functions[GET_GLOBAL_BUFFER_PTR] =
65-
Function::Create(getGlobalBufferPtrFnTy, GlobalValue::PrivateLinkage,
66-
VALUE_NAME("getGlobalBufferPtrFn"), F.getParent());
63+
auto *createRQObjectFnTy = FunctionType::get(m_RQObjectType->getPointerTo(),
64+
IRB.getInt32Ty(), false);
65+
auto *createRQObjectFn = m_Functions[CREATE_RQ_OBJECT] =
66+
Function::Create(createRQObjectFnTy, GlobalValue::PrivateLinkage,
67+
VALUE_NAME("createRQObject"), F.getParent());
6768

68-
auto *rtstackTy = IRB.getRTStack2PtrTy(false);
6969
auto *getStackPointerFromGlobalBufferPtrFnTy =
7070
FunctionType::get(rtstackTy, globalBufferPtrTy, false);
7171
auto *getStackPointerFromGlobalBufferPtrFn =
@@ -84,7 +84,6 @@ bool InlineRaytracing::LowerAllocations(Function &F) {
8484

8585
getStackPointerFromGlobalBufferPtrFn->addParamAttr(
8686
0, llvm::Attribute::NoCapture);
87-
getRQHandleFromRQObjectFn->addParamAttr(0, llvm::Attribute::NoCapture);
8887

8988
// allocate rayquery instructions return i32 handle
9089
// we want all rayqueries to be represent via our struct
@@ -98,42 +97,35 @@ bool InlineRaytracing::LowerAllocations(Function &F) {
9897
cast<CallInst>(I)->addFnAttr(llvm::Attribute::ReadOnly);
9998

10099
IRB.SetInsertPoint(F.getEntryBlock().getFirstNonPHI());
101-
auto *aI = IRB.CreateAlloca(m_RQObjectType);
100+
Value *rqObject = nullptr;
102101

103102
IRB.SetInsertPoint(I);
104103
if (m_pCGCtx->syncRTCallsNeedSplitting()) {
105-
// create 2 globals and select the appropriate one depending on the lane
106-
// id
107-
auto *globalBufferPtr1 = IRB.CreateCall(
108-
getGlobalBufferPtrFn, UndefValue::get(IRB.getInt32Ty()));
109-
auto *globalBufferPtr2 = IRB.CreateCall(
110-
getGlobalBufferPtrFn, UndefValue::get(IRB.getInt32Ty()));
104+
// create 2 rq objects and select one based on the lane id
105+
106+
auto *rqObject1 =
107+
IRB.CreateCall(createRQObjectFn, UndefValue::get(IRB.getInt32Ty()));
108+
auto *rqObject2 =
109+
IRB.CreateCall(createRQObjectFn, UndefValue::get(IRB.getInt32Ty()));
110+
111111
auto *laneId = IRB.get32BitLaneID();
112112
auto *cond =
113113
IRB.CreateICmpULT(laneId, IRB.getInt32(numLanes(SIMDMode::SIMD16)));
114-
auto *globalBufferPtr =
115-
IRB.CreateSelect(cond, globalBufferPtr1, globalBufferPtr2,
116-
VALUE_NAME("GlobalBufferPtr"));
117-
IRB.CreateStore(globalBufferPtr, IRB.CreateInBoundsGEP(
118-
m_RQObjectType, aI,
119-
{IRB.getInt32(0), IRB.getInt32(0)}));
114+
rqObject =
115+
IRB.CreateSelect(cond, rqObject1, rqObject2, VALUE_NAME("rqObject"));
120116
} else {
121-
auto *globalBufferPtr = IRB.CreateCall(getGlobalBufferPtrFn,
122-
UndefValue::get(IRB.getInt32Ty()),
123-
VALUE_NAME("GlobalBufferPtr"));
124-
IRB.CreateStore(globalBufferPtr, IRB.CreateInBoundsGEP(
125-
m_RQObjectType, aI,
126-
{IRB.getInt32(0), IRB.getInt32(0)}));
117+
rqObject =
118+
IRB.CreateCall(createRQObjectFn, UndefValue::get(IRB.getInt32Ty()));
127119
}
128120

129121
// iniitalize it to done so when app calls proceed without tracerayinline,
130122
// we dont traverse over garbage
131-
setPackedData(IRB, aI,
123+
setPackedData(IRB, rqObject,
132124
{IRB.getInt32(TRACE_RAY_DONE), IRB.getInt32(0),
133125
IRB.getInt32(0), IRB.getInt32(0),
134126
IRB.getInt32(CommittedHit)});
135127

136-
v2vMap[I] = aI;
128+
v2vMap[I] = rqObject;
137129

138130
SmallVector<Use *> worklist;
139131

@@ -632,7 +624,7 @@ InlineRaytracing::LivenessDataMap
632624
InlineRaytracing::AnalyzeLiveness(Function &F, DominatorTree &DT,
633625
LoopInfo &LI) {
634626
LivenessDataMap data;
635-
for (auto *I : m_Functions[GET_GLOBAL_BUFFER_PTR]->users()) {
627+
for (auto *I : m_Functions[CREATE_RQ_OBJECT]->users()) {
636628
data.insert(
637629
std::make_pair(cast<Instruction>(I),
638630
ProcessInstruction(cast<Instruction>(I), DT, LI)));
@@ -641,8 +633,8 @@ InlineRaytracing::AnalyzeLiveness(Function &F, DominatorTree &DT,
641633
return data;
642634
}
643635

644-
void InlineRaytracing::AssignGlobalBuffers(
645-
Function &F, const LivenessDataMap &livenessDataMap) {
636+
void InlineRaytracing::AssignSlots(Function &F,
637+
const LivenessDataMap &livenessDataMap) {
646638
RTBuilder IRB(&*F.getEntryBlock().begin(), *m_pCGCtx);
647639
SmallVector<SmallVector<const LivenessData *>, 2> occupancyMap;
648640

@@ -707,7 +699,7 @@ void InlineRaytracing::InsertCacheControl(
707699
}
708700

709701
void InlineRaytracing::StopAndStartRayquery(RTBuilder &IRB, Instruction *I,
710-
Instruction *globalBufferPtr,
702+
Value *globalBufferPtr,
711703
bool doSpillFill,
712704
bool doRQCheckRelease) {
713705
IRB.SetInsertPoint(I);
@@ -718,7 +710,7 @@ void InlineRaytracing::StopAndStartRayquery(RTBuilder &IRB, Instruction *I,
718710
auto *stackPtr = static_cast<RTBuilder::SyncStackPointerVal *>(
719711
cast<Value>(IRB.CreateCall(
720712
m_Functions[GET_STACK_POINTER_FROM_GLOBAL_BUFFER_POINTER],
721-
IRB.Insert(globalBufferPtr->clone()))));
713+
globalBufferPtr)));
722714

723715
liveStack = IRB.CreateLoad(IRB.getRTStack2Ty(), stackPtr);
724716

@@ -741,7 +733,7 @@ void InlineRaytracing::StopAndStartRayquery(RTBuilder &IRB, Instruction *I,
741733
auto *stackPtr = static_cast<RTBuilder::SyncStackPointerVal *>(
742734
cast<Value>(IRB.CreateCall(
743735
m_Functions[GET_STACK_POINTER_FROM_GLOBAL_BUFFER_POINTER],
744-
IRB.Insert(globalBufferPtr->clone()))));
736+
globalBufferPtr)));
745737

746738
IRB.CreateStore(liveStack, stackPtr);
747739
}
@@ -781,7 +773,7 @@ void InlineRaytracing::HandleOptimizationsAndSpills(
781773
for (auto &entry : livenessDataMap) {
782774
bool cfgChanged = false;
783775

784-
auto *globalBufferPtr = entry.first;
776+
auto *rqObject = entry.first;
785777
auto *LD = &entry.second;
786778

787779
// process the allocation acquire point
@@ -796,10 +788,12 @@ void InlineRaytracing::HandleOptimizationsAndSpills(
796788
for (auto *I : LD->lifetimeEndInstructions) {
797789
IRB.SetInsertPoint(I->getNextNode());
798790

791+
auto *globalBufferPtr =
792+
getGlobalBufferPtr(IRB, IRB.Insert(rqObject->clone()));
799793
auto *stackPtr = static_cast<RTBuilder::SyncStackPointerVal *>(
800794
cast<Value>(IRB.CreateCall(
801795
m_Functions[GET_STACK_POINTER_FROM_GLOBAL_BUFFER_POINTER],
802-
IRB.Insert(globalBufferPtr->clone()))));
796+
globalBufferPtr)));
803797

804798
// handle cache control
805799
InsertCacheControl(IRB, stackPtr);
@@ -824,10 +818,13 @@ void InlineRaytracing::HandleOptimizationsAndSpills(
824818

825819
IRB.SetInsertPoint(succ->getFirstNonPHI());
826820

821+
auto *globalBufferPtr =
822+
getGlobalBufferPtr(IRB, IRB.Insert(rqObject->clone()));
823+
827824
auto *stackPtr = static_cast<RTBuilder::SyncStackPointerVal *>(
828825
cast<Value>(IRB.CreateCall(
829826
m_Functions[GET_STACK_POINTER_FROM_GLOBAL_BUFFER_POINTER],
830-
IRB.Insert(globalBufferPtr->clone()))));
827+
globalBufferPtr)));
831828

832829
// handle cache control
833830
InsertCacheControl(IRB, stackPtr);
@@ -844,6 +841,9 @@ void InlineRaytracing::HandleOptimizationsAndSpills(
844841
if (!LD->ContainsInstruction(*I))
845842
continue;
846843

844+
auto *globalBufferPtr =
845+
getGlobalBufferPtr(IRB, IRB.Insert(rqObject->clone()));
846+
847847
StopAndStartRayquery(IRB, I, globalBufferPtr, true, doRQCheckRelease);
848848
}
849849

@@ -852,6 +852,9 @@ void InlineRaytracing::HandleOptimizationsAndSpills(
852852
if (!LD->ContainsInstruction(*I))
853853
continue;
854854

855+
auto *globalBufferPtr =
856+
getGlobalBufferPtr(IRB, IRB.Insert(rqObject->clone()));
857+
855858
StopAndStartRayquery(IRB, I, globalBufferPtr, true, doRQCheckRelease);
856859
}
857860

@@ -860,6 +863,9 @@ void InlineRaytracing::HandleOptimizationsAndSpills(
860863
if (!LD->ContainsInstruction(*I))
861864
continue;
862865

866+
auto *globalBufferPtr =
867+
getGlobalBufferPtr(IRB, IRB.Insert(rqObject->clone()));
868+
863869
StopAndStartRayquery(IRB, I, globalBufferPtr, true, doRQCheckRelease);
864870
}
865871

@@ -868,11 +874,14 @@ void InlineRaytracing::HandleOptimizationsAndSpills(
868874
if (!LD->ContainsInstruction(*I))
869875
continue;
870876

877+
auto *globalBufferPtr =
878+
getGlobalBufferPtr(IRB, IRB.Insert(rqObject->clone()));
879+
871880
StopAndStartRayquery(IRB, I, globalBufferPtr, false, doRQCheckRelease);
872881
}
873882

874883
if (cfgChanged) {
875-
auto nextentry = livenessDataMap.find(globalBufferPtr);
884+
auto nextentry = livenessDataMap.find(rqObject);
876885

877886
// TODO: can we incrementally update LoopInfo and DomTree?
878887
DT.recalculate(F);
@@ -887,43 +896,23 @@ void InlineRaytracing::HandleOptimizationsAndSpills(
887896
}
888897
}
889898

890-
void InlineRaytracing::LowerGlobalBufferPtrs(Function &F) {
899+
void InlineRaytracing::LowerSlotAssignments(Function &F) {
891900
RTBuilder IRB(&*F.getEntryBlock().begin(), *m_pCGCtx);
892-
SmallVector<Instruction *> getGBPInstructions;
901+
SmallVector<Instruction *> createRQInstructions;
893902

894-
for (auto &U : m_Functions[GET_GLOBAL_BUFFER_PTR]->uses())
895-
getGBPInstructions.push_back(cast<Instruction>(U.getUser()));
896-
897-
for (auto &I : getGBPInstructions) {
898-
uint32_t slot = static_cast<uint32_t>(
899-
cast<ConstantInt>(I->getOperand(0))->getZExtValue());
903+
for (auto &U : m_Functions[CREATE_RQ_OBJECT]->uses())
904+
createRQInstructions.push_back(cast<Instruction>(U.getUser()));
900905

906+
for (auto *I : createRQInstructions) {
901907
IRB.SetInsertPoint(I);
902-
auto *pFunc = GenISAIntrinsic::getDeclaration(
903-
F.getParent(), GenISAIntrinsic::GenISA_RuntimeValue,
904-
IRB.getRayDispatchGlobalDataPtrTy(*m_pCGCtx->getModule(),
905-
ADDRESS_SPACE_CONSTANT));
906-
907-
auto *mainGlobalBufferPtr = IRB.CreateCall(
908-
pFunc,
909-
IRB.getInt32(
910-
m_pCGCtx->getModuleMetaData()->pushInfo.inlineRTGlobalPtrOffset));
911-
912-
uint32_t offset =
913-
slot * IGC::Align(sizeof(RayDispatchGlobalData), IGC::RTGlobalsAlign);
914-
915-
auto *globalBufferPtr = IRB.CreateBitCast(
916-
mainGlobalBufferPtr, IRB.getInt8PtrTy(ADDRESS_SPACE_CONSTANT));
917-
globalBufferPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), globalBufferPtr,
918-
IRB.getInt32(offset));
919-
globalBufferPtr = IRB.CreateBitCast(
920-
globalBufferPtr, mainGlobalBufferPtr->getType(),
921-
VALUE_NAME("globalBuffer[" + std::to_string(slot) + "]"));
922-
923-
I->replaceAllUsesWith(globalBufferPtr);
908+
auto *rqObject = IRB.CreateAlloca(m_RQObjectType);
909+
auto *slotPtr = getAtIndexFromRayQueryObject(IRB, rqObject, 0);
910+
IRB.CreateStore(I->getOperand(0), slotPtr);
911+
912+
I->replaceAllUsesWith(rqObject);
924913
}
925914

926-
llvm::for_each(getGBPInstructions,
915+
llvm::for_each(createRQInstructions,
927916
[](Instruction *I) { I->eraseFromParent(); });
928917
}
929918

@@ -975,9 +964,9 @@ bool InlineRaytracing::runOnFunction(Function &F) {
975964
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
976965

977966
auto livenessData = AnalyzeLiveness(F, DT, LI);
978-
AssignGlobalBuffers(F, livenessData);
967+
AssignSlots(F, livenessData);
979968
HandleOptimizationsAndSpills(F, livenessData, DT, LI);
980-
LowerGlobalBufferPtrs(F);
969+
LowerSlotAssignments(F);
981970
LowerStackPtrs(F);
982971

983972
// set relevant metadata

IGC/AdaptorCommon/RayTracing/NewTraceRayInlineLoweringPass.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,13 @@ class InlineRaytracing : public AllocationLivenessAnalyzer {
4343
bool LowerAllocations(llvm::Function &F);
4444
LivenessDataMap AnalyzeLiveness(llvm::Function &F, llvm::DominatorTree &DT,
4545
llvm::LoopInfo &LI);
46-
void AssignGlobalBuffers(llvm::Function &F,
46+
void AssignSlots(llvm::Function &F,
4747
const LivenessDataMap &livenessDataMap);
4848
void HandleOptimizationsAndSpills(llvm::Function &F,
4949
LivenessDataMap &livenessDataMap,
5050
llvm::DominatorTree &DT,
5151
llvm::LoopInfo &LI);
52-
void LowerGlobalBufferPtrs(llvm::Function &F);
52+
void LowerSlotAssignments(llvm::Function &F);
5353
void LowerStackPtrs(llvm::Function &F);
5454

5555
enum RQTraceRayCtrl : uint8_t {
@@ -65,9 +65,9 @@ class InlineRaytracing : public AllocationLivenessAnalyzer {
6565
};
6666

6767
enum Functions : uint8_t {
68-
GET_GLOBAL_BUFFER_PTR,
6968
GET_STACK_POINTER_FROM_GLOBAL_BUFFER_POINTER,
7069
GET_RQ_HANDLE_FROM_RQ_OJECT,
70+
CREATE_RQ_OBJECT,
7171
NUM_FUNCTIONS
7272
};
7373

@@ -92,8 +92,8 @@ class InlineRaytracing : public AllocationLivenessAnalyzer {
9292
}
9393

9494
llvm::Value *getGlobalBufferPtr(llvm::RTBuilder &IRB, llvm::Value *rqObject) {
95-
return IRB.CreateLoad(m_Functions[GET_GLOBAL_BUFFER_PTR]->getReturnType(),
96-
getAtIndexFromRayQueryObject(IRB, rqObject, 0));
95+
auto *slot = IRB.CreateLoad(IRB.getInt32Ty(), getAtIndexFromRayQueryObject(IRB, rqObject, 0));
96+
return IRB.getGlobalBufferPtrForSlot(ADDRESS_SPACE_CONSTANT, slot);
9797
}
9898

9999
struct UnpackedData {
@@ -180,7 +180,7 @@ class InlineRaytracing : public AllocationLivenessAnalyzer {
180180
void InsertCacheControl(llvm::RTBuilder &IRB,
181181
llvm::RTBuilder::SyncStackPointerVal *stackPtr);
182182
void StopAndStartRayquery(llvm::RTBuilder &IRB, llvm::Instruction *I,
183-
llvm::Instruction *globalBufferPtr,
183+
llvm::Value *globalBufferPtr,
184184
bool doSpillFill, bool doRQCheckRelease);
185185

186186
};

IGC/AdaptorCommon/RayTracing/RTBuilder.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1636,6 +1636,31 @@ Value* RTBuilder::getGlobalBufferPtr(IGC::ADDRESS_SPACE Addrspace)
16361636
return CI;
16371637
}
16381638

1639+
Value *RTBuilder::getGlobalBufferPtrForSlot(IGC::ADDRESS_SPACE Addrspace, Value* slot) {
1640+
auto *pFunc = GenISAIntrinsic::getDeclaration(
1641+
Ctx.getModule(), GenISAIntrinsic::GenISA_RuntimeValue,
1642+
getRayDispatchGlobalDataPtrTy(*Ctx.getModule(),
1643+
ADDRESS_SPACE_CONSTANT));
1644+
1645+
auto *mainGlobalBufferPtr = CreateCall(
1646+
pFunc,
1647+
getInt32(Ctx.getModuleMetaData()->pushInfo.inlineRTGlobalPtrOffset),
1648+
VALUE_NAME("globalBufferPtrFromRuntimeValue"));
1649+
1650+
auto *offset = CreateMul(
1651+
slot,
1652+
getInt32(IGC::Align(sizeof(RayDispatchGlobalData), IGC::RTGlobalsAlign)));
1653+
1654+
auto *globalBufferPtr = CreateBitCast(
1655+
mainGlobalBufferPtr, getInt8PtrTy(ADDRESS_SPACE_CONSTANT));
1656+
globalBufferPtr = CreateInBoundsGEP(getInt8Ty(), globalBufferPtr, offset);
1657+
globalBufferPtr = CreateBitCast(
1658+
globalBufferPtr, mainGlobalBufferPtr->getType(),
1659+
VALUE_NAME("globalBuffer[]"));
1660+
1661+
return globalBufferPtr;
1662+
}
1663+
16391664

16401665
Value* RTBuilder::getSyncStackID()
16411666
{

IGC/AdaptorCommon/RayTracing/RTBuilder.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,9 @@ class RTBuilder : public IGCIRBuilder<>
361361
Value* getHitBaryCentric(StackPointerVal* StackPointer, uint32_t idx, Value* CommittedHit);
362362

363363

364-
Value* getGlobalBufferPtr(IGC::ADDRESS_SPACE Addrspace = IGC::ADDRESS_SPACE_CONSTANT);
364+
Value *getGlobalBufferPtr(
365+
IGC::ADDRESS_SPACE Addrspace = IGC::ADDRESS_SPACE_CONSTANT);
366+
Value *getGlobalBufferPtrForSlot(IGC::ADDRESS_SPACE Addrspace, Value *slot);
365367
Value* getSyncStackID();
366368
Value* getSyncStackOffset(bool rtMemBasePtr = true);
367369
SpillValueIntrinsic* getSpillValue(Value* Val, uint64_t Offset);

0 commit comments

Comments
 (0)