diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp index 88aef89ddd2b9..50d4c035d30a1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp @@ -671,9 +671,12 @@ RValue CIRGenFunction::emitCall(const CIRGenFunctionInfo &funcInfo, return RValue::get(results[0]); } - case cir::TEK_Complex: - cgm.errorNYI(loc, "unsupported evaluation kind of function call result"); - return getUndefRValue(retTy); + case cir::TEK_Complex: { + mlir::ResultRange results = theCall->getOpResults(); + assert(!results.empty() && + "Expected at least one result for complex rvalue"); + return RValue::getComplex(results[0]); + } } llvm_unreachable("Invalid evaluation kind"); } diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.h b/clang/lib/CIR/CodeGen/CIRGenCleanup.h index 9acf8b1f20e79..61a09a59b05c0 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.h +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.h @@ -15,6 +15,7 @@ #define CLANG_LIB_CIR_CODEGEN_CIRGENCLEANUP_H #include "Address.h" +#include "CIRGenModule.h" #include "EHScopeStack.h" #include "mlir/IR/Value.h" @@ -257,5 +258,53 @@ inline void EHScopeStack::popCatch() { deallocate(EHCatchScope::getSizeForNumHandlers(scope.getNumHandlers())); } +/// The exceptions personality for a function. +struct EHPersonality { + const char *personalityFn = nullptr; + + // If this is non-null, this personality requires a non-standard + // function for rethrowing an exception after a catchall cleanup. + // This function must have prototype void(void*). + const char *catchallRethrowFn = nullptr; + + static const EHPersonality &get(CIRGenModule &cgm, + const clang::FunctionDecl *fd); + static const EHPersonality &get(CIRGenFunction &cgf); + + static const EHPersonality GNU_C; + static const EHPersonality GNU_C_SJLJ; + static const EHPersonality GNU_C_SEH; + static const EHPersonality GNU_ObjC; + static const EHPersonality GNU_ObjC_SJLJ; + static const EHPersonality GNU_ObjC_SEH; + static const EHPersonality GNUstep_ObjC; + static const EHPersonality GNU_ObjCXX; + static const EHPersonality NeXT_ObjC; + static const EHPersonality GNU_CPlusPlus; + static const EHPersonality GNU_CPlusPlus_SJLJ; + static const EHPersonality GNU_CPlusPlus_SEH; + static const EHPersonality MSVC_except_handler; + static const EHPersonality MSVC_C_specific_handler; + static const EHPersonality MSVC_CxxFrameHandler3; + static const EHPersonality GNU_Wasm_CPlusPlus; + static const EHPersonality XL_CPlusPlus; + static const EHPersonality ZOS_CPlusPlus; + + /// Does this personality use landingpads or the family of pad instructions + /// designed to form funclets? 
+ bool usesFuncletPads() const { + return isMSVCPersonality() || isWasmPersonality(); + } + + bool isMSVCPersonality() const { + return this == &MSVC_except_handler || this == &MSVC_C_specific_handler || + this == &MSVC_CxxFrameHandler3; + } + + bool isWasmPersonality() const { return this == &GNU_Wasm_CPlusPlus; } + + bool isMSVCXXPersonality() const { return this == &MSVC_CxxFrameHandler3; } +}; + } // namespace clang::CIRGen #endif // CLANG_LIB_CIR_CODEGEN_CIRGENCLEANUP_H diff --git a/clang/lib/CIR/CodeGen/CIRGenException.cpp b/clang/lib/CIR/CodeGen/CIRGenException.cpp index 717a3e0032cea..67f46ffde8fda 100644 --- a/clang/lib/CIR/CodeGen/CIRGenException.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenException.cpp @@ -18,6 +18,171 @@ using namespace clang; using namespace clang::CIRGen; +const EHPersonality EHPersonality::GNU_C = {"__gcc_personality_v0", nullptr}; +const EHPersonality EHPersonality::GNU_C_SJLJ = {"__gcc_personality_sj0", + nullptr}; +const EHPersonality EHPersonality::GNU_C_SEH = {"__gcc_personality_seh0", + nullptr}; +const EHPersonality EHPersonality::NeXT_ObjC = {"__objc_personality_v0", + nullptr}; +const EHPersonality EHPersonality::GNU_CPlusPlus = {"__gxx_personality_v0", + nullptr}; +const EHPersonality EHPersonality::GNU_CPlusPlus_SJLJ = { + "__gxx_personality_sj0", nullptr}; +const EHPersonality EHPersonality::GNU_CPlusPlus_SEH = { + "__gxx_personality_seh0", nullptr}; +const EHPersonality EHPersonality::GNU_ObjC = {"__gnu_objc_personality_v0", + "objc_exception_throw"}; +const EHPersonality EHPersonality::GNU_ObjC_SJLJ = { + "__gnu_objc_personality_sj0", "objc_exception_throw"}; +const EHPersonality EHPersonality::GNU_ObjC_SEH = { + "__gnu_objc_personality_seh0", "objc_exception_throw"}; +const EHPersonality EHPersonality::GNU_ObjCXX = { + "__gnustep_objcxx_personality_v0", nullptr}; +const EHPersonality EHPersonality::GNUstep_ObjC = { + "__gnustep_objc_personality_v0", nullptr}; +const EHPersonality EHPersonality::MSVC_except_handler = {"_except_handler3", + nullptr}; +const EHPersonality EHPersonality::MSVC_C_specific_handler = { + "__C_specific_handler", nullptr}; +const EHPersonality EHPersonality::MSVC_CxxFrameHandler3 = { + "__CxxFrameHandler3", nullptr}; +const EHPersonality EHPersonality::GNU_Wasm_CPlusPlus = { + "__gxx_wasm_personality_v0", nullptr}; +const EHPersonality EHPersonality::XL_CPlusPlus = {"__xlcxx_personality_v1", + nullptr}; +const EHPersonality EHPersonality::ZOS_CPlusPlus = {"__zos_cxx_personality_v2", + nullptr}; + +static const EHPersonality &getCPersonality(const TargetInfo &target, + const CodeGenOptions &cgOpts) { + const llvm::Triple &triple = target.getTriple(); + if (triple.isWindowsMSVCEnvironment()) + return EHPersonality::MSVC_CxxFrameHandler3; + if (cgOpts.hasSjLjExceptions()) + return EHPersonality::GNU_C_SJLJ; + if (cgOpts.hasDWARFExceptions()) + return EHPersonality::GNU_C; + if (cgOpts.hasSEHExceptions()) + return EHPersonality::GNU_C_SEH; + return EHPersonality::GNU_C; +} + +static const EHPersonality &getObjCPersonality(const TargetInfo &target, + const LangOptions &langOpts, + const CodeGenOptions &cgOpts) { + const llvm::Triple &triple = target.getTriple(); + if (triple.isWindowsMSVCEnvironment()) + return EHPersonality::MSVC_CxxFrameHandler3; + + switch (langOpts.ObjCRuntime.getKind()) { + case ObjCRuntime::FragileMacOSX: + return getCPersonality(target, cgOpts); + case ObjCRuntime::MacOSX: + case ObjCRuntime::iOS: + case ObjCRuntime::WatchOS: + return EHPersonality::NeXT_ObjC; + case ObjCRuntime::GNUstep: + if 
(langOpts.ObjCRuntime.getVersion() >= VersionTuple(1, 7)) + return EHPersonality::GNUstep_ObjC; + [[fallthrough]]; + case ObjCRuntime::GCC: + case ObjCRuntime::ObjFW: + if (cgOpts.hasSjLjExceptions()) + return EHPersonality::GNU_ObjC_SJLJ; + if (cgOpts.hasSEHExceptions()) + return EHPersonality::GNU_ObjC_SEH; + return EHPersonality::GNU_ObjC; + } + llvm_unreachable("bad runtime kind"); +} + +static const EHPersonality &getCXXPersonality(const TargetInfo &target, + const CodeGenOptions &cgOpts) { + const llvm::Triple &triple = target.getTriple(); + if (triple.isWindowsMSVCEnvironment()) + return EHPersonality::MSVC_CxxFrameHandler3; + if (triple.isOSAIX()) + return EHPersonality::XL_CPlusPlus; + if (cgOpts.hasSjLjExceptions()) + return EHPersonality::GNU_CPlusPlus_SJLJ; + if (cgOpts.hasDWARFExceptions()) + return EHPersonality::GNU_CPlusPlus; + if (cgOpts.hasSEHExceptions()) + return EHPersonality::GNU_CPlusPlus_SEH; + if (cgOpts.hasWasmExceptions()) + return EHPersonality::GNU_Wasm_CPlusPlus; + return EHPersonality::GNU_CPlusPlus; +} + +/// Determines the personality function to use when both C++ +/// and Objective-C exceptions are being caught. +static const EHPersonality &getObjCXXPersonality(const TargetInfo &target, + const LangOptions &langOpts, + const CodeGenOptions &cgOpts) { + if (target.getTriple().isWindowsMSVCEnvironment()) + return EHPersonality::MSVC_CxxFrameHandler3; + + switch (langOpts.ObjCRuntime.getKind()) { + // In the fragile ABI, just use C++ exception handling and hope + // they're not doing crazy exception mixing. + case ObjCRuntime::FragileMacOSX: + return getCXXPersonality(target, cgOpts); + + // The ObjC personality defers to the C++ personality for non-ObjC + // handlers. Unlike the C++ case, we use the same personality + // function on targets using (backend-driven) SJLJ EH. + case ObjCRuntime::MacOSX: + case ObjCRuntime::iOS: + case ObjCRuntime::WatchOS: + return getObjCPersonality(target, langOpts, cgOpts); + + case ObjCRuntime::GNUstep: + return EHPersonality::GNU_ObjCXX; + + // The GCC runtime's personality function inherently doesn't support + // mixed EH. Use the ObjC personality just to avoid returning null. + case ObjCRuntime::GCC: + case ObjCRuntime::ObjFW: + return getObjCPersonality(target, langOpts, cgOpts); + } + llvm_unreachable("bad runtime kind"); +} + +static const EHPersonality &getSEHPersonalityMSVC(const llvm::Triple &triple) { + return triple.getArch() == llvm::Triple::x86 + ? EHPersonality::MSVC_except_handler + : EHPersonality::MSVC_C_specific_handler; +} + +const EHPersonality &EHPersonality::get(CIRGenModule &cgm, + const FunctionDecl *fd) { + const llvm::Triple &triple = cgm.getTarget().getTriple(); + const LangOptions &langOpts = cgm.getLangOpts(); + const CodeGenOptions &cgOpts = cgm.getCodeGenOpts(); + const TargetInfo &target = cgm.getTarget(); + + // Functions using SEH get an SEH personality. + if (fd && fd->usesSEHTry()) + return getSEHPersonalityMSVC(triple); + + if (langOpts.ObjC) { + return langOpts.CPlusPlus ? getObjCXXPersonality(target, langOpts, cgOpts) + : getObjCPersonality(target, langOpts, cgOpts); + } + return langOpts.CPlusPlus ? getCXXPersonality(target, cgOpts) + : getCPersonality(target, cgOpts); +} + +const EHPersonality &EHPersonality::get(CIRGenFunction &cgf) { + const auto *fg = cgf.curCodeDecl; + // For outlined finallys and filters, use the SEH personality in case they + // contain more SEH. This mostly only affects finallys. 
Filters could
+  // hypothetically use gnu statement expressions to sneak in nested SEH.
+  fg = fg ? fg : cgf.curSEHParent.getDecl();
+  return get(cgf.cgm, dyn_cast_or_null<FunctionDecl>(fg));
+}
+
 void CIRGenFunction::emitCXXThrowExpr(const CXXThrowExpr *e) {
   const llvm::Triple &triple = getTarget().getTriple();
   if (cgm.getLangOpts().OpenMPIsTargetDevice &&
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index d7911302df45c..c3fcd1a69a88e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -120,6 +120,8 @@ class CIRGenFunction : public CIRGenTypeCache {
   /// Tracks function scope overall cleanup handling.
   EHScopeStack ehStack;
 
+  GlobalDecl curSEHParent;
+
   llvm::DenseMap<const clang::ValueDecl *, clang::FieldDecl *>
       lambdaCaptureFields;
   clang::FieldDecl *lambdaThisCaptureField = nullptr;
diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp
index 4e89af44dcd18..3fb78dc871904 100644
--- a/clang/test/CIR/CodeGen/complex.cpp
+++ b/clang/test/CIR/CodeGen/complex.cpp
@@ -1468,3 +1468,30 @@ void calling_function_with_default_arg() {
 // OGCG: store float 0x40019999A0000000, ptr %[[DEFAULT_ARG_IMAG_PTR]], align 4
 // OGCG: %[[TMP_DEFAULT_ARG:.*]] = load <2 x float>, ptr %[[DEFAULT_ARG_ADDR]], align 4
 // OGCG: call void @_Z33function_with_complex_default_argCf(<2 x float> {{.*}} %[[TMP_DEFAULT_ARG]])
+
+void calling_function_that_return_complex() {
+  float _Complex a = complex_type_return_type();
+}
+
+// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a", init]
+// CIR: %[[RESULT:.*]] = cir.call @_Z24complex_type_return_typev() : () -> !cir.complex<!cir.float>
+// CIR: cir.store{{.*}} %[[RESULT]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
+
+// TODO(CIR): the difference between the CIR-generated LLVM and the OGCG output is due to the lack of calling convention lowering.
+
+// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4
+// LLVM: %[[RESULT:.*]] = call { float, float } @_Z24complex_type_return_typev()
+// LLVM: store { float, float } %[[RESULT]], ptr %[[A_ADDR]], align 4
+
+// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4
+// OGCG: %[[RESULT_ADDR:.*]] = alloca { float, float }, align 4
+// OGCG: %[[RESULT:.*]] = call noundef <2 x float> @_Z24complex_type_return_typev()
+// OGCG: store <2 x float> %[[RESULT]], ptr %[[RESULT_ADDR]], align 4
+// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT_ADDR]], i32 0, i32 0
+// OGCG: %[[RESULT_REAL:.*]] = load float, ptr %[[RESULT_REAL_PTR]], align 4
+// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT_ADDR]], i32 0, i32 1
+// OGCG: %[[RESULT_IMAG:.*]] = load float, ptr %[[RESULT_IMAG_PTR]], align 4
+// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0
+// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1
+// OGCG: store float %[[RESULT_REAL]], ptr %[[A_REAL_PTR]], align 4
+// OGCG: store float %[[RESULT_IMAG]], ptr %[[A_IMAG_PTR]], align 4
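The DenseMap.h hunks that follow short-circuit `destroyAll()` when neither the key nor the value has a destructor worth running. As a reading aid, here is a minimal standalone sketch of the same technique; `ToyBucketArray` and its members are invented for this illustration and are not part of the patch:

```cpp
#include <type_traits>

// Illustrative analogue of DenseMapBase::destroyAll(): when both element
// types are trivially destructible, the per-bucket destructor walk is a
// no-op, so it can be discarded at compile time instead of relying on the
// optimizer to delete an empty loop.
template <typename KeyT, typename ValueT> struct ToyBucketArray {
  KeyT *Keys = nullptr;
  ValueT *Values = nullptr;
  unsigned NumBuckets = 0;

  void destroyAll() {
    if constexpr (std::is_trivially_destructible_v<KeyT> &&
                  std::is_trivially_destructible_v<ValueT>)
      return; // Nothing to destroy; skip touching every bucket.

    for (unsigned I = 0; I != NumBuckets; ++I) {
      Keys[I].~KeyT();
      Values[I].~ValueT();
    }
  }
};
```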
diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index 25b5262800a10..baa91f3a5f533 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -353,6 +353,12 @@ class DenseMapBase : public DebugEpochBase {
   DenseMapBase() = default;
 
   void destroyAll() {
+    // No need to iterate through the buckets if both KeyT and ValueT are
+    // trivially destructible.
+    if constexpr (std::is_trivially_destructible_v<KeyT> &&
+                  std::is_trivially_destructible_v<ValueT>)
+      return;
+
     if (getNumBuckets() == 0) // Nothing to do.
       return;
 
@@ -767,37 +773,6 @@ class DenseMap : public DenseMapBase<DenseMap<KeyT, ValueT, KeyInfoT, BucketT>,
     return *this;
   }
 
-  void copyFrom(const DenseMap &other) {
-    this->destroyAll();
-    deallocateBuckets();
-    if (allocateBuckets(other.NumBuckets)) {
-      this->BaseT::copyFrom(other);
-    } else {
-      NumEntries = 0;
-      NumTombstones = 0;
-    }
-  }
-
-  void grow(unsigned AtLeast) {
-    unsigned OldNumBuckets = NumBuckets;
-    BucketT *OldBuckets = Buckets;
-
-    allocateBuckets(std::max<unsigned>(
-        64, static_cast<unsigned>(NextPowerOf2(AtLeast - 1))));
-    assert(Buckets);
-    if (!OldBuckets) {
-      this->BaseT::initEmpty();
-      return;
-    }
-
-    this->moveFromOldBuckets(
-        llvm::make_range(OldBuckets, OldBuckets + OldNumBuckets));
-
-    // Free the old table.
-    deallocate_buffer(OldBuckets, sizeof(BucketT) * OldNumBuckets,
-                      alignof(BucketT));
-  }
-
   void shrink_and_clear() {
     unsigned OldNumBuckets = NumBuckets;
     unsigned OldNumEntries = NumEntries;
@@ -855,6 +830,37 @@ class DenseMap : public DenseMapBase<DenseMap<KeyT, ValueT, KeyInfoT, BucketT>,
       NumTombstones = 0;
     }
   }
+
+  void copyFrom(const DenseMap &other) {
+    this->destroyAll();
+    deallocateBuckets();
+    if (allocateBuckets(other.NumBuckets)) {
+      this->BaseT::copyFrom(other);
+    } else {
+      NumEntries = 0;
+      NumTombstones = 0;
+    }
+  }
+
+  void grow(unsigned AtLeast) {
+    unsigned OldNumBuckets = NumBuckets;
+    BucketT *OldBuckets = Buckets;
+
+    allocateBuckets(std::max<unsigned>(
+        64, static_cast<unsigned>(NextPowerOf2(AtLeast - 1))));
+    assert(Buckets);
+    if (!OldBuckets) {
+      this->BaseT::initEmpty();
+      return;
+    }
+
+    this->moveFromOldBuckets(
+        llvm::make_range(OldBuckets, OldBuckets + OldNumBuckets));
+
+    // Free the old table.
+    deallocate_buffer(OldBuckets, sizeof(BucketT) * OldNumBuckets,
+                      alignof(BucketT));
+  }
 };
 
@@ ... @@ class SmallDenseMap
-  void copyFrom(const SmallDenseMap &other) {
-    this->destroyAll();
-    deallocateBuckets();
-    allocateBuckets(other.getNumBuckets());
-    this->BaseT::copyFrom(other);
-  }
-
-  void init(unsigned InitNumEntries) {
-    auto InitBuckets = BaseT::getMinBucketToReserveForEntries(InitNumEntries);
-    allocateBuckets(InitBuckets);
-    this->BaseT::initEmpty();
-  }
-
-  void grow(unsigned AtLeast) {
-    if (AtLeast > InlineBuckets)
-      AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast - 1));
-
-    if (Small) {
-      // First move the inline buckets into a temporary storage.
-      AlignedCharArrayUnion<BucketT[InlineBuckets]> TmpStorage;
-      BucketT *TmpBegin = reinterpret_cast<BucketT *>(&TmpStorage);
-      BucketT *TmpEnd = TmpBegin;
-
-      // Loop over the buckets, moving non-empty, non-tombstones into the
-      // temporary storage. Have the loop move the TmpEnd forward as it goes.
-      const KeyT EmptyKey = this->getEmptyKey();
-      const KeyT TombstoneKey = this->getTombstoneKey();
-      for (BucketT &B : inlineBuckets()) {
-        if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey) &&
-            !KeyInfoT::isEqual(B.getFirst(), TombstoneKey)) {
-          assert(size_t(TmpEnd - TmpBegin) < InlineBuckets &&
-                 "Too many inline buckets!");
-          ::new (&TmpEnd->getFirst()) KeyT(std::move(B.getFirst()));
-          ::new (&TmpEnd->getSecond()) ValueT(std::move(B.getSecond()));
-          ++TmpEnd;
-          B.getSecond().~ValueT();
-        }
-        B.getFirst().~KeyT();
-      }
-
-      // AtLeast == InlineBuckets can happen if there are many tombstones,
-      // and grow() is used to remove them. Usually we always switch to the
-      // large rep here.
-      allocateBuckets(AtLeast);
-      this->moveFromOldBuckets(llvm::make_range(TmpBegin, TmpEnd));
-      return;
-    }
-
-    LargeRep OldRep = std::move(*getLargeRep());
-    getLargeRep()->~LargeRep();
-    allocateBuckets(AtLeast);
-
-    this->moveFromOldBuckets(OldRep.buckets());
-
-    // Free the old table.
-    deallocate_buffer(OldRep.Buckets, sizeof(BucketT) * OldRep.NumBuckets,
-                      alignof(BucketT));
-  }
-
   void shrink_and_clear() {
     unsigned OldSize = this->size();
     this->destroyAll();
@@ -1162,6 +1109,65 @@ class SmallDenseMap
       new (getLargeRep()) LargeRep{NewBuckets, Num};
     }
   }
+
+  void init(unsigned InitNumEntries) {
+    auto InitBuckets = BaseT::getMinBucketToReserveForEntries(InitNumEntries);
+    allocateBuckets(InitBuckets);
+    this->BaseT::initEmpty();
+  }
+
+  void copyFrom(const SmallDenseMap &other) {
+    this->destroyAll();
+    deallocateBuckets();
+    allocateBuckets(other.getNumBuckets());
+    this->BaseT::copyFrom(other);
+  }
+
+  void grow(unsigned AtLeast) {
+    if (AtLeast > InlineBuckets)
+      AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast - 1));
+
+    if (Small) {
+      // First move the inline buckets into a temporary storage.
+      AlignedCharArrayUnion<BucketT[InlineBuckets]> TmpStorage;
+      BucketT *TmpBegin = reinterpret_cast<BucketT *>(&TmpStorage);
+      BucketT *TmpEnd = TmpBegin;
+
+      // Loop over the buckets, moving non-empty, non-tombstones into the
+      // temporary storage. Have the loop move the TmpEnd forward as it goes.
+      const KeyT EmptyKey = this->getEmptyKey();
+      const KeyT TombstoneKey = this->getTombstoneKey();
+      for (BucketT &B : inlineBuckets()) {
+        if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey) &&
+            !KeyInfoT::isEqual(B.getFirst(), TombstoneKey)) {
+          assert(size_t(TmpEnd - TmpBegin) < InlineBuckets &&
+                 "Too many inline buckets!");
+          ::new (&TmpEnd->getFirst()) KeyT(std::move(B.getFirst()));
+          ::new (&TmpEnd->getSecond()) ValueT(std::move(B.getSecond()));
+          ++TmpEnd;
+          B.getSecond().~ValueT();
+        }
+        B.getFirst().~KeyT();
+      }
+
+      // AtLeast == InlineBuckets can happen if there are many tombstones,
+      // and grow() is used to remove them. Usually we always switch to the
+      // large rep here.
+      allocateBuckets(AtLeast);
+      this->moveFromOldBuckets(llvm::make_range(TmpBegin, TmpEnd));
+      return;
+    }
+
+    LargeRep OldRep = std::move(*getLargeRep());
+    getLargeRep()->~LargeRep();
+    allocateBuckets(AtLeast);
+
+    this->moveFromOldBuckets(OldRep.buckets());
+
+    // Free the old table.
+    deallocate_buffer(OldRep.Buckets, sizeof(BucketT) * OldRep.NumBuckets,
+                      alignof(BucketT));
+  }
 };
 
template > FailedSymsForQuery;
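The Recycler.h hunk below folds the old `BumpPtrAllocator`-only overload of `clear()` into the generic template by dispatching with `if constexpr`. A self-contained sketch of that pattern, under invented names (`ToyFreeList`, `BumpAllocator`, and `MallocAllocator` are illustrative, not the real LLVM types):

```cpp
#include <cstddef>
#include <cstdlib>
#include <type_traits>

struct BumpAllocator {
  // Deallocation is intentionally a no-op for bump allocation.
  void deallocate(void *, std::size_t) {}
};

struct MallocAllocator {
  void deallocate(void *Ptr, std::size_t) { std::free(Ptr); }
};

// Node is any type exposing a `next` pointer.
template <typename Node> struct ToyFreeList {
  Node *Head = nullptr;

  // One template serves both allocators: for the bump allocator the loop
  // would only walk the list to call no-ops, so it is pruned at compile
  // time and dropping the list head is enough.
  template <class AllocatorType> void clear(AllocatorType &Allocator) {
    if constexpr (std::is_same_v<AllocatorType, BumpAllocator>) {
      Head = nullptr;
    } else {
      while (Head) {
        Node *N = Head;
        Head = Head->next;
        Allocator.deallocate(N, sizeof(Node));
      }
    }
  }
};
```

Compared with a separate overload, this keeps a single definition while still making the special case explicit next to the generic logic.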
diff --git a/llvm/include/llvm/Support/Recycler.h b/llvm/include/llvm/Support/Recycler.h
index b51c58678e653..6502a70dbf89c 100644
--- a/llvm/include/llvm/Support/Recycler.h
+++ b/llvm/include/llvm/Support/Recycler.h
@@ -19,6 +19,7 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <cassert>
+#include <type_traits>
 
 namespace llvm {
 
@@ -72,19 +73,19 @@ class Recycler {
   /// deleted; calling clear is one way to ensure this.
   template <class AllocatorType> void clear(AllocatorType &Allocator) {
-    while (FreeList) {
-      T *t = reinterpret_cast<T *>(pop_val());
-      Allocator.Deallocate(t, Size, Align);
+    if constexpr (std::is_same_v<AllocatorType,
+                                 BumpPtrAllocator>) {
+      // For BumpPtrAllocator, Deallocate is a no-op, so just drop the free
+      // list.
+      FreeList = nullptr;
+    } else {
+      while (FreeList) {
+        T *t = reinterpret_cast<T *>(pop_val());
+        Allocator.Deallocate(t, Size, Align);
+      }
     }
   }
 
-  /// Special case for BumpPtrAllocator which has an empty Deallocate()
-  /// function.
-  ///
-  /// There is no need to traverse the free list, pulling all the objects into
-  /// cache.
-  void clear(BumpPtrAllocator &) { FreeList = nullptr; }
-
   template <class SubClass, class AllocatorType>
   SubClass *Allocate(AllocatorType &Allocator) {
     static_assert(alignof(SubClass) <= Align,
diff --git a/llvm/lib/CodeGen/RegisterUsageInfo.cpp b/llvm/lib/CodeGen/RegisterUsageInfo.cpp
index 7a4628a6e91d4..2ef380fc7cad4 100644
--- a/llvm/lib/CodeGen/RegisterUsageInfo.cpp
+++ b/llvm/lib/CodeGen/RegisterUsageInfo.cpp
@@ -44,7 +44,7 @@ void PhysicalRegisterUsageInfo::setTargetMachine(const TargetMachine &TM) {
 }
 
 bool PhysicalRegisterUsageInfo::doInitialization(Module &M) {
-  RegMasks.grow(M.size());
+  RegMasks.reserve(M.size());
   return false;
 }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ffa24daeb3166..386bcda145a54 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -2015,9 +2015,9 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
       Register InReg = FuncInfo.InitializeRegForValue(Inst);
 
       std::optional<CallingConv::ID> CallConv;
-      auto *CI = dyn_cast<CallInst>(Inst);
-      if (CI && !CI->isInlineAsm())
-        CallConv = CI->getCallingConv();
+      auto *CB = dyn_cast<CallBase>(Inst);
+      if (CB && !CB->isInlineAsm())
+        CallConv = CB->getCallingConv();
 
       RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg,
                        Inst->getType(), CallConv);
diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp
index 8d413a35f5a93..d029ac587fb9a 100644
--- a/llvm/lib/ExecutionEngine/Orc/Core.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp
@@ -2901,13 +2901,23 @@ ExecutionSession::IL_emit(MaterializationResponsibility &MR,
 
   for (auto &SN : ER.Ready)
     IL_collectQueries(
-        EQ.Updated, SN->defs(),
+        EQ.Completed, SN->defs(),
         [](JITDylib::SymbolTableEntry &E) { E.setState(SymbolState::Ready); },
         [](AsynchronousSymbolQuery &Q, JITDylib &JD,
           NonOwningSymbolStringPtr Name, JITDylib::SymbolTableEntry &E) {
          Q.notifySymbolMetRequiredState(SymbolStringPtr(Name), E.getSymbol());
        });
 
+  // std::erase_if is not available in C++17, and llvm::erase_if does not work
+  // here.
+  for (auto it = EQ.Completed.begin(), end = EQ.Completed.end(); it != end;) {
+    if ((*it)->isComplete()) {
+      ++it;
+    } else {
+      it = EQ.Completed.erase(it);
+    }
+  }
+
 #ifdef EXPENSIVE_CHECKS
   verifySessionState("exiting ExecutionSession::IL_emit");
 #endif
@@ -3043,9 +3053,8 @@ Error ExecutionSession::OL_notifyEmitted(
     }
   }
 
-  for (auto &UQ : EmitQueries->Updated)
-    if (UQ->isComplete())
-      UQ->handleComplete(*this);
+  for (auto &UQ : EmitQueries->Completed)
+    UQ->handleComplete(*this);
 
   // If there are any bad dependencies then return an error.
  if (!BadDeps.empty()) {
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index d1832f4469b77..f680a5e52551d 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -651,7 +651,7 @@ struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo {
   AArch64FunctionInfo(const llvm::AArch64FunctionInfo &MFI);
 
   void mappingImpl(yaml::IO &YamlIO) override;
-  ~AArch64FunctionInfo() = default;
+  ~AArch64FunctionInfo() override = default;
 };
 
 template <> struct MappingTraits<AArch64FunctionInfo> {
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index 72a7676241770..47d76f361e8cc 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -154,7 +154,7 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo {
 
   bool shouldAnalyzePhysregInMachineLoopInfo(MCRegister R) const override;
 
-  virtual bool isIgnoredCVReg(MCRegister LLVMReg) const override;
+  bool isIgnoredCVReg(MCRegister LLVMReg) const override;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 1b4b113fad61c..6bad4dbdf5321 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -131,7 +131,7 @@ class LLVM_EXTERNAL_VISIBILITY MetadataStreamerMsgPackV4
 
 public:
   MetadataStreamerMsgPackV4() = default;
-  ~MetadataStreamerMsgPackV4() = default;
+  ~MetadataStreamerMsgPackV4() override = default;
 
   bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
 
@@ -154,7 +154,7 @@ class MetadataStreamerMsgPackV5 : public MetadataStreamerMsgPackV4 {
 
 public:
   MetadataStreamerMsgPackV5() = default;
-  ~MetadataStreamerMsgPackV5() = default;
+  ~MetadataStreamerMsgPackV5() override = default;
 };
 
 class MetadataStreamerMsgPackV6 final : public MetadataStreamerMsgPackV5 {
@@ -163,7 +163,7 @@ class MetadataStreamerMsgPackV6 final : public MetadataStreamerMsgPackV5 {
 
 public:
   MetadataStreamerMsgPackV6() = default;
-  ~MetadataStreamerMsgPackV6() = default;
+  ~MetadataStreamerMsgPackV6() override = default;
 
   void emitKernelAttrs(const AMDGPUTargetMachine &TM, const MachineFunction &MF,
                        msgpack::MapDocNode Kern) override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
index c5c9473752529..0804133faca43 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h
@@ -26,20 +26,19 @@ struct PerFunctionMIParsingState;
 class AMDGPUMIRFormatter final : public MIRFormatter {
 public:
   AMDGPUMIRFormatter() = default;
-  virtual ~AMDGPUMIRFormatter() = default;
+  ~AMDGPUMIRFormatter() override = default;
 
   /// Implement target specific printing for machine operand immediate value, so
   /// that we can have more meaningful mnemonic than a 64-bit integer. Passing
   /// None to OpIdx means the index is unknown.
-  virtual void printImm(raw_ostream &OS, const MachineInstr &MI,
-                        std::optional<unsigned> OpIdx,
-                        int64_t Imm) const override;
+  void printImm(raw_ostream &OS, const MachineInstr &MI,
+                std::optional<unsigned> OpIdx, int64_t Imm) const override;
 
   /// Implement target specific parsing of immediate mnemonics. The mnemonic is
   /// a string with a leading dot.
-  virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
-                                StringRef Src, int64_t &Imm,
-                                ErrorCallbackType ErrorCallback) const override;
+  bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
+                        StringRef Src, int64_t &Imm,
+                        ErrorCallbackType ErrorCallback) const override;
 
   /// Implement target specific parsing of target custom pseudo source value.
   bool
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
index cbc7427ce6cdf..4d0c163c5ea74 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
@@ -32,7 +32,7 @@ class AMDGPUInstrPostProcess : public InstrPostProcess {
   AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
       : InstrPostProcess(STI, MCII) {}
 
-  ~AMDGPUInstrPostProcess() = default;
+  ~AMDGPUInstrPostProcess() override = default;
 
   void postProcessInstruction(Instruction &Inst, const MCInst &MCI) override;
 };
@@ -88,7 +88,7 @@ class AMDGPUCustomBehaviour : public CustomBehaviour {
   AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                         const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII);
 
-  ~AMDGPUCustomBehaviour() = default;
+  ~AMDGPUCustomBehaviour() override = default;
   /// This method is used to determine if an instruction
   /// should be allowed to be dispatched. The return value is
   /// how many cycles until the instruction can be dispatched.
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index 54fcd2af49ecd..246a3f88ebce4 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -64,7 +64,7 @@ class AMDGPUMCExpr : public MCTargetExpr {
   ArrayRef<const MCExpr *> Args;
 
   AMDGPUMCExpr(VariantKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx);
-  ~AMDGPUMCExpr();
+  ~AMDGPUMCExpr() override;
 
   bool evaluateExtraSGPRs(MCValue &Res, const MCAssembler *Asm) const;
   bool evaluateTotalNumVGPR(MCValue &Res, const MCAssembler *Asm) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 145ce9eca7f45..211642d7c4460 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1657,7 +1657,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
                            unsigned *PredCost = nullptr) const override;
 
   InstructionUniformity
-  getInstructionUniformity(const MachineInstr &MI) const override final;
+  getInstructionUniformity(const MachineInstr &MI) const final;
 
   InstructionUniformity
   getGenericInstructionUniformity(const MachineInstr &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 4be92bf502f4b..419bf533510dd 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -311,7 +311,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
                         const llvm::MachineFunction &MF);
 
   void mappingImpl(yaml::IO &YamlIO) override;
-  ~SIMachineFunctionInfo() = default;
+  ~SIMachineFunctionInfo() override = default;
 };
 
 template <> struct MappingTraits<SIMachineFunctionInfo> {
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 07264d973648f..a177a424bdfce 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -640,7 +640,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
 
   bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
 
-  virtual bool handleCooperativeAtomic(MachineInstr &MI) const override;
+  bool handleCooperativeAtomic(MachineInstr &MI) const override;
 
   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
index a9c4b53f9ae64..72eb3d0f8b7f4 100644
--- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -312,7 +312,7 @@ struct ARMFunctionInfo final : public yaml::MachineFunctionInfo {
   ARMFunctionInfo(const llvm::ARMFunctionInfo &MFI);
 
   void mappingImpl(yaml::IO &YamlIO) override;
-  ~ARMFunctionInfo() = default;
+  ~ARMFunctionInfo() override = default;
 };
 
 template <> struct MappingTraits<ARMFunctionInfo> {
diff --git a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
index 3a840a3714975..5548ad1ebff5e 100644
--- a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
+++ b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
@@ -40,7 +40,7 @@ class AVRDisassembler : public MCDisassembler {
 public:
   AVRDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
       : MCDisassembler(STI, Ctx) {}
-  virtual ~AVRDisassembler() = default;
+  ~AVRDisassembler() override = default;
 
   DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
                               ArrayRef<uint8_t> Bytes, uint64_t Address,
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
index 619efb376c613..03c60e8b76fa5 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
@@ -24,7 +24,7 @@ class AVRELFObjectWriter : public MCELFObjectTargetWriter {
 public:
   AVRELFObjectWriter(uint8_t OSABI);
 
-  virtual ~AVRELFObjectWriter() = default;
+  ~AVRELFObjectWriter() override = default;
 
   unsigned getRelocType(const MCFixup &, const MCValue &,
                         bool IsPCRel) const override;
diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.h b/llvm/lib/Target/BPF/BPFAsmPrinter.h
index 0cfb2839c8ff9..90ef2073609a6 100644
--- a/llvm/lib/Target/BPF/BPFAsmPrinter.h
+++ b/llvm/lib/Target/BPF/BPFAsmPrinter.h
@@ -32,7 +32,7 @@ class BPFAsmPrinter : public AsmPrinter {
   void emitInstruction(const MachineInstr *MI) override;
 
   MCSymbol *getJTPublicSymbol(unsigned JTI);
-  virtual void emitJumpTableInfo() override;
+  void emitJumpTableInfo() override;
 
   static char ID;
 
diff --git a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
index e3c39a1b8dda5..b12985d81fa75 100644
--- a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
+++ b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
@@ -46,7 +46,7 @@ class BPFCheckAndAdjustIR final : public ModulePass {
 public:
   static char ID;
   BPFCheckAndAdjustIR() : ModulePass(ID) {}
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
 
 private:
   void checkIR(Module &M);
diff --git a/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h b/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h
index f3064c0c8cb8a..af3542ed77389 100644
--- a/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h
+++ b/llvm/lib/Target/BPF/BPFTargetLoweringObjectFile.h
@@ -16,7 +16,7 @@ namespace llvm {
 class BPFTargetLoweringObjectFileELF : public TargetLoweringObjectFileELF {
 public:
-  virtual MCSection *
+  MCSection *
   getSectionForJumpTable(const Function &F, const TargetMachine &TM,
                          const MachineJumpTableEntry *JTE) const override;
 };
 
diff --git a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
index 4aecaf18db480..8e791e6986272 100644
--- a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
@@ -635,7 +635,7 @@ class InstManager : public GISelChangeObserver {
     B.setChangeObserver(*this);
   }
 
-  ~InstManager() { B.stopObservingChanges(); }
+  ~InstManager() override { B.stopObservingChanges(); }
 
   void createdInstr(MachineInstr &MI) override { InstList.insert(&MI); }
   void erasingInstr(MachineInstr &MI) override {}
diff --git a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h
index 34efa0b2ebad5..4ccd3cfc185a6 100644
--- a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h
+++ b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.h
@@ -33,7 +33,7 @@ class RISCVLMULInstrument : public Instrument {
 
   explicit RISCVLMULInstrument(StringRef Data) : Instrument(DESC_NAME, Data) {}
 
-  ~RISCVLMULInstrument() = default;
+  ~RISCVLMULInstrument() override = default;
 
   uint8_t getLMUL() const;
 };
@@ -45,7 +45,7 @@ class RISCVSEWInstrument : public Instrument {
 
   explicit RISCVSEWInstrument(StringRef Data) : Instrument(DESC_NAME, Data) {}
 
-  ~RISCVSEWInstrument() = default;
+  ~RISCVSEWInstrument() override = default;
 
   uint8_t getSEW() const;
 };
diff --git a/llvm/lib/Target/RISCV/RISCVConstantPoolValue.h b/llvm/lib/Target/RISCV/RISCVConstantPoolValue.h
index b69904d661fbe..b2ce57a35002f 100644
--- a/llvm/lib/Target/RISCV/RISCVConstantPoolValue.h
+++ b/llvm/lib/Target/RISCV/RISCVConstantPoolValue.h
@@ -37,7 +37,7 @@ class RISCVConstantPoolValue : public MachineConstantPoolValue {
   RISCVCPKind Kind;
 
 public:
-  ~RISCVConstantPoolValue() = default;
+  ~RISCVConstantPoolValue() override = default;
 
   static RISCVConstantPoolValue *Create(const GlobalValue *GV);
   static RISCVConstantPoolValue *Create(LLVMContext &C, StringRef S);
diff --git a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
index 4fa93f157f52b..f9be80feae211 100644
--- a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
@@ -31,7 +31,7 @@ struct RISCVMachineFunctionInfo final : public yaml::MachineFunctionInfo {
   RISCVMachineFunctionInfo(const llvm::RISCVMachineFunctionInfo &MFI);
 
   void mappingImpl(yaml::IO &YamlIO) override;
-  ~RISCVMachineFunctionInfo() = default;
+  ~RISCVMachineFunctionInfo() override = default;
 };
 
 template <> struct MappingTraits<RISCVMachineFunctionInfo> {
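Most of the target-file hunks in this region repeat one mechanical cleanup: a member that overrides a base-class virtual should say `override` (or `final`) and nothing else, since `override` already implies `virtual`. A compact sketch of the rule these changes converge on (`Base` and `Derived` are illustrative only):

```cpp
struct Base {
  virtual ~Base() = default;
  virtual void run(int N);
};

struct Derived final : Base {
  // Preferred style, as in this patch: no redundant `virtual`, and even a
  // defaulted destructor is explicitly marked as an override.
  ~Derived() override = default;

  // `override` makes the compiler reject this declaration if the base
  // signature ever drifts, which a bare `virtual` would not catch.
  void run(int N) override;
};
```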
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
index 28a1690ef0be1..6e444c98de8da 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
@@ -347,7 +347,7 @@ class SPIRVLegalizePointerCast : public FunctionPass {
 public:
   SPIRVLegalizePointerCast(SPIRVTargetMachine *TM) : FunctionPass(ID), TM(TM) {}
 
-  virtual bool runOnFunction(Function &F) override {
+  bool runOnFunction(Function &F) override {
     const SPIRVSubtarget &ST = TM->getSubtarget<SPIRVSubtarget>(F);
     GR = ST.getSPIRVGlobalRegistry();
     DeadInstructions.clear();
diff --git a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
index 60d39c9005b5f..aba9cf7962e68 100644
--- a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
@@ -234,7 +234,7 @@ class SPIRVMergeRegionExitTargets : public FunctionPass {
   }
 #endif
 
-  virtual bool runOnFunction(Function &F) override {
+  bool runOnFunction(Function &F) override {
     LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
     auto *TopLevelRegion =
         getAnalysis<SPIRVConvergenceRegionAnalysisWrapperPass>()
diff --git a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp
index e621bcd442e9b..b1a8d1ab8a297 100644
--- a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp
@@ -28,7 +28,7 @@ class SPIRVStripConvergentIntrinsics : public FunctionPass {
 
   SPIRVStripConvergentIntrinsics() : FunctionPass(ID) {}
 
-  virtual bool runOnFunction(Function &F) override {
+  bool runOnFunction(Function &F) override {
     DenseSet<Instruction *> ToRemove;
 
     // Is the instruction is a convergent intrinsic, add it to kill-list and
diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
index 5b149f8897eec..ea634fb616f10 100644
--- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
@@ -1113,7 +1113,7 @@ class SPIRVStructurizer : public FunctionPass {
 
   SPIRVStructurizer() : FunctionPass(ID) {}
 
-  virtual bool runOnFunction(Function &F) override {
+  bool runOnFunction(Function &F) override {
     bool Modified = false;
 
     // In LLVM, Switches are allowed to have several cases branching to the same
diff --git a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
index e585e5af42d32..b4dadaad8914c 100644
--- a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
+++ b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
@@ -36,7 +36,7 @@ class SparcDisassembler : public MCDisassembler {
 public:
   SparcDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
       : MCDisassembler(STI, Ctx) {}
-  virtual ~SparcDisassembler() = default;
+  ~SparcDisassembler() override = default;
 
   DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
                               ArrayRef<uint8_t> Bytes, uint64_t Address,
diff --git a/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h b/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
index 1ff6cc81503ce..ba325b5d22951 100644
--- a/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
+++ b/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
@@ -111,7 +111,7 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy {
 
 public:
   SystemZPostRASchedStrategy(const MachineSchedContext *C);
-  virtual ~SystemZPostRASchedStrategy();
+  ~SystemZPostRASchedStrategy() override;
 
   /// Called for a region before scheduling.
   void initPolicy(MachineBasicBlock::iterator Begin,
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
index b1de145db3d31..bea8b9f8ae74a 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -101,7 +101,7 @@ class SystemZXPLINK64Registers : public SystemZCallingConventionRegisters {
   int getStackPointerBias() final { return 2048; }
 
   /// Destroys the object. Bogus destructor overriding base class destructor
-  ~SystemZXPLINK64Registers() = default;
+  ~SystemZXPLINK64Registers() override = default;
 };
 
 /// ELF calling convention specific use registers
@@ -124,7 +124,7 @@ class SystemZELFRegisters : public SystemZCallingConventionRegisters {
   int getStackPointerBias() final { return 0; }
 
   /// Destroys the object.
Bogus destructor overriding base class destructor - ~SystemZELFRegisters() = default; + ~SystemZELFRegisters() override = default; }; struct SystemZRegisterInfo : public SystemZGenRegisterInfo { diff --git a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp index aad826b5f285f..465e074c2b50e 100644 --- a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp +++ b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp @@ -36,7 +36,7 @@ class VEDisassembler : public MCDisassembler { public: VEDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : MCDisassembler(STI, Ctx) {} - virtual ~VEDisassembler() = default; + ~VEDisassembler() override = default; DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef Bytes, uint64_t Address, diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 45bbf128ce0b7..9175b2731dac0 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -102,7 +102,7 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { WebAssemblyOperand(SMLoc Start, SMLoc End, CaLOp C) : Kind(CatchList), StartLoc(Start), EndLoc(End), CaL(C) {} - ~WebAssemblyOperand() { + ~WebAssemblyOperand() override { if (isBrList()) BrL.~BrLOp(); if (isCatchList()) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h index 40ae4aef1d7f2..ff4d64693284a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h @@ -192,7 +192,7 @@ struct WebAssemblyFunctionInfo final : public yaml::MachineFunctionInfo { const llvm::WebAssemblyFunctionInfo &MFI); void mappingImpl(yaml::IO &YamlIO) override; - ~WebAssemblyFunctionInfo() = default; + ~WebAssemblyFunctionInfo() override = default; }; template <> struct MappingTraits { diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h index d6197f3344bbb..05a1c22736a17 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h @@ -37,7 +37,7 @@ class X86InstrPostProcess : public InstrPostProcess { X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} - ~X86InstrPostProcess() = default; + ~X86InstrPostProcess() override = default; void postProcessInstruction(Instruction &Inst, const MCInst &MCI) override; }; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 1c06dc4923c2d..af5a69899844c 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -491,7 +491,7 @@ namespace X86_MC { class X86MCInstrAnalysis : public MCInstrAnalysis { X86MCInstrAnalysis(const X86MCInstrAnalysis &) = delete; X86MCInstrAnalysis &operator=(const X86MCInstrAnalysis &) = delete; - virtual ~X86MCInstrAnalysis() = default; + ~X86MCInstrAnalysis() override = default; public: X86MCInstrAnalysis(const MCInstrInfo *MCII) : MCInstrAnalysis(MCII) {} diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h index 5f974e5de9a19..1bda505ed39f1 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -43,7 
+43,7 @@ struct X86MachineFunctionInfo final : public yaml::MachineFunctionInfo { X86MachineFunctionInfo(const llvm::X86MachineFunctionInfo &MFI); void mappingImpl(yaml::IO &YamlIO) override; - ~X86MachineFunctionInfo() = default; + ~X86MachineFunctionInfo() override = default; }; template <> struct MappingTraits { diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 077d29f7499a4..3b59ebbbb9322 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -272,6 +272,9 @@ AA::getInitialValueForObj(Attributor &A, const AbstractAttribute &QueryingAA, } if (RangePtr && !RangePtr->offsetOrSizeAreUnknown()) { + int64_t StorageSize = DL.getTypeStoreSize(&Ty); + if (StorageSize != RangePtr->Size) + return nullptr; APInt Offset = APInt(64, RangePtr->Offset); return ConstantFoldLoadFromConst(Initializer, &Ty, Offset, DL); } diff --git a/llvm/test/Transforms/Attributor/range-and-constant-fold.ll b/llvm/test/Transforms/Attributor/range-and-constant-fold.ll new file mode 100644 index 0000000000000..a8f33092d0d0b --- /dev/null +++ b/llvm/test/Transforms/Attributor/range-and-constant-fold.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=attributor %s -o - | FileCheck %s + +@g = internal unnamed_addr addrspace(4) constant [3 x i8] c"12\00", align 16 + +define void @foo(i32 %a, i32 %b, ptr %p.0, ptr %p.1) { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], ptr nofree nonnull writeonly captures(none) dereferenceable(1) [[P_0:%.*]], ptr nofree nonnull writeonly align 4 captures(none) dereferenceable(8) [[P_1:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[CMP]], label %[[L1:.*]], label %[[L2:.*]] +; CHECK: [[L1]]: +; CHECK-NEXT: br label %[[L3:.*]] +; CHECK: [[L2]]: +; CHECK-NEXT: br label %[[L3]] +; CHECK: [[L3]]: +; CHECK-NEXT: [[PHI:%.*]] = phi ptr addrspace(4) [ @g, %[[L1]] ], [ getelementptr inbounds nuw (i8, ptr addrspace(4) @g, i64 1), %[[L2]] ] +; CHECK-NEXT: [[LOAD_SMALL:%.*]] = load i8, ptr addrspace(4) [[PHI]], align 4 +; CHECK-NEXT: store i8 [[LOAD_SMALL]], ptr [[P_0]], align 1 +; CHECK-NEXT: [[LOAD_LARGE:%.*]] = load i64, ptr addrspace(4) [[PHI]], align 4 +; CHECK-NEXT: store i64 [[LOAD_LARGE]], ptr [[P_1]], align 4 +; CHECK-NEXT: ret void +; +entry: + %cmp = icmp ne i32 %a, %b + br i1 %cmp, label %l1, label %l2 +l1: + br label %l3 +l2: + br label %l3 +l3: + %phi = phi ptr addrspace(4) [ @g, %l1 ], [ getelementptr inbounds nuw (i8, ptr addrspace(4) @g, i64 1), %l2 ] + %load.small = load i8, ptr addrspace(4) %phi + store i8 %load.small, ptr %p.0 + %load.large = load i64, ptr addrspace(4) %phi + store i64 %load.large, ptr %p.1 + ret void +} diff --git a/llvm/unittests/ADT/DenseMapTest.cpp b/llvm/unittests/ADT/DenseMapTest.cpp index 50e9c6e138ef1..aceb4f30d878d 100644 --- a/llvm/unittests/ADT/DenseMapTest.cpp +++ b/llvm/unittests/ADT/DenseMapTest.cpp @@ -70,6 +70,16 @@ class CtorTester { int getValue() const { return Value; } bool operator==(const CtorTester &RHS) const { return Value == RHS.Value; } + + // Return the number of live CtorTester objects, excluding the empty and + // tombstone keys. 
+  static size_t getNumConstructed() {
+    return std::count_if(Constructed.begin(), Constructed.end(),
+                         [](const CtorTester *Obj) {
+                           int V = Obj->getValue();
+                           return V != -1 && V != -2;
+                         });
+  }
 };
 
 std::set<CtorTester *> CtorTester::Constructed;
 
@@ -1031,4 +1041,64 @@ TEST(SmallDenseMapCustomTest, InitSize) {
   }
 }
 
+TEST(DenseMapCustomTest, KeyDtor) {
+  // This test relies on CtorTester being non-trivially destructible.
+  static_assert(!std::is_trivially_destructible_v<CtorTester>,
+                "CtorTester must not be trivially destructible");
+
+  // Test that keys are destructed on scope exit.
+  EXPECT_EQ(0u, CtorTester::getNumConstructed());
+  {
+    DenseMap<CtorTester, int, CtorTesterMapInfo> Map;
+    Map.try_emplace(CtorTester(0), 1);
+    Map.try_emplace(CtorTester(1), 2);
+    EXPECT_EQ(2u, CtorTester::getNumConstructed());
+  }
+  EXPECT_EQ(0u, CtorTester::getNumConstructed());
+
+  // Test that keys are destructed on erase and shrink_and_clear.
+  EXPECT_EQ(0u, CtorTester::getNumConstructed());
+  {
+    DenseMap<CtorTester, int, CtorTesterMapInfo> Map;
+    Map.try_emplace(CtorTester(0), 1);
+    Map.try_emplace(CtorTester(1), 2);
+    EXPECT_EQ(2u, CtorTester::getNumConstructed());
+    Map.erase(CtorTester(1));
+    EXPECT_EQ(1u, CtorTester::getNumConstructed());
+    Map.shrink_and_clear();
+    EXPECT_EQ(0u, CtorTester::getNumConstructed());
+  }
+  EXPECT_EQ(0u, CtorTester::getNumConstructed());
+}
+
+TEST(DenseMapCustomTest, ValueDtor) {
+  // This test relies on CtorTester being non-trivially destructible.
+  static_assert(!std::is_trivially_destructible_v<CtorTester>,
+                "CtorTester must not be trivially destructible");
+
+  // Test that values are destructed on scope exit.
+  EXPECT_EQ(0u, CtorTester::getNumConstructed());
+  {
+    DenseMap<int, CtorTester> Map;
+    Map.try_emplace(0, CtorTester(1));
+    Map.try_emplace(1, CtorTester(2));
+    EXPECT_EQ(2u, CtorTester::getNumConstructed());
+  }
+  EXPECT_EQ(0u, CtorTester::getNumConstructed());
+
+  // Test that values are destructed on erase and shrink_and_clear.
+  EXPECT_EQ(0u, CtorTester::getNumConstructed());
+  {
+    DenseMap<int, CtorTester> Map;
+    Map.try_emplace(0, CtorTester(1));
+    Map.try_emplace(1, CtorTester(2));
+    EXPECT_EQ(2u, CtorTester::getNumConstructed());
+    Map.erase(1);
+    EXPECT_EQ(1u, CtorTester::getNumConstructed());
+    Map.shrink_and_clear();
+    EXPECT_EQ(0u, CtorTester::getNumConstructed());
+  }
+  EXPECT_EQ(0u, CtorTester::getNumConstructed());
+}
+
 } // namespace
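The KeyDtor/ValueDtor tests above rely on CtorTester keeping a global registry of live instances, so the test can observe exactly which destructors a container ran. A simplified, self-contained sketch of that testing pattern; `InstanceCounted` is invented here, while the real test uses CtorTester with gtest's `EXPECT_EQ`:

```cpp
#include <cassert>
#include <set>

// Every live object registers its own address; the destructor unregisters
// it. A test can then assert how many instances a container has destroyed.
struct InstanceCounted {
  static std::set<InstanceCounted *> Live;

  InstanceCounted() { Live.insert(this); }
  InstanceCounted(const InstanceCounted &) { Live.insert(this); }
  ~InstanceCounted() { Live.erase(this); }
};
std::set<InstanceCounted *> InstanceCounted::Live;

int main() {
  {
    InstanceCounted A, B;
    assert(InstanceCounted::Live.size() == 2);
  } // Scope exit must run both destructors.
  assert(InstanceCounted::Live.empty());
  return 0;
}
```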
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index b570f8de06880..151b06573d961 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -71,6 +71,7 @@ CodeGen/Hexagon/autohvx/interleave.ll
 CodeGen/Hexagon/loop-idiom/hexagon-memmove1.ll
 CodeGen/Hexagon/loop-idiom/hexagon-memmove2.ll
 CodeGen/Hexagon/loop-idiom/memmove-rt-check.ll
+CodeGen/Hexagon/masked_gather.ll
 CodeGen/NVPTX/lower-ctor-dtor.ll
 CodeGen/RISCV/zmmul.ll
 CodeGen/WebAssembly/memory-interleave.ll
@@ -331,6 +332,10 @@ Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll
 Instrumentation/MemorySanitizer/AArch64/module-flags-aarch64.ll
 Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll
 Instrumentation/MemorySanitizer/AArch64/qshrn.ll
+Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll
+Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll
+Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add-mini.ll
+Instrumentation/MemorySanitizer/AArch64/sme2-intrinsics-add.ll
 Instrumentation/MemorySanitizer/AArch64/vararg.ll
 Instrumentation/MemorySanitizer/AArch64/vararg_shadow.ll
 Instrumentation/MemorySanitizer/abs-vector.ll
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 1437c39c69da2..37db096f1ba75 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1111,9 +1111,9 @@ def AMDGPU_ScaledMFMAOp :
     AMDGPU_Op<"scaled_mfma", [AllTypesMatch<["destC", "destD"]>, Pure]>,
     Arguments<(ins
-      I32Attr:$m,
-      I32Attr:$n,
-      I32Attr:$k,
+      ConfinedAttr<I32Attr, [...]>:$m,
+      ConfinedAttr<I32Attr, [...]>:$n,
+      ConfinedAttr<I32Attr, [...]>:$k,
       ScaledMFMAInTypes:$sourceA,
       ScaledMFMAInTypes:$sourceB,
       ScaledMFMAOutTypes:$destC,
@@ -1126,8 +1126,8 @@
   let summary = "MLIR wrapper for CDNA scaled mfma instructions";
   let description = [{
     The `amdgpu.scaled_mfma` op is an MLIR wrapper around intrinsics
-    for various scaled versions of `mfma` instructions in the CDNA architecture, which perform
-    multiple outer products in order to allow fast matrix multiplication.
+    for various scaled versions of `mfma` instructions in the CDNA architecture, which
+    perform multiple outer products in order to allow fast matrix multiplication.
 
     The wrapper will select an appropriate `mfma` instruction, if one is
     available, based on the provided `m`, `k`, `n`, and `nBlks` attributes, along with the
@@ -1142,15 +1142,23 @@
     This wrapper takes inspiration from `amdgpu.mfma`, but has some key differences:
     - `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and
-      fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile
-      size.
+      fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as
+      their tile size.
     - `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp`
-      are omitted from this wrapper.
-    - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for
-      double-precision operations on gfx94x and so are not included here.
+      are omitted from this wrapper.
+    - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported
+      for double-precision operations on gfx94x and so are not included here.
+
+    Example:
+    ```mlir
+    %0 = amdgpu.scaled_mfma 32x32x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2
+      : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32>
+    ```
   }];
   let assemblyFormat = [{
-    `(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*` `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC
+    custom<MNKDimensionList>($m, $n, $k) ` `
+    `(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*`
+    `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC
     attr-dict
    `:` type($scalesA) `,` type($sourceA) `,` type($scalesB) `,` type($sourceB) `,` type($destC)
   }];
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir b/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir
index 6de55d534affb..c746d7690b00d 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir
@@ -55,10 +55,10 @@ func.func @mfma_to_rocdl(%arg0 : vector<8xf16>, %arg1 : vector<16xf32>,
 
 // CHECK-LABEL: func @scaled_mfma_to_rocdl(
 // CHECK-SAME: %[[ARG0:.*]]: vector<16xf32>, %[[ARG1:.*]]: vector<4xf32>, %[[ARG2:.*]]: vector<32xf8E4M3FN>, %[[ARG3:.*]]: vector<32xf8E5M2>, %[[ARG4:.*]]: vector<32xf6E2M3FN>, %[[ARG5:.*]]: vector<32xf6E3M2FN>, %[[ARG6:.*]]: vector<32xf4E2M1FN>, %[[ARG7:.*]]: vector<4xf8E8M0FNU>, %[[ARG8:.*]]: f8E8M0FNU
 func.func @scaled_mfma_to_rocdl(%arg0 : vector<16xf32>,
-                    %arg1 : vector<4xf32>, %arg2 : vector<32xf8E4M3FN>,
-                    %arg3 : vector<32xf8E5M2>, %arg4 : vector<32xf6E2M3FN>,
-                    %arg5 : vector<32xf6E3M2FN>, %arg6 : vector<32xf4E2M1FN>,
-                    %arg7 : vector<4xf8E8M0FNU>, %arg8 : f8E8M0FNU) {
+                                %arg1 : vector<4xf32>, %arg2 : vector<32xf8E4M3FN>,
+                                %arg3 : vector<32xf8E5M2>, %arg4 : vector<32xf6E2M3FN>,
+                                %arg5 : vector<32xf6E3M2FN>, %arg6 : vector<32xf4E2M1FN>,
+                                %arg7 : vector<4xf8E8M0FNU>, %arg8 : f8E8M0FNU) {
 
 // CHECK: %[[c0:.+]] = llvm.mlir.constant(0 : i32) : i32
 // CHECK: %[[c1:.+]] = llvm.mlir.constant(1 : i32) : i32
@@ -66,39 +66,39 @@ func.func @scaled_mfma_to_rocdl(%arg0 : vector<16xf32>,
 // CHECK: %[[z0:.+]] = llvm.zext {{.*}} : i8 to i32
 // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
-  amdgpu.scaled_mfma(%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, f8E8M0FNU, vector<32xf8E4M3FN>, vector<16xf32>
+  amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, f8E8M0FNU, vector<32xf8E4M3FN>, vector<16xf32>
 
 // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
-  amdgpu.scaled_mfma(%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, f8E8M0FNU, vector<32xf8E4M3FN>, vector<4xf32>
+  amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg2) * (%arg8[1] * %arg2) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf8E4M3FN>, f8E8M0FNU, vector<32xf8E4M3FN>, vector<4xf32>
 
 // CHECK: llvm.bitcast
 // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32,
i32, i32, i32) -> vector<16xf32>
-  amdgpu.scaled_mfma(%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xf8E8M0FNU>, vector<32xf8E5M2>, f8E8M0FNU, vector<32xf8E5M2>, vector<16xf32>
+  amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf8E5M2>, f8E8M0FNU, vector<32xf8E5M2>, vector<16xf32>
   // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
-  amdgpu.scaled_mfma(%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf8E5M2>, f8E8M0FNU, vector<32xf8E5M2>, vector<4xf32>
+  amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg3) * (%arg8[1] * %arg3) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf8E5M2>, f8E8M0FNU, vector<32xf8E5M2>, vector<4xf32>
   // CHECK: llvm.bitcast
   // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
-  amdgpu.scaled_mfma(%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32>
+  amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32>
   // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
-  amdgpu.scaled_mfma(%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<4xf32>
+  amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg4) * (%arg8[1] * %arg4) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<4xf32>
   // CHECK: llvm.bitcast
   // CHECK: llvm.mlir.constant(3 : i32) : i32
   // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
-  amdgpu.scaled_mfma(%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xf8E8M0FNU>, vector<32xf6E3M2FN>, f8E8M0FNU, vector<32xf6E3M2FN>, vector<16xf32>
+  amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf6E3M2FN>, f8E8M0FNU, vector<32xf6E3M2FN>, vector<16xf32>
   // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
-  amdgpu.scaled_mfma(%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf6E3M2FN>, f8E8M0FNU, vector<32xf6E3M2FN>, vector<4xf32>
+  amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg5) * (%arg8[1] * %arg5) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf6E3M2FN>, f8E8M0FNU, vector<32xf6E3M2FN>, vector<4xf32>
   // CHECK: llvm.bitcast
   // CHECK: llvm.mlir.constant(4 : i32) : i32
   // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<4xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32>
-  amdgpu.scaled_mfma(%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg0 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, f8E8M0FNU, vector<32xf4E2M1FN>, vector<16xf32>
+  amdgpu.scaled_mfma 32x32x64 (%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, f8E8M0FNU, vector<32xf4E2M1FN>, vector<16xf32>
   // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[b0]], %[[c1]], %[[z0]] : (vector<4xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32>
-  amdgpu.scaled_mfma(%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg1 { k = 128 : i32, m = 16 : i32, n = 16 : i32 } : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, f8E8M0FNU, vector<32xf4E2M1FN>, vector<4xf32>
+  amdgpu.scaled_mfma 16x16x128 (%arg7[0] * %arg6) * (%arg8[1] * %arg6) + %arg1 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, f8E8M0FNU, vector<32xf4E2M1FN>, vector<4xf32>
   func.return
 }
diff --git a/mlir/test/Dialect/AMDGPU/canonicalize.mlir b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
index 52d3275dab43b..fee0c00606ab4 100644
--- a/mlir/test/Dialect/AMDGPU/canonicalize.mlir
+++ b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
@@ -165,10 +165,10 @@ func.func @fold_gather_to_lds_of_cast_dest(%global: memref<128x72xf32, 1>, %lds:
 // CHECK-LABEL: func @scaled_mfma
 // CHECK: %[[SCALE_1:.*]] = vector.extract_strided_slice %0 {offsets = [0], sizes = [4], strides = [1]} : vector<16xf8E8M0FNU> to vector<4xf8E8M0FNU>
 // CHECK: %[[SCALE_2:.*]] = vector.extract_strided_slice %2 {offsets = [4], sizes = [4], strides = [1]} : vector<16xf8E8M0FNU> to vector<4xf8E8M0FNU>
-// CHECK: amdgpu.scaled_mfma(%[[SCALE_1]][3] * %{{.*}}) * (%[[SCALE_2]][2] * %{{.*}}) {{.*}}
+// CHECK: amdgpu.scaled_mfma 16x16x128 (%[[SCALE_1]][3] * %{{.*}}) * (%[[SCALE_2]][2] * %{{.*}}) {{.*}}
 // CHECK: %[[SCALE_3:.*]] = vector.extract_strided_slice %5 {offsets = [8], sizes = [4], strides = [1]} : vector<16xf8E8M0FNU> to vector<4xf8E8M0FNU>
 // CHECK: %[[SCALE_4:.*]] = vector.extract_strided_slice %7 {offsets = [12], sizes = [4], strides = [1]} : vector<16xf8E8M0FNU> to vector<4xf8E8M0FNU>
-// CHECK: amdgpu.scaled_mfma(%[[SCALE_3]][1] * %{{.*}}) * (%[[SCALE_4]][0] * %{{.*}}) {{.*}}
+// CHECK: amdgpu.scaled_mfma 16x16x128 (%[[SCALE_3]][1] * %{{.*}}) * (%[[SCALE_4]][0] * %{{.*}}) {{.*}}
 func.func @scaled_mfma(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %scalesA: vector<2x1x8x1xf8E8M0FNU>, %scalesB: vector<2x1x8x1xf8E8M0FNU>) -> (vector<4xf32>, vector<4xf32>) {
   %cst_0 = arith.constant dense<0.000000e+00> : vector<4xf32>
   %cst_1 = arith.constant dense<5.877470e-39> : vector<4xf8E8M0FNU>
@@ -176,12 +176,12 @@ func.func @scaled_mfma(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %sc
   %sA = vector.insert %scaleA, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU>
   %scaleB = vector.extract %scalesB[0, 0, 6, 0] : f8E8M0FNU from vector<2x1x8x1xf8E8M0FNU>
   %sB = vector.insert %scaleB, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU>
-  %res_0 = amdgpu.scaled_mfma(%sA[0] * %opA) * (%sB[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+  %res_0 = amdgpu.scaled_mfma 16x16x128 (%sA[0] * %opA) * (%sB[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
   %scaleC = vector.extract %scalesA[1, 0, 1, 0] : f8E8M0FNU from vector<2x1x8x1xf8E8M0FNU>
   %sC = vector.insert %scaleC, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU>
   %scaleD = vector.extract %scalesB[1, 0, 4, 0] : f8E8M0FNU from vector<2x1x8x1xf8E8M0FNU>
   %sD = vector.insert %scaleD, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU>
-  %res_1 = amdgpu.scaled_mfma(%sC[0] * %opA) * (%sD[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+  %res_1 = amdgpu.scaled_mfma 16x16x128 (%sC[0] * %opA) * (%sD[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
   return %res_0, %res_1 : vector<4xf32>, vector<4xf32>
 }
 
@@ -192,7 +192,7 @@ func.func @scaled_mfma(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %sc
 // CHECK: vector.insert {{.*}} : f8E8M0FNU into vector<4xf8E8M0FNU>
 // CHECK: vector.extract {{.*}} : f8E8M0FNU from vector<2xf8E8M0FNU>
 // CHECK: vector.insert {{.*}} : f8E8M0FNU into vector<4xf8E8M0FNU>
-// CHECK: amdgpu.scaled_mfma({{.*}}[0] * {{.*}}) * ({{.*}}[0] * {{.*}}
+// CHECK: amdgpu.scaled_mfma 16x16x128 ({{.*}}[0] * {{.*}}) * ({{.*}}[0] * {{.*}}
 func.func @scaled_mfma_less_than_4(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %scalesA: vector<2xf8E8M0FNU>, %scalesB: vector<2xf8E8M0FNU>) -> vector<4xf32> {
   %cst_0 = arith.constant dense<0.000000e+00> : vector<4xf32>
   %cst_1 = arith.constant dense<5.877470e-39> : vector<4xf8E8M0FNU>
@@ -200,17 +200,17 @@ func.func @scaled_mfma_less_than_4(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4
   %sA = vector.insert %scaleA, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU>
   %scaleB = vector.extract %scalesB[1] : f8E8M0FNU from vector<2xf8E8M0FNU>
   %sB = vector.insert %scaleB, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU>
-  %res_0 = amdgpu.scaled_mfma(%sA[0] * %opA) * (%sB[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+  %res_0 = amdgpu.scaled_mfma 16x16x128 (%sA[0] * %opA) * (%sB[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
   return %res_0 : vector<4xf32>
 }
 
 // -----
 
 // CHECK-LABEL: func @scaled_mfma_ugly_shapes
-// CHECK: amdgpu.scaled_mfma(%{{.*}}[0] * %{{.*}}) * (%{{.*}}[3] * %arg1) + %cst {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
-// CHECK: amdgpu.scaled_mfma(%{{.*}}[1] * %{{.*}}) * (%{{.*}}[3] * %arg1) + %cst {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
-// CHECK: amdgpu.scaled_mfma(%{{.*}}[2] * %{{.*}}) * (%{{.*}}[2] * %arg1) + %cst {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
-// CHECK: amdgpu.scaled_mfma(%{{.*}}[3] * %{{.*}}) * (%{{.*}}[1] * %arg1) + %cst {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+// CHECK: amdgpu.scaled_mfma 16x16x128 (%{{.*}}[0] * %{{.*}}) * (%{{.*}}[3] * %arg1) + %cst : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+// CHECK: amdgpu.scaled_mfma 16x16x128 (%{{.*}}[1] * %{{.*}}) * (%{{.*}}[3] * %arg1) + %cst : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+// CHECK: amdgpu.scaled_mfma 16x16x128 (%{{.*}}[2] * %{{.*}}) * (%{{.*}}[2] * %arg1) + %cst : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+// CHECK: amdgpu.scaled_mfma 16x16x128 (%{{.*}}[3] * %{{.*}}) * (%{{.*}}[1] * %arg1) + %cst : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
 func.func @scaled_mfma_ugly_shapes(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4E2M1FN>, %scalesA: vector<5x5xf8E8M0FNU>, %scalesB: vector<7x23xf8E8M0FNU>) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
   %cst_0 = arith.constant dense<0.000000e+00> : vector<4xf32>
   %cst_1 = arith.constant dense<5.877470e-39> : vector<4xf8E8M0FNU>
@@ -237,10 +237,10 @@ func.func @scaled_mfma_ugly_shapes(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4
   %sB_6_21 = vector.insert %scaleB_6_21, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU>
   %sB_6_20 = vector.insert %scaleB_6_20, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU>
   %sB_6_19 = vector.insert %scaleB_6_19, %cst_1 [0] : f8E8M0FNU into vector<4xf8E8M0FNU>
-
-  %res_4 = amdgpu.scaled_mfma(%sA_0_4[0] * %opA) * (%sB_6_22[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
-  %res_5 = amdgpu.scaled_mfma(%sA_0_5[0] * %opA) * (%sB_6_21[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
-  %res_6 = amdgpu.scaled_mfma(%sA_0_6[0] * %opA) * (%sB_6_20[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
-  %res_7 = amdgpu.scaled_mfma(%sA_0_7[0] * %opA) * (%sB_6_19[0] * %opB) + %cst_0 {k = 128 : i32, m = 16 : i32, n = 16 : i32} : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+
+  %res_4 = amdgpu.scaled_mfma 16x16x128 (%sA_0_4[0] * %opA) * (%sB_6_22[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+  %res_5 = amdgpu.scaled_mfma 16x16x128 (%sA_0_5[0] * %opA) * (%sB_6_21[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+  %res_6 = amdgpu.scaled_mfma 16x16x128 (%sA_0_6[0] * %opA) * (%sB_6_20[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
+  %res_7 = amdgpu.scaled_mfma 16x16x128 (%sA_0_7[0] * %opA) * (%sB_6_19[0] * %opB) + %cst_0 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf32>
   return %res_4, %res_5, %res_6, %res_7 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
 }
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index b27dfdf43ca59..57847641a2d03 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -314,3 +314,27 @@ func.func @amdgpu.scaled_ext_packed816_invalid_input_output_sizes(%v: vector<8xf
   %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<16xf16>
   func.return
 }
+
+// -----
+
+func.func @scaled_mfma_invalid_m(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32xf4E2M1FN>, %arg2 : vector<16xf32>) -> vector<16xf32> {
+  // expected-error@+1 {{'amdgpu.scaled_mfma' op attribute 'm' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16, 32}}}
+  %0 = amdgpu.scaled_mfma 8x32x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32>
+  func.return %0 : vector<16xf32>
+}
+
+// -----
+
+func.func @scaled_mfma_invalid_n(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32xf4E2M1FN>, %arg2 : vector<16xf32>) -> vector<16xf32> {
+  // expected-error@+1 {{'amdgpu.scaled_mfma' op attribute 'n' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16, 32}}}
+  %0 = amdgpu.scaled_mfma 32x8x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32>
+  func.return %0 : vector<16xf32>
+}
+
+// -----
+
+func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32xf4E2M1FN>, %arg2 : vector<16xf32>) -> vector<16xf32> {
+  // expected-error@+1 {{'amdgpu.scaled_mfma' op attribute 'k' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {64, 128}}}
+  %0 = amdgpu.scaled_mfma 32x32x32 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32>
+  func.return %0 : vector<16xf32>
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index d0bf0d73c4875..a33096750ee23 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -609,8 +609,8 @@ func.func @permlane32_swap(%arg0 : f32) -> f32 {
 // CHECK-LABEL: func @scaled_mfma
 func.func @scaled_mfma(%arg0 : f8E8M0FNU, %arg1 : vector<32xf6E2M3FN>, %arg2 : vector<16xf32>) -> vector<16xf32> {
-  // CHECK: amdgpu.scaled_mfma
-  %0 = amdgpu.scaled_mfma(%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : f8E8M0FNU, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32>
+  // CHECK: amdgpu.scaled_mfma 32x32x64
+  %0 = amdgpu.scaled_mfma 32x32x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : f8E8M0FNU, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32>
   func.return %0 : vector<16xf32>
 }