diff --git a/clang/include/clang/AST/Attr.h b/clang/include/clang/AST/Attr.h index fe388b9fa045e..ce273c167aa22 100644 --- a/clang/include/clang/AST/Attr.h +++ b/clang/include/clang/AST/Attr.h @@ -239,6 +239,8 @@ class HLSLSemanticAttr : public HLSLAnnotationAttr { LLVM_PREFERRED_TYPE(bool) unsigned SemanticExplicitIndex : 1; + Decl *TargetDecl = nullptr; + protected: HLSLSemanticAttr(ASTContext &Context, const AttributeCommonInfo &CommonInfo, attr::Kind AK, bool IsLateParsed, @@ -259,6 +261,11 @@ class HLSLSemanticAttr : public HLSLAnnotationAttr { unsigned getSemanticIndex() const { return SemanticIndex; } + bool isSemanticIndexExplicit() const { return SemanticExplicitIndex; } + + void setTargetDecl(Decl *D) { TargetDecl = D; } + Decl *getTargetDecl() const { return TargetDecl; } + // Implement isa/cast/dyncast/etc. static bool classof(const Attr *A) { return A->getKind() >= attr::FirstHLSLSemanticAttr && diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index b320f4bedba0c..749f531ec9ab1 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -787,6 +787,8 @@ class HLSLSemanticAttr : HLSLAnnotationAttr { let Spellings = []; let Subjects = SubjectList<[ParmVar, Field, Function]>; let LangOpts = [HLSL]; + let Args = [DeclArgument, IntArgument<"SemanticIndex">, + BoolArgument<"SemanticExplicitIndex">]; } /// A target-specific attribute. This class is meant to be used as a mixin diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td index 64391def905ab..9e344160ff934 100644 --- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td +++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td @@ -404,10 +404,6 @@ def warn_hlsl_langstd_minimal : "recommend using %1 instead">, InGroup; -def err_hlsl_semantic_missing : Error<"semantic annotations must be present " - "for all input and outputs of an entry " - "function or patch constant function">; - // ClangIR frontend errors def err_cir_to_cir_transform_failed : Error< "CIR-to-CIR transformation failed">, DefaultFatal; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 20e34cea72031..d6412dc0eeaf6 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -13173,6 +13173,7 @@ def err_hlsl_duplicate_parameter_modifier : Error<"duplicate parameter modifier def err_hlsl_missing_semantic_annotation : Error< "semantic annotations must be present for all parameters of an entry " "function or patch constant function">; +def note_hlsl_semantic_used_here : Note<"%0 used here">; def err_hlsl_unknown_semantic : Error<"unknown HLSL semantic %0">; def err_hlsl_semantic_output_not_supported : Error<"semantic %0 does not support output">; diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 86d09d72fe6ca..2b361ed0982c6 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -4052,6 +4052,43 @@ def CIR_ExpectOp : CIR_Op<"expect", [ }]; } +//===----------------------------------------------------------------------===// +// PrefetchOp +//===----------------------------------------------------------------------===// + +def CIR_PrefetchOp : CIR_Op<"prefetch"> { + let summary = "Prefetch operation"; + let description = [{ + The `cir.prefetch` operation is a hint to the code generator to insert a 
+ prefetch instruction if supported; otherwise, it is a no-op. Prefetches + have no effect on the behavior of the program but can change its + performance characteristics. + + ```mlir + cir.prefetch(%0 : !cir.ptr<!void>) locality(1) write + ``` + + $locality is a temporal locality specifier ranging from (0) - no locality, + to (3) - extremely local, keep in cache. If $locality is not present, the + default value is 3. + + $isWrite specifies whether the prefetch is for a 'read' or 'write'. If + $isWrite is not present, the prefetch defaults to 'read'. + }]; + + let arguments = (ins CIR_VoidPtrType:$addr, + DefaultValuedAttr<ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<3>]>, + "3">:$locality, + UnitAttr:$isWrite); + + let assemblyFormat = [{ + (`write` $isWrite^) : (`read`)? + `locality` `(` $locality `)` + $addr `:` qualified(type($addr)) + attr-dict + }]; +} + //===----------------------------------------------------------------------===// // PtrDiffOp //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index f9d3a4ea94806..8c3b6ae176389 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -130,9 +130,6 @@ class SemaHLSL : public SemaBase { bool ActOnUninitializedVarDecl(VarDecl *D); void ActOnEndOfTranslationUnit(TranslationUnitDecl *TU); void CheckEntryPoint(FunctionDecl *FD); - bool isSemanticValid(FunctionDecl *FD, DeclaratorDecl *D); - void CheckSemanticAnnotation(FunctionDecl *EntryPoint, const Decl *Param, - const HLSLAnnotationAttr *AnnotationAttr); bool CheckResourceBinOp(BinaryOperatorKind Opc, Expr *LHSExpr, Expr *RHSExpr, SourceLocation Loc); void DiagnoseAttrStageMismatch( @@ -179,17 +176,17 @@ class SemaHLSL : public SemaBase { bool handleResourceTypeAttr(QualType T, const ParsedAttr &AL); template <typename T> - T *createSemanticAttr(const ParsedAttr &AL, + T *createSemanticAttr(const AttributeCommonInfo &ACI, NamedDecl *TargetDecl, std::optional<uint32_t> Location) { - T *Attr = ::new (getASTContext()) T(getASTContext(), AL); - if (Attr->isSemanticIndexable()) - Attr->setSemanticIndex(Location ?
*Location : 0); - else if (Location.has_value()) { + T *Attr = + ::new (getASTContext()) T(getASTContext(), ACI, TargetDecl, + Location.value_or(0), Location.has_value()); + + if (!Attr->isSemanticIndexable() && Location.has_value()) { Diag(Attr->getLocation(), diag::err_hlsl_semantic_indexing_not_supported) << Attr->getAttrName()->getName(); return nullptr; } - return Attr; } @@ -247,10 +244,25 @@ class SemaHLSL : public SemaBase { IdentifierInfo *RootSigOverrideIdent = nullptr; + struct SemanticInfo { + HLSLSemanticAttr *Semantic; + std::optional Index; + }; + private: void collectResourceBindingsOnVarDecl(VarDecl *D); void collectResourceBindingsOnUserRecordDecl(const VarDecl *VD, const RecordType *RT); + + void checkSemanticAnnotation(FunctionDecl *EntryPoint, const Decl *Param, + const HLSLSemanticAttr *SemanticAttr); + HLSLSemanticAttr *createSemantic(const SemanticInfo &Semantic, + DeclaratorDecl *TargetDecl); + bool determineActiveSemanticOnScalar(FunctionDecl *FD, DeclaratorDecl *D, + SemanticInfo &ActiveSemantic); + bool determineActiveSemantic(FunctionDecl *FD, DeclaratorDecl *D, + SemanticInfo &ActiveSemantic); + void processExplicitBindingsOnDecl(VarDecl *D); void diagnoseAvailabilityViolations(TranslationUnitDecl *TU); diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index f15b3c1e5aae4..836d22f6e5389 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -1842,7 +1842,6 @@ bool Compiler::visitInitList(ArrayRef Inits, const Expr *Init, PrimType T, bool Activate = false) -> bool { InitStackScope ISS(this, isa(Init)); - InitLinkScope ILS(this, InitLink::Field(FieldToInit->Offset)); if (!this->visit(Init)) return false; @@ -5385,55 +5384,57 @@ bool Compiler::VisitCXXThisExpr(const CXXThisExpr *E) { // instance pointer of the current function frame, but e.g. to the declaration // currently being initialized. Here we emit the necessary instruction(s) for // this scenario. - if (!InitStackActive) + if (!InitStackActive || InitStack.empty()) return this->emitThis(E); - if (!InitStack.empty()) { - // If our init stack is, for example: - // 0 Stack: 3 (decl) - // 1 Stack: 6 (init list) - // 2 Stack: 1 (field) - // 3 Stack: 6 (init list) - // 4 Stack: 1 (field) - // - // We want to find the LAST element in it that's an init list, - // which is marked with the K_InitList marker. The index right - // before that points to an init list. We need to find the - // elements before the K_InitList element that point to a base - // (e.g. a decl or This), optionally followed by field, elem, etc. - // In the example above, we want to emit elements [0..2]. - unsigned StartIndex = 0; - unsigned EndIndex = 0; - // Find the init list. - for (StartIndex = InitStack.size() - 1; StartIndex > 0; --StartIndex) { - if (InitStack[StartIndex].Kind == InitLink::K_InitList || - InitStack[StartIndex].Kind == InitLink::K_This) { - EndIndex = StartIndex; - --StartIndex; - break; - } + // If our init stack is, for example: + // 0 Stack: 3 (decl) + // 1 Stack: 6 (init list) + // 2 Stack: 1 (field) + // 3 Stack: 6 (init list) + // 4 Stack: 1 (field) + // + // We want to find the LAST element in it that's an init list, + // which is marked with the K_InitList marker. The index right + // before that points to an init list. We need to find the + // elements before the K_InitList element that point to a base + // (e.g. a decl or This), optionally followed by field, elem, etc. + // In the example above, we want to emit elements [0..2]. 
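+  // As a concrete illustration (hypothetical source, not part of this change), evaluating 'this' for the initializer of 'y' in +  //   struct S { struct { int x = 3; int y = this->x; }; }; +  // replays the [decl, init list, field] links from the stack so that 'this' ends up denoting the object currently being initialized.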
+ unsigned StartIndex = 0; + unsigned EndIndex = 0; + // Find the init list. + for (StartIndex = InitStack.size() - 1; StartIndex > 0; --StartIndex) { + if (InitStack[StartIndex].Kind == InitLink::K_InitList || + InitStack[StartIndex].Kind == InitLink::K_This) { + EndIndex = StartIndex; + --StartIndex; + break; } + } - // Walk backwards to find the base. - for (; StartIndex > 0; --StartIndex) { - if (InitStack[StartIndex].Kind == InitLink::K_InitList) - continue; + // Walk backwards to find the base. + for (; StartIndex > 0; --StartIndex) { + if (InitStack[StartIndex].Kind == InitLink::K_InitList) + continue; - if (InitStack[StartIndex].Kind != InitLink::K_Field && - InitStack[StartIndex].Kind != InitLink::K_Elem) - break; - } + if (InitStack[StartIndex].Kind != InitLink::K_Field && + InitStack[StartIndex].Kind != InitLink::K_Elem) + break; + } - // Emit the instructions. - for (unsigned I = StartIndex; I != EndIndex; ++I) { - if (InitStack[I].Kind == InitLink::K_InitList) - continue; - if (!InitStack[I].template emit(this, E)) - return false; - } - return true; + if (StartIndex == 0 && EndIndex == 0) + EndIndex = InitStack.size() - 1; + + assert(StartIndex < EndIndex); + + // Emit the instructions. + for (unsigned I = StartIndex; I != (EndIndex + 1); ++I) { + if (InitStack[I].Kind == InitLink::K_InitList) + continue; + if (!InitStack[I].template emit(this, E)) + return false; } - return this->emitThis(E); + return true; } template bool Compiler::visitStmt(const Stmt *S) { @@ -6295,6 +6296,10 @@ bool Compiler::compileConstructor(const CXXConstructorDecl *Ctor) { } assert(NestedField); + unsigned FirstLinkOffset = + R->getField(cast(IFD->chain()[0]))->Offset; + InitStackScope ISS(this, isa(InitExpr)); + InitLinkScope ILS(this, InitLink::Field(FirstLinkOffset)); if (!emitFieldInitializer(NestedField, NestedFieldOffset, InitExpr, IsUnion)) return false; diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index 27c4d11fa233a..62fa04e15c717 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -454,6 +454,27 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, assert(!cir::MissingFeatures::coroSizeBuiltinCall()); return getUndefRValue(e->getType()); } + case Builtin::BI__builtin_prefetch: { + auto evaluateOperandAsInt = [&](const Expr *arg) { + Expr::EvalResult res; + [[maybe_unused]] bool evalSucceed = + arg->EvaluateAsInt(res, cgm.getASTContext()); + assert(evalSucceed && "expression should be able to evaluate as int"); + return res.Val.getInt().getZExtValue(); + }; + + bool isWrite = false; + if (e->getNumArgs() > 1) + isWrite = evaluateOperandAsInt(e->getArg(1)); + + int locality = 3; + if (e->getNumArgs() > 2) + locality = evaluateOperandAsInt(e->getArg(2)); + + mlir::Value address = emitScalarExpr(e->getArg(0)); + cir::PrefetchOp::create(builder, loc, address, locality, isWrite); + return RValue::get(nullptr); + } } // If this is an alias for a lib function (e.g. 
__builtin_sin), emit diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index a30ae02c895c2..5a6193fa8d840 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -1695,6 +1695,15 @@ static uint64_t getTypeSize(mlir::Type type, mlir::Operation &op) { return llvm::divideCeil(layout.getTypeSizeInBits(type), 8); } +mlir::LogicalResult CIRToLLVMPrefetchOpLowering::matchAndRewrite( + cir::PrefetchOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + rewriter.replaceOpWithNewOp( + op, adaptor.getAddr(), adaptor.getIsWrite(), adaptor.getLocality(), + /*DataCache=*/1); + return mlir::success(); +} + mlir::LogicalResult CIRToLLVMPtrDiffOpLowering::matchAndRewrite( cir::PtrDiffOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index ecab9336a9f82..945f9e2451bc1 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -562,17 +562,16 @@ static llvm::Value *createSPIRVBuiltinLoad(IRBuilder<> &B, llvm::Module &M, return B.CreateLoad(Ty, GV); } -llvm::Value * -CGHLSLRuntime::emitSystemSemanticLoad(IRBuilder<> &B, llvm::Type *Type, - const clang::DeclaratorDecl *Decl, - SemanticInfo &ActiveSemantic) { - if (isa(ActiveSemantic.Semantic)) { +llvm::Value *CGHLSLRuntime::emitSystemSemanticLoad( + IRBuilder<> &B, llvm::Type *Type, const clang::DeclaratorDecl *Decl, + Attr *Semantic, std::optional Index) { + if (isa(Semantic)) { llvm::Function *GroupIndex = CGM.getIntrinsic(getFlattenedThreadIdInGroupIntrinsic()); return B.CreateCall(FunctionCallee(GroupIndex)); } - if (isa(ActiveSemantic.Semantic)) { + if (isa(Semantic)) { llvm::Intrinsic::ID IntrinID = getThreadIdIntrinsic(); llvm::Function *ThreadIDIntrinsic = llvm::Intrinsic::isOverloaded(IntrinID) @@ -581,7 +580,7 @@ CGHLSLRuntime::emitSystemSemanticLoad(IRBuilder<> &B, llvm::Type *Type, return buildVectorInput(B, ThreadIDIntrinsic, Type); } - if (isa(ActiveSemantic.Semantic)) { + if (isa(Semantic)) { llvm::Intrinsic::ID IntrinID = getGroupThreadIdIntrinsic(); llvm::Function *GroupThreadIDIntrinsic = llvm::Intrinsic::isOverloaded(IntrinID) @@ -590,7 +589,7 @@ CGHLSLRuntime::emitSystemSemanticLoad(IRBuilder<> &B, llvm::Type *Type, return buildVectorInput(B, GroupThreadIDIntrinsic, Type); } - if (isa(ActiveSemantic.Semantic)) { + if (isa(Semantic)) { llvm::Intrinsic::ID IntrinID = getGroupIdIntrinsic(); llvm::Function *GroupIDIntrinsic = llvm::Intrinsic::isOverloaded(IntrinID) @@ -599,8 +598,7 @@ CGHLSLRuntime::emitSystemSemanticLoad(IRBuilder<> &B, llvm::Type *Type, return buildVectorInput(B, GroupIDIntrinsic, Type); } - if (HLSLSV_PositionAttr *S = - dyn_cast(ActiveSemantic.Semantic)) { + if (HLSLSV_PositionAttr *S = dyn_cast(Semantic)) { if (CGM.getTriple().getEnvironment() == Triple::EnvironmentType::Pixel) return createSPIRVBuiltinLoad(B, CGM.getModule(), Type, S->getAttrName()->getName(), @@ -611,29 +609,56 @@ CGHLSLRuntime::emitSystemSemanticLoad(IRBuilder<> &B, llvm::Type *Type, } llvm::Value * -CGHLSLRuntime::handleScalarSemanticLoad(IRBuilder<> &B, llvm::Type *Type, - const clang::DeclaratorDecl *Decl, - SemanticInfo &ActiveSemantic) { - - if (!ActiveSemantic.Semantic) { - ActiveSemantic.Semantic = Decl->getAttr(); - if (!ActiveSemantic.Semantic) { - CGM.getDiags().Report(Decl->getInnerLocStart(), - diag::err_hlsl_semantic_missing); - 
return nullptr; +CGHLSLRuntime::handleScalarSemanticLoad(IRBuilder<> &B, const FunctionDecl *FD, + llvm::Type *Type, + const clang::DeclaratorDecl *Decl) { + + HLSLSemanticAttr *Semantic = nullptr; + for (HLSLSemanticAttr *Item : FD->specific_attrs()) { + if (Item->getTargetDecl() == Decl) { + Semantic = Item; + break; } - ActiveSemantic.Index = ActiveSemantic.Semantic->getSemanticIndex(); } + // Sema must create one attribute per scalar field. + assert(Semantic); - return emitSystemSemanticLoad(B, Type, Decl, ActiveSemantic); + std::optional Index = std::nullopt; + if (Semantic->isSemanticIndexExplicit()) + Index = Semantic->getSemanticIndex(); + return emitSystemSemanticLoad(B, Type, Decl, Semantic, Index); } llvm::Value * -CGHLSLRuntime::handleSemanticLoad(IRBuilder<> &B, llvm::Type *Type, - const clang::DeclaratorDecl *Decl, - SemanticInfo &ActiveSemantic) { - assert(!Type->isStructTy()); - return handleScalarSemanticLoad(B, Type, Decl, ActiveSemantic); +CGHLSLRuntime::handleStructSemanticLoad(IRBuilder<> &B, const FunctionDecl *FD, + llvm::Type *Type, + const clang::DeclaratorDecl *Decl) { + const llvm::StructType *ST = cast(Type); + const clang::RecordDecl *RD = Decl->getType()->getAsRecordDecl(); + + assert(std::distance(RD->field_begin(), RD->field_end()) == + ST->getNumElements()); + + llvm::Value *Aggregate = llvm::PoisonValue::get(Type); + auto FieldDecl = RD->field_begin(); + for (unsigned I = 0; I < ST->getNumElements(); ++I) { + llvm::Value *ChildValue = + handleSemanticLoad(B, FD, ST->getElementType(I), *FieldDecl); + assert(ChildValue); + Aggregate = B.CreateInsertValue(Aggregate, ChildValue, I); + ++FieldDecl; + } + + return Aggregate; +} + +llvm::Value * +CGHLSLRuntime::handleSemanticLoad(IRBuilder<> &B, const FunctionDecl *FD, + llvm::Type *Type, + const clang::DeclaratorDecl *Decl) { + if (Type->isStructTy()) + return handleStructSemanticLoad(B, FD, Type, Decl); + return handleScalarSemanticLoad(B, FD, Type, Decl); } void CGHLSLRuntime::emitEntryFunction(const FunctionDecl *FD, @@ -680,8 +705,25 @@ void CGHLSLRuntime::emitEntryFunction(const FunctionDecl *FD, } const ParmVarDecl *PD = FD->getParamDecl(Param.getArgNo() - SRetOffset); - SemanticInfo ActiveSemantic = {nullptr, 0}; - Args.push_back(handleSemanticLoad(B, Param.getType(), PD, ActiveSemantic)); + llvm::Value *SemanticValue = nullptr; + if ([[maybe_unused]] HLSLParamModifierAttr *MA = + PD->getAttr()) { + llvm_unreachable("Not handled yet"); + } else { + llvm::Type *ParamType = + Param.hasByValAttr() ? 
Param.getParamByValType() : Param.getType(); + SemanticValue = handleSemanticLoad(B, FD, ParamType, PD); + if (!SemanticValue) + return; + if (Param.hasByValAttr()) { + llvm::Value *Var = B.CreateAlloca(Param.getParamByValType()); + B.CreateStore(SemanticValue, Var); + SemanticValue = Var; + } + } + + assert(SemanticValue); + Args.push_back(SemanticValue); } CallInst *CI = B.CreateCall(FunctionCallee(Fn), Args, OB); diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index 103b4a98f6c26..d35df524fdc84 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -144,26 +144,24 @@ class CGHLSLRuntime { protected: CodeGenModule &CGM; - void collectInputSemantic(llvm::IRBuilder<> &B, const DeclaratorDecl *D, - llvm::Type *Type, - SmallVectorImpl &Inputs); - - struct SemanticInfo { - clang::HLSLSemanticAttr *Semantic; - uint32_t Index; - }; - llvm::Value *emitSystemSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type, const clang::DeclaratorDecl *Decl, - SemanticInfo &ActiveSemantic); - - llvm::Value *handleScalarSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type, - const clang::DeclaratorDecl *Decl, - SemanticInfo &ActiveSemantic); - - llvm::Value *handleSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type, - const clang::DeclaratorDecl *Decl, - SemanticInfo &ActiveSemantic); + Attr *Semantic, + std::optional Index); + + llvm::Value *handleScalarSemanticLoad(llvm::IRBuilder<> &B, + const FunctionDecl *FD, + llvm::Type *Type, + const clang::DeclaratorDecl *Decl); + + llvm::Value *handleStructSemanticLoad(llvm::IRBuilder<> &B, + const FunctionDecl *FD, + llvm::Type *Type, + const clang::DeclaratorDecl *Decl); + + llvm::Value *handleSemanticLoad(llvm::IRBuilder<> &B, const FunctionDecl *FD, + llvm::Type *Type, + const clang::DeclaratorDecl *Decl); public: CGHLSLRuntime(CodeGenModule &CGM) : CGM(CGM) {} diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index c97a9e81eb59e..1d0dfd0b9c151 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -4407,8 +4407,12 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line, // breaking after it. if (Right.is(TT_SelectorName)) return 0; - if (Left.is(tok::colon) && Left.is(TT_ObjCMethodExpr)) - return Line.MightBeFunctionDecl ? 50 : 500; + if (Left.is(tok::colon)) { + if (Left.is(TT_ObjCMethodExpr)) + return Line.MightBeFunctionDecl ? 
50 : 500; + if (Left.is(TT_ObjCSelector)) + return 500; + } // In Objective-C type declarations, avoid breaking after the category's // open paren (we'll prefer breaking after the protocol list's opening @@ -6291,7 +6295,9 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, TT_BitFieldColon)) { return false; } - if (Left.is(tok::colon) && Left.isOneOf(TT_DictLiteral, TT_ObjCMethodExpr)) { + if (Left.is(tok::colon) && Left.isOneOf(TT_ObjCSelector, TT_ObjCMethodExpr)) + return true; + if (Left.is(tok::colon) && Left.is(TT_DictLiteral)) { if (Style.isProto()) { if (!Style.AlwaysBreakBeforeMultilineStrings && Right.isStringLiteral()) return false; diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 5b3e89f936327..2a485da06908d 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -770,23 +770,81 @@ void SemaHLSL::ActOnTopLevelFunction(FunctionDecl *FD) { } } -bool SemaHLSL::isSemanticValid(FunctionDecl *FD, DeclaratorDecl *D) { - const auto *AnnotationAttr = D->getAttr(); - if (AnnotationAttr) { - CheckSemanticAnnotation(FD, D, AnnotationAttr); - return true; +HLSLSemanticAttr *SemaHLSL::createSemantic(const SemanticInfo &Info, + DeclaratorDecl *TargetDecl) { + std::string SemanticName = Info.Semantic->getAttrName()->getName().upper(); + + if (SemanticName == "SV_DISPATCHTHREADID") { + return createSemanticAttr( + *Info.Semantic, TargetDecl, Info.Index); + } else if (SemanticName == "SV_GROUPINDEX") { + return createSemanticAttr(*Info.Semantic, TargetDecl, + Info.Index); + } else if (SemanticName == "SV_GROUPTHREADID") { + return createSemanticAttr(*Info.Semantic, + TargetDecl, Info.Index); + } else if (SemanticName == "SV_GROUPID") { + return createSemanticAttr(*Info.Semantic, TargetDecl, + Info.Index); + } else if (SemanticName == "SV_POSITION") { + return createSemanticAttr(*Info.Semantic, TargetDecl, + Info.Index); + } else + Diag(Info.Semantic->getLoc(), diag::err_hlsl_unknown_semantic) + << *Info.Semantic; + + return nullptr; +} + +bool SemaHLSL::determineActiveSemanticOnScalar(FunctionDecl *FD, + DeclaratorDecl *D, + SemanticInfo &ActiveSemantic) { + if (ActiveSemantic.Semantic == nullptr) { + ActiveSemantic.Semantic = D->getAttr(); + if (ActiveSemantic.Semantic && + ActiveSemantic.Semantic->isSemanticIndexExplicit()) + ActiveSemantic.Index = ActiveSemantic.Semantic->getSemanticIndex(); + } + + if (!ActiveSemantic.Semantic) { + Diag(D->getLocation(), diag::err_hlsl_missing_semantic_annotation); + return false; + } + + auto *A = createSemantic(ActiveSemantic, D); + if (!A) + return false; + + checkSemanticAnnotation(FD, D, A); + FD->addAttr(A); + return true; +} + +bool SemaHLSL::determineActiveSemantic(FunctionDecl *FD, DeclaratorDecl *D, + SemanticInfo &ActiveSemantic) { + if (ActiveSemantic.Semantic == nullptr) { + ActiveSemantic.Semantic = D->getAttr(); + if (ActiveSemantic.Semantic && + ActiveSemantic.Semantic->isSemanticIndexExplicit()) + ActiveSemantic.Index = ActiveSemantic.Semantic->getSemanticIndex(); } const Type *T = D->getType()->getUnqualifiedDesugaredType(); const RecordType *RT = dyn_cast(T); if (!RT) - return false; + return determineActiveSemanticOnScalar(FD, D, ActiveSemantic); const RecordDecl *RD = RT->getDecl(); for (FieldDecl *Field : RD->fields()) { - if (!isSemanticValid(FD, Field)) + SemanticInfo Info = ActiveSemantic; + if (!determineActiveSemantic(FD, Field, Info)) { + Diag(Field->getLocation(), diag::note_hlsl_semantic_used_here) << Field; return false; + } + if (ActiveSemantic.Semantic) + 
ActiveSemantic = Info; } + return true; } @@ -853,8 +911,11 @@ void SemaHLSL::CheckEntryPoint(FunctionDecl *FD) { } for (ParmVarDecl *Param : FD->parameters()) { - if (!isSemanticValid(FD, Param)) { - Diag(FD->getLocation(), diag::err_hlsl_missing_semantic_annotation); + SemanticInfo ActiveSemantic; + ActiveSemantic.Semantic = nullptr; + ActiveSemantic.Index = std::nullopt; + + if (!determineActiveSemantic(FD, Param, ActiveSemantic)) { Diag(Param->getLocation(), diag::note_previous_decl) << Param; FD->setInvalidDecl(); } @@ -862,31 +923,31 @@ void SemaHLSL::CheckEntryPoint(FunctionDecl *FD) { // FIXME: Verify return type semantic annotation. } -void SemaHLSL::CheckSemanticAnnotation( - FunctionDecl *EntryPoint, const Decl *Param, - const HLSLAnnotationAttr *AnnotationAttr) { +void SemaHLSL::checkSemanticAnnotation(FunctionDecl *EntryPoint, + const Decl *Param, + const HLSLSemanticAttr *SemanticAttr) { auto *ShaderAttr = EntryPoint->getAttr(); assert(ShaderAttr && "Entry point has no shader attribute"); llvm::Triple::EnvironmentType ST = ShaderAttr->getType(); - switch (AnnotationAttr->getKind()) { + switch (SemanticAttr->getKind()) { case attr::HLSLSV_DispatchThreadID: case attr::HLSLSV_GroupIndex: case attr::HLSLSV_GroupThreadID: case attr::HLSLSV_GroupID: if (ST == llvm::Triple::Compute) return; - DiagnoseAttrStageMismatch(AnnotationAttr, ST, {llvm::Triple::Compute}); + DiagnoseAttrStageMismatch(SemanticAttr, ST, {llvm::Triple::Compute}); break; case attr::HLSLSV_Position: // TODO(#143523): allow use on other shader types & output once the overall // semantic logic is implemented. if (ST == llvm::Triple::Pixel) return; - DiagnoseAttrStageMismatch(AnnotationAttr, ST, {llvm::Triple::Pixel}); + DiagnoseAttrStageMismatch(SemanticAttr, ST, {llvm::Triple::Pixel}); break; default: - llvm_unreachable("Unknown HLSLAnnotationAttr"); + llvm_unreachable("Unknown SemanticAttr"); } } @@ -1661,28 +1722,30 @@ void SemaHLSL::diagnoseSystemSemanticAttr(Decl *D, const ParsedAttr &AL, diagnoseInputIDType(ValueType, AL); if (IsOutput) Diag(AL.getLoc(), diag::err_hlsl_semantic_output_not_supported) << AL; - Attribute = createSemanticAttr(AL, Index); + Attribute = + createSemanticAttr(AL, nullptr, Index); } else if (SemanticName == "SV_GROUPINDEX") { if (IsOutput) Diag(AL.getLoc(), diag::err_hlsl_semantic_output_not_supported) << AL; - Attribute = createSemanticAttr(AL, Index); + Attribute = createSemanticAttr(AL, nullptr, Index); } else if (SemanticName == "SV_GROUPTHREADID") { diagnoseInputIDType(ValueType, AL); if (IsOutput) Diag(AL.getLoc(), diag::err_hlsl_semantic_output_not_supported) << AL; - Attribute = createSemanticAttr(AL, Index); + Attribute = + createSemanticAttr(AL, nullptr, Index); } else if (SemanticName == "SV_GROUPID") { diagnoseInputIDType(ValueType, AL); if (IsOutput) Diag(AL.getLoc(), diag::err_hlsl_semantic_output_not_supported) << AL; - Attribute = createSemanticAttr(AL, Index); + Attribute = createSemanticAttr(AL, nullptr, Index); } else if (SemanticName == "SV_POSITION") { const auto *VT = ValueType->getAs(); if (!ValueType->hasFloatingRepresentation() || (VT && VT->getNumElements() > 4)) Diag(AL.getLoc(), diag::err_hlsl_attr_invalid_type) << AL << "float/float1/float2/float3/float4"; - Attribute = createSemanticAttr(AL, Index); + Attribute = createSemanticAttr(AL, nullptr, Index); } else Diag(AL.getLoc(), diag::err_hlsl_unknown_semantic) << AL; diff --git a/clang/test/AST/ByteCode/records.cpp b/clang/test/AST/ByteCode/records.cpp index 48cf81182c1de..00218ba02bb31 100644 --- 
a/clang/test/AST/ByteCode/records.cpp +++ b/clang/test/AST/ByteCode/records.cpp @@ -1162,6 +1162,19 @@ namespace IndirectFieldInit { static_assert(s2.x == 1 && s2.y == 2 && s2.a == 3 && s2.b == 4); #endif + + + struct B { + struct { + union { + int x = 3; + }; + int y = this->x; + }; + + constexpr B() {} + }; + static_assert(B().y == 3, ""); } namespace InheritedConstructor { @@ -1840,3 +1853,11 @@ namespace DiamondDowncast { constexpr Middle2 &fail = (Middle2&)top1; // both-error {{must be initialized by a constant expression}} \ // both-note {{cannot cast object of dynamic type 'const Bottom' to type 'Middle2'}} } + +namespace PrimitiveInitializedByInitList { + constexpr struct { + int a; + int b{this->a}; + } c{ 17 }; + static_assert(c.b == 17, ""); +} diff --git a/clang/test/CIR/CodeGen/builtin_prefetech.c b/clang/test/CIR/CodeGen/builtin_prefetech.c new file mode 100644 index 0000000000000..cfe85b9ba8104 --- /dev/null +++ b/clang/test/CIR/CodeGen/builtin_prefetech.c @@ -0,0 +1,43 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s -o - | FileCheck %s -check-prefix=CIR +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o - | FileCheck %s -check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=OGCG + +void foo(void *a) { + __builtin_prefetch(a); // rw=0, locality=3 + __builtin_prefetch(a, 0); // rw=0, locality=3 + __builtin_prefetch(a, 1); // rw=1, locality=3 + __builtin_prefetch(a, 1, 1); // rw=1, locality=1 +} + +// CIR-LABEL: cir.func dso_local @foo( +// CIR: %[[ALLOCA:.*]] = cir.alloca !cir.ptr +// CIR: cir.store %arg0, %[[ALLOCA]] : !cir.ptr, !cir.ptr> +// CIR: %[[P1:.*]] = cir.load{{.*}} %[[ALLOCA]] : !cir.ptr>, !cir.ptr +// CIR: cir.prefetch read locality(3) %[[P1]] : !cir.ptr +// CIR: %[[P2:.*]] = cir.load{{.*}} %[[ALLOCA]] : !cir.ptr>, !cir.ptr +// CIR: cir.prefetch read locality(3) %[[P2]] : !cir.ptr +// CIR: %[[P3:.*]] = cir.load{{.*}} %[[ALLOCA]] : !cir.ptr>, !cir.ptr +// CIR: cir.prefetch write locality(3) %[[P3]] : !cir.ptr +// CIR: %[[P4:.*]] = cir.load{{.*}} %[[ALLOCA]] : !cir.ptr>, !cir.ptr +// CIR: cir.prefetch write locality(1) %[[P4]] : !cir.ptr +// CIR: cir.return + +// LLVM-LABEL: define dso_local void @foo( +// LLVM: [[ALLOCA:%.*]] = alloca ptr, i64 1 +// LLVM: store ptr {{.*}}, ptr [[ALLOCA]] +// LLVM: [[LP1:%.*]] = load ptr, ptr [[ALLOCA]] +// LLVM: call void @llvm.prefetch.p0(ptr [[LP1]], i32 0, i32 3, i32 1) +// LLVM: [[LP2:%.*]] = load ptr, ptr [[ALLOCA]] +// LLVM: call void @llvm.prefetch.p0(ptr [[LP2]], i32 0, i32 3, i32 1) +// LLVM: [[LP3:%.*]] = load ptr, ptr [[ALLOCA]] +// LLVM: call void @llvm.prefetch.p0(ptr [[LP3]], i32 1, i32 3, i32 1) +// LLVM: [[LP4:%.*]] = load ptr, ptr [[ALLOCA]] +// LLVM: call void @llvm.prefetch.p0(ptr [[LP4]], i32 1, i32 1, i32 1) +// LLVM: ret void + +// OGCG-LABEL: define dso_local void @foo(ptr +// OGCG: call void @llvm.prefetch.p0(ptr {{.*}}, i32 0, i32 3, i32 1) +// OGCG: call void @llvm.prefetch.p0(ptr {{.*}}, i32 0, i32 3, i32 1) +// OGCG: call void @llvm.prefetch.p0(ptr {{.*}}, i32 1, i32 3, i32 1) +// OGCG: call void @llvm.prefetch.p0(ptr {{.*}}, i32 1, i32 1, i32 1) +// OGCG: ret void diff --git a/clang/test/CodeGenHLSL/semantics/semantic-struct-1.hlsl b/clang/test/CodeGenHLSL/semantics/semantic-struct-1.hlsl new file mode 100644 index 0000000000000..ddd0baed41f37 --- /dev/null +++ b/clang/test/CodeGenHLSL/semantics/semantic-struct-1.hlsl @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -triple 
dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv + + +struct Input { + uint Idx : SV_DispatchThreadID; + +}; + +// Make sure SV_DispatchThreadID translated into dx.thread.id. + +// CHECK: define void @foo() +// CHECK-DXIL: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0) +// CHECK-SPIRV: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0) +// CHECK: %[[#TMP:]] = insertvalue %struct.Input poison, i32 %[[#ID]], 0 +// CHECK: %[[#VAR:]] = alloca %struct.Input, align 8 +// CHECK: store %struct.Input %[[#TMP]], ptr %[[#VAR]], align 4 +// CHECK-DXIL: call void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +// CHECK-SPIRV: call spir_func void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +[shader("compute")] +[numthreads(8,8,1)] +void foo(Input input) {} + diff --git a/clang/test/CodeGenHLSL/semantics/semantic-struct-2.hlsl b/clang/test/CodeGenHLSL/semantics/semantic-struct-2.hlsl new file mode 100644 index 0000000000000..0d9c91e746454 --- /dev/null +++ b/clang/test/CodeGenHLSL/semantics/semantic-struct-2.hlsl @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv + + +struct Input { + uint Idx : SV_DispatchThreadID; + uint Gid : SV_GroupID; +}; + +// Make sure SV_DispatchThreadID translated into dx.thread.id. + +// CHECK: define void @foo() +// CHECK-DXIL: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0) +// CHECK-SPIRV: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0) +// CHECK: %[[#TMP1:]] = insertvalue %struct.Input poison, i32 %[[#ID]], 0 +// CHECK-DXIL: %[[#GID:]] = call i32 @llvm.[[TARGET]].group.id(i32 0) +// CHECK-SPIRV:%[[#GID:]] = call i32 @llvm.[[TARGET]].group.id.i32(i32 0) +// CHECK: %[[#TMP2:]] = insertvalue %struct.Input %[[#TMP1]], i32 %[[#GID]], 1 +// CHECK: %[[#VAR:]] = alloca %struct.Input, align 8 +// CHECK: store %struct.Input %[[#TMP2]], ptr %[[#VAR]], align 4 +// CHECK-DXIL: call void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +// CHECK-SPIRV: call spir_func void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +[shader("compute")] +[numthreads(8,8,1)] +void foo(Input input) {} diff --git a/clang/test/CodeGenHLSL/semantics/semantic-struct-nested-inherit.hlsl b/clang/test/CodeGenHLSL/semantics/semantic-struct-nested-inherit.hlsl new file mode 100644 index 0000000000000..f4c4d86933ca1 --- /dev/null +++ b/clang/test/CodeGenHLSL/semantics/semantic-struct-nested-inherit.hlsl @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv + + +struct Inner { + uint Gid; +}; + +struct Input { + uint Idx : SV_DispatchThreadID; + Inner inner : SV_GroupIndex; +}; + +// Make sure SV_DispatchThreadID translated into dx.thread.id. 
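+// Note: 'Inner::Gid' carries no semantic of its own here, so under this +// patch's propagation rules it inherits SV_GroupIndex from the annotation +// on the 'inner' field, as the CHECK lines below verify.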
+ +// CHECK: define void @foo() +// CHECK-DXIL: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0) +// CHECK-SPIRV: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0) +// CHECK: %[[#TMP1:]] = insertvalue %struct.Input poison, i32 %[[#ID]], 0 +// CHECK-DXIL: %[[#GID:]] = call i32 @llvm.dx.flattened.thread.id.in.group() +// CHECK-SPIRV:%[[#GID:]] = call i32 @llvm.spv.flattened.thread.id.in.group() +// CHECK: %[[#TMP2:]] = insertvalue %struct.Inner poison, i32 %[[#GID]], 0 +// CHECK: %[[#TMP3:]] = insertvalue %struct.Input %[[#TMP1]], %struct.Inner %[[#TMP2]], 1 +// CHECK: %[[#VAR:]] = alloca %struct.Input, align 8 +// CHECK: store %struct.Input %[[#TMP3]], ptr %[[#VAR]], align 4 +// CHECK-DXIL: call void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +// CHECK-SPIRV: call spir_func void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +[shader("compute")] +[numthreads(8,8,1)] +void foo(Input input) {} diff --git a/clang/test/CodeGenHLSL/semantics/semantic-struct-nested-shadow.hlsl b/clang/test/CodeGenHLSL/semantics/semantic-struct-nested-shadow.hlsl new file mode 100644 index 0000000000000..e1344dd87a6ed --- /dev/null +++ b/clang/test/CodeGenHLSL/semantics/semantic-struct-nested-shadow.hlsl @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv + + +struct Inner { + uint Gid : SV_GroupID; +}; + +struct Input { + uint Idx : SV_DispatchThreadID; + Inner inner : SV_GroupIndex; +}; + +// Make sure SV_DispatchThreadID translated into dx.thread.id. + +// CHECK: define void @foo() +// CHECK-DXIL: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0) +// CHECK-SPIRV: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0) +// CHECK: %[[#TMP1:]] = insertvalue %struct.Input poison, i32 %[[#ID]], 0 +// CHECK-DXIL: %[[#GID:]] = call i32 @llvm.dx.flattened.thread.id.in.group() +// CHECK-SPIRV:%[[#GID:]] = call i32 @llvm.spv.flattened.thread.id.in.group() +// CHECK: %[[#TMP2:]] = insertvalue %struct.Inner poison, i32 %[[#GID]], 0 +// CHECK: %[[#TMP3:]] = insertvalue %struct.Input %[[#TMP1]], %struct.Inner %[[#TMP2]], 1 +// CHECK: %[[#VAR:]] = alloca %struct.Input, align 8 +// CHECK: store %struct.Input %[[#TMP3]], ptr %[[#VAR]], align 4 +// CHECK-DXIL: call void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +// CHECK-SPIRV: call spir_func void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +[shader("compute")] +[numthreads(8,8,1)] +void foo(Input input) {} diff --git a/clang/test/CodeGenHLSL/semantics/semantic-struct-nested.hlsl b/clang/test/CodeGenHLSL/semantics/semantic-struct-nested.hlsl new file mode 100644 index 0000000000000..cd6f9460bc617 --- /dev/null +++ b/clang/test/CodeGenHLSL/semantics/semantic-struct-nested.hlsl @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv + + +struct Inner { + uint Gid : SV_GroupID; +}; + +struct Input { + uint Idx : SV_DispatchThreadID; + Inner inner; +}; + +// Make sure SV_DispatchThreadID translated into dx.thread.id. 
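+// Note: 'inner' itself has no annotation here, so each nested field keeps +// its own semantic (SV_GroupID on 'Gid'), unlike the shadowing case above.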
+ +// CHECK: define void @foo() +// CHECK-DXIL: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0) +// CHECK-SPIRV: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0) +// CHECK: %[[#TMP1:]] = insertvalue %struct.Input poison, i32 %[[#ID]], 0 +// CHECK-DXIL: %[[#GID:]] = call i32 @llvm.[[TARGET]].group.id(i32 0) +// CHECK-SPIRV:%[[#GID:]] = call i32 @llvm.[[TARGET]].group.id.i32(i32 0) +// CHECK: %[[#TMP2:]] = insertvalue %struct.Inner poison, i32 %[[#GID]], 0 +// CHECK: %[[#TMP3:]] = insertvalue %struct.Input %[[#TMP1]], %struct.Inner %[[#TMP2]], 1 +// CHECK: %[[#VAR:]] = alloca %struct.Input, align 8 +// CHECK: store %struct.Input %[[#TMP3]], ptr %[[#VAR]], align 4 +// CHECK-DXIL: call void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +// CHECK-SPIRV: call spir_func void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +[shader("compute")] +[numthreads(8,8,1)] +void foo(Input input) {} diff --git a/clang/test/SemaHLSL/Semantics/entry_parameter.hlsl b/clang/test/SemaHLSL/Semantics/entry_parameter.hlsl index 393d7300605c0..bcc94f0632d64 100644 --- a/clang/test/SemaHLSL/Semantics/entry_parameter.hlsl +++ b/clang/test/SemaHLSL/Semantics/entry_parameter.hlsl @@ -11,4 +11,9 @@ void CSMain(int GI : SV_GroupIndex, uint ID : SV_DispatchThreadID, uint GID : SV // CHECK-NEXT: HLSLSV_GroupIDAttr // CHECK-NEXT: ParmVarDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:96 GThreadID 'uint' // CHECK-NEXT: HLSLSV_GroupThreadIDAttr + +// CHECK: HLSLSV_GroupIndexAttr 0x{{[0-9a-fA-F]+}} <{{.*}}> ParmVar 0x{{[0-9a-fA-F]+}} 'GI' 'int' 0 +// CHECK-NEXT: HLSLSV_DispatchThreadIDAttr 0x{{[0-9a-fA-F]+}} <{{.*}}> ParmVar 0x{{[0-9a-fA-F]+}} 'ID' 'uint':'unsigned int' 0 +// CHECK-NEXT: HLSLSV_GroupIDAttr 0x{{[0-9a-fA-F]+}} <{{.*}}> ParmVar 0x{{[0-9a-fA-F]+}} 'GID' 'uint':'unsigned int' 0 +// CHECK-NEXT: HLSLSV_GroupThreadIDAttr 0x{{[0-9a-fA-F]+}} <{{.*}}> ParmVar 0x{{[0-9a-fA-F]+}} 'GThreadID' 'uint':'unsigned int' 0 } diff --git a/clang/test/SemaHLSL/Semantics/position.ps.hlsl b/clang/test/SemaHLSL/Semantics/position.ps.hlsl index 32bc5f55b2abd..27a8e4a0e2662 100644 --- a/clang/test/SemaHLSL/Semantics/position.ps.hlsl +++ b/clang/test/SemaHLSL/Semantics/position.ps.hlsl @@ -1,7 +1,10 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-pixel -x hlsl -finclude-default-header -o - %s -ast-dump | FileCheck %s -float4 main(float4 a : SV_Position) { +// FIXME(Keenuts): add mandatory output semantic once those are implemented. 
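+// Note: the trailing '2' in SV_Position2 below is an explicit semantic +// index; it surfaces in the AST dump as '2 SemanticExplicitIndex'.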
+float4 main(float4 a : SV_Position2) { // CHECK: FunctionDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> line:[[@LINE-1]]:8 main 'float4 (float4)' // CHECK-NEXT: ParmVarDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:20 a 'float4':'vector' // CHECK-NEXT: HLSLSV_PositionAttr 0x{{[0-9a-fA-F]+}} <{{.*}}> + +// CHECK: HLSLSV_PositionAttr 0x{{[0-9a-fA-F]+}} <{{.*}}> ParmVar 0x{{[0-9a-fA-F]+}} 'a' 'float4':'vector' 2 SemanticExplicitIndex } diff --git a/clang/test/SemaHLSL/Semantics/position.ps.struct.hlsl b/clang/test/SemaHLSL/Semantics/position.ps.struct.hlsl new file mode 100644 index 0000000000000..9f57231bea0c1 --- /dev/null +++ b/clang/test/SemaHLSL/Semantics/position.ps.struct.hlsl @@ -0,0 +1,19 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-pixel -x hlsl -finclude-default-header -o - %s -ast-dump | FileCheck %s + +struct S { + float4 f0 : SV_Position; +// CHECK: FieldDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:10 f0 'float4':'vector' +// CHECK: HLSLSV_PositionAttr 0x{{[0-9a-fA-F]+}} <<>> 0 + float4 f1 : SV_Position3; +// CHECK: FieldDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:10 f1 'float4':'vector' +// CHECK: HLSLSV_PositionAttr 0x{{[0-9a-fA-F]+}} <<>> 3 SemanticExplicitIndex +}; + +// FIXME(Keenuts): add mandatory output semantic once those are implemented. +float4 main(S s) { +// CHECK: FunctionDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> line:[[@LINE-1]]:8 main 'float4 (S)' +// CHECK-NEXT: ParmVarDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:15 s 'S' + +// CHECK: HLSLSV_PositionAttr 0x{{[0-9a-fA-F]+}} <{{.*}}> Field 0x{{[0-9a-fA-F]+}} 'f0' 'float4':'vector' 0 +// CHECK: HLSLSV_PositionAttr 0x{{[0-9a-fA-F]+}} <{{.*}}> Field 0x{{[0-9a-fA-F]+}} 'f1' 'float4':'vector' 3 SemanticExplicitIndex +} diff --git a/clang/test/SemaHLSL/Semantics/struct_input.hlsl b/clang/test/SemaHLSL/Semantics/struct_input.hlsl new file mode 100644 index 0000000000000..66cab9552d4d9 --- /dev/null +++ b/clang/test/SemaHLSL/Semantics/struct_input.hlsl @@ -0,0 +1,19 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -finclude-default-header -o - %s -verify + +struct S { + float4 f0 : SV_Position; +// expected-error@+2 {{semantic annotations must be present for all parameters of an entry function or patch constant function}} +// expected-note@+1 {{'f1' used here}} + float4 f1; +}; + +[shader("pixel")] +// expected-note@+1 {{'s' declared here}} +void main(S s) { +} + +[shader("pixel")] +// expected-error@+2 {{semantic annotations must be present for all parameters of an entry function or patch constant function}} +// expected-note@+1 {{'f' declared here}} +void main2(float4 p : SV_POSITION, float4 f) +{ } diff --git a/clang/unittests/Format/FormatTestObjC.cpp b/clang/unittests/Format/FormatTestObjC.cpp index 700d7cf8efca6..cf8143ace7b45 100644 --- a/clang/unittests/Format/FormatTestObjC.cpp +++ b/clang/unittests/Format/FormatTestObjC.cpp @@ -949,6 +949,12 @@ TEST_F(FormatTestObjC, FormatObjCMethodExpr) { "[aaaaaaaaaaaaaaaaaaaaaaaaa\n" " aaaaaaaaaaaaaaaaa:aaaaaaaa\n" " aaa:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa];"); + verifyFormat("[objectName\n" + " respondsToSelector:\n" + " @selector(\n" + " somelonglonglonglongnameeeeeeee:\n" + " loooooooooanotherlonglonglonglongnametopush:\n" + " otherlongnameforlimit:)];"); Style = getChromiumStyle(FormatStyle::LK_ObjC); Style.ColumnLimit = 80; diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index 1342e1a6ffb5b..183952af590e1 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -3078,6 +3078,17 @@ static 
void emitAttributes(const RecordKeeper &Records, raw_ostream &OS, OS << " {\n"; + // The generator puts the arguments for each attribute in the child class, + // even if those are declared in an inherited attribute class in the TD + // file. This means the parent class cannot access them, hence this + // workaround. Maybe the generator should be changed so that arguments are + // emitted in the class where they are declared in the TD file? + if (HLSLSemantic) { + OS << " if (SemanticExplicitIndex)\n"; + OS << " setSemanticIndex(SemanticIndex);\n"; + OS << " setTargetDecl(Target);\n"; + } + for (auto const &ai : Args) { if (!shouldEmitArg(ai)) continue; diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 1fc59c702fd81..d7861ac6463c8 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -816,8 +816,9 @@ static void genDeclareDataOperandOperations( Fortran::semantics::FindCommonBlockContaining(symbol)) { emitCommonGlobal( converter, builder, accObject, dataClause, - [&](mlir::OpBuilder &modBuilder, mlir::Location loc, - fir::GlobalOp globalOp, mlir::acc::DataClause clause, + [&](mlir::OpBuilder &modBuilder, [[maybe_unused]] mlir::Location loc, + [[maybe_unused]] fir::GlobalOp globalOp, + [[maybe_unused]] mlir::acc::DataClause clause, std::stringstream &asFortranStr, const std::string &ctorName) { if constexpr (std::is_same_v) { createDeclareGlobalOp< diff --git a/llvm/docs/TableGen/index.rst b/llvm/docs/TableGen/index.rst index e916c152f5a43..e334b2e4b23ac 100644 --- a/llvm/docs/TableGen/index.rst +++ b/llvm/docs/TableGen/index.rst @@ -20,7 +20,7 @@ domain-specific information. Because there may be a large number of these records, it is specifically designed to allow writing flexible descriptions and for common features of these records to be factored out. This reduces the amount of duplication in the description, reduces the chance of error, and makes -it easier to structure domain specific information. +it easier to structure domain-specific information. The TableGen front end parses a file, instantiates the declarations, and hands the result off to a domain-specific `backend`_ for processing. See @@ -44,8 +44,8 @@ distribution, respectively. The TableGen program ==================== -TableGen files are interpreted by the TableGen program: `llvm-tblgen` available -on your build directory under `bin`. It is not installed in the system (or where +TableGen files are interpreted by the TableGen program: ``llvm-tblgen`` available +in your build directory under ``bin``. It is not installed in the system (or where your sysroot is set to), since it has no use beyond LLVM's build process. Running TableGen @@ -86,7 +86,7 @@ the `-dump-json` option. If you plan to use TableGen, you will most likely have to write a `backend`_ that extracts the information specific to what you need and formats it in the -appropriate way. You can do this by extending TableGen itself in C++, or by +appropriate way. You can do this by extending TableGen itself in C++ or by writing a script in any language that can consume the JSON output. Example @@ -152,7 +152,7 @@ of the x86 architecture. ``def ADD32rr`` defines a record named ``ADD32rr``, and the comment at the end of the line indicates the superclasses of the definition.
The body of the record contains all of the data that TableGen assembled for the record, indicating that the instruction is part of -the "X86" namespace, the pattern indicating how the instruction is selected by +the ``X86`` namespace, the pattern indicating how the instruction is selected by the code generator, that it is a two-address instruction, has a particular encoding, etc. The contents and semantics of the information in the record are specific to the needs of the X86 backend, and are only shown as an example. @@ -175,13 +175,13 @@ TableGen, all of the information was derived from the following definition: This definition makes use of the custom class ``I`` (extended from the custom class ``X86Inst``), which is defined in the X86-specific TableGen file, to factor out the common features that instructions of its class share. A key -feature of TableGen is that it allows the end-user to define the abstractions +feature of TableGen is that it allows the end user to define the abstractions they prefer to use when describing their information. Syntax ====== -TableGen has a syntax that is loosely based on C++ templates, with built-in +TableGen has a syntax loosely based on C++ templates, with built-in types and specification. In addition, TableGen's syntax introduces some automation concepts like multiclass, foreach, let, etc. @@ -193,41 +193,41 @@ which are considered 'records'. **TableGen records** have a unique name, a list of values, and a list of superclasses. The list of values is the main data that TableGen builds for each -record; it is this that holds the domain specific information for the +record; it is this that holds the domain-specific information for the application. The interpretation of this data is left to a specific `backend`_, -but the structure and format rules are taken care of and are fixed by +but the structure and format rules are taken care of and fixed by TableGen. **TableGen definitions** are the concrete form of 'records'. These generally do -not have any undefined values, and are marked with the '``def``' keyword. +not have any undefined values and are marked with the '``def``' keyword. .. code-block:: text def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", "Enable ARMv8 FP">; -In this example, FeatureFPARMv8 is ``SubtargetFeature`` record initialised +In this example, ``FeatureFPARMv8`` is ``SubtargetFeature`` record initialised with some values. The names of the classes are defined via the -keyword `class` either on the same file or some other included. Most target +keyword `class` either in the same file or some other included. Most target TableGen files include the generic ones in ``include/llvm/Target``. **TableGen classes** are abstract records that are used to build and describe other records. These classes allow the end-user to build abstractions for -either the domain they are targeting (such as "Register", "RegisterClass", and -"Instruction" in the LLVM code generator) or for the implementor to help factor -out common properties of records (such as "FPInst", which is used to represent +either the domain they are targeting (such as ``Register``, ``RegisterClass``, and +``Instruction`` in the LLVM code generator) or for the implementor to help factor +out common properties of records (such as ``FPInst``, which is used to represent floating point instructions in the X86 backend). 
TableGen keeps track of all of the classes that are used to build up a definition, so the backend can find all -definitions of a particular class, such as "Instruction". +definitions of a particular class, such as ``Instruction``. .. code-block:: text class ProcNoItin<string Name, list<SubtargetFeature> Features> : Processor<Name, NoItineraries, Features>; -Here, the class ProcNoItin, receiving parameters `Name` of type `string` and -a list of target features is specializing the class Processor by passing the -arguments down as well as hard-coding NoItineraries. +Here, the class ``ProcNoItin``, receiving parameters ``Name`` of type ``string`` and +a list of target features, is specializing the class ``Processor`` by passing the +arguments down as well as hard-coding ``NoItineraries``. **TableGen multiclasses** are groups of abstract records that are instantiated all at once. Each instantiation can result in multiple TableGen definitions. @@ -295,8 +295,8 @@ TableGen Deficiencies ===================== Despite being very generic, TableGen has some deficiencies that have been pointed out numerous times. The common theme is that, while TableGen allows -you to build domain specific languages, the final languages that you create -lack the power of other DSLs, which in turn increase considerably the size +you to build domain-specific languages, the final languages that you create +lack the power of other DSLs, which in turn considerably increases the size and complexity of TableGen files. At the same time, TableGen allows you to create virtually any meaning of @@ -305,6 +305,6 @@ design and make it very hard for newcomers to understand the evil TableGen file. There are some in favor of extending the semantics even more, but making sure -backends adhere to strict rules. Others are suggesting we should move to less, +backends adhere to strict rules. Others suggest moving to fewer, more powerful DSLs designed with specific purposes, or even reusing existing DSLs. diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 5d3b233ed6b6a..7b7dc1b46dd80 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -227,6 +227,9 @@ class TargetTransformInfo { /// Get the kind of extension that an instruction represents. LLVM_ABI static PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I); + /// Get the kind of extension that a cast opcode represents. + LLVM_ABI static PartialReductionExtendKind + getPartialReductionExtendKind(Instruction::CastOps CastOpc); /// Construct a TTI object using a type implementing the \c Concept /// API below.
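Note on the `TargetTransformInfo` hunk above: the new overload classifies an extension from a cast opcode alone, without needing an `Instruction *`. A minimal sketch of the mapping a caller could expect, assuming the existing `PR_None`/`PR_SignExtend`/`PR_ZeroExtend` enumerators (an illustration, not code from this patch):

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"

using llvm::TargetTransformInfo;

// Hypothetical helper mirroring what the CastOps-based overload implies:
// only sext/zext represent integer extensions for partial reductions.
static TargetTransformInfo::PartialReductionExtendKind
classifyExtend(llvm::Instruction::CastOps Opc) {
  if (Opc == llvm::Instruction::SExt)
    return TargetTransformInfo::PR_SignExtend;
  if (Opc == llvm::Instruction::ZExt)
    return TargetTransformInfo::PR_ZeroExtend;
  return TargetTransformInfo::PR_None; // Any other cast: no extension.
}
```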
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/llvm/include/llvm/ExecutionEngine/Orc/Core.h index 8613ddd8e3b11..f05febfcc96b4 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Core.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Core.h @@ -448,7 +448,7 @@ class LLVM_ABI FailedToMaterialize : public ErrorInfo { FailedToMaterialize(std::shared_ptr SSP, std::shared_ptr Symbols); - ~FailedToMaterialize(); + ~FailedToMaterialize() override; std::error_code convertToErrorCode() const override; void log(raw_ostream &OS) const override; const SymbolDependenceMap &getSymbols() const { return *Symbols; } diff --git a/llvm/include/llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h b/llvm/include/llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h index 254b897bca614..e84eb4bec297a 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h @@ -70,7 +70,7 @@ class LLVM_ABI DebugObjectManagerPlugin : public ObjectLinkingLayer::Plugin { DebugObjectManagerPlugin(ExecutionSession &ES, std::unique_ptr Target, bool RequireDebugSections, bool AutoRegisterCode); - ~DebugObjectManagerPlugin(); + ~DebugObjectManagerPlugin() override; void notifyMaterializing(MaterializationResponsibility &MR, jitlink::LinkGraph &G, jitlink::JITLinkContext &Ctx, diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Debugging/PerfSupportPlugin.h b/llvm/include/llvm/ExecutionEngine/Orc/Debugging/PerfSupportPlugin.h index 179fedc3972ab..f9b1d2c273bb6 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Debugging/PerfSupportPlugin.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Debugging/PerfSupportPlugin.h @@ -32,7 +32,7 @@ class LLVM_ABI PerfSupportPlugin : public ObjectLinkingLayer::Plugin { ExecutorAddr RegisterPerfEndAddr, ExecutorAddr RegisterPerfImplAddr, bool EmitDebugInfo, bool EmitUnwindInfo); - ~PerfSupportPlugin(); + ~PerfSupportPlugin() override; void modifyPassConfig(MaterializationResponsibility &MR, jitlink::LinkGraph &G, diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h index fa48480f265a9..031bb27aeb27c 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.h @@ -52,7 +52,7 @@ class LLVM_ABI EPCGenericRTDyldMemoryManager EPCGenericRTDyldMemoryManager(EPCGenericRTDyldMemoryManager &&) = delete; EPCGenericRTDyldMemoryManager & operator=(EPCGenericRTDyldMemoryManager &&) = delete; - ~EPCGenericRTDyldMemoryManager(); + ~EPCGenericRTDyldMemoryManager() override; uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, unsigned SectionID, diff --git a/llvm/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h index fecffc2a0bb32..fa4bb9dcecd87 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h @@ -285,7 +285,7 @@ class LLVM_ABI IndirectStubsManager : public RedirectableSymbolManager { /// Map type for initializing the manager. See init. using StubInitsMap = StringMap>; - virtual ~IndirectStubsManager() = default; + ~IndirectStubsManager() override = default; /// Create a single stub with the given name, target address and flags. 
virtual Error createStub(StringRef StubName, ExecutorAddr StubAddr, diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Layer.h b/llvm/include/llvm/ExecutionEngine/Orc/Layer.h index 8dfc12ee9fad4..25df380413d5b 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Layer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Layer.h @@ -136,7 +136,7 @@ class LLVM_ABI ObjectLayer : public RTTIExtends { static char ID; ObjectLayer(ExecutionSession &ES); - virtual ~ObjectLayer(); + ~ObjectLayer() override; /// Returns the execution session for this layer. ExecutionSession &getExecutionSession() { return ES; } diff --git a/llvm/include/llvm/ExecutionEngine/Orc/LinkGraphLinkingLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/LinkGraphLinkingLayer.h index d3643f9301134..2079a359e1e20 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/LinkGraphLinkingLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/LinkGraphLinkingLayer.h @@ -88,7 +88,7 @@ class LLVM_ABI LinkGraphLinkingLayer : public LinkGraphLayer, std::unique_ptr MemMgr); /// Destroy the LinkGraphLinkingLayer. - ~LinkGraphLinkingLayer(); + ~LinkGraphLinkingLayer() override; /// Add a plugin. LinkGraphLinkingLayer &addPlugin(std::shared_ptr P) { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h index 1fb472a177d6d..8c6a8f5899c17 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h @@ -58,7 +58,7 @@ class LLVM_ABI RTDyldObjectLinkingLayer RTDyldObjectLinkingLayer(ExecutionSession &ES, GetMemoryManagerFunction GetMemoryManager); - ~RTDyldObjectLinkingLayer(); + ~RTDyldObjectLinkingLayer() override; /// Emit the object. void emit(std::unique_ptr R, diff --git a/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h b/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h index 7acb6a4db08c2..81761831bdd96 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h @@ -69,7 +69,7 @@ class LLVM_ABI SimpleRemoteEPC : public ExecutorProcessControl, SimpleRemoteEPC &operator=(const SimpleRemoteEPC &) = delete; SimpleRemoteEPC(SimpleRemoteEPC &&) = delete; SimpleRemoteEPC &operator=(SimpleRemoteEPC &&) = delete; - ~SimpleRemoteEPC(); + ~SimpleRemoteEPC() override; Expected runAsMain(ExecutorAddr MainFnAddr, ArrayRef Args) override; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h index 85c2d655837b5..2c385de48ddf6 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h @@ -29,7 +29,7 @@ namespace rt_bootstrap { class LLVM_ABI ExecutorSharedMemoryMapperService final : public ExecutorBootstrapService { public: - ~ExecutorSharedMemoryMapperService(){}; + ~ExecutorSharedMemoryMapperService() override {}; Expected> reserve(uint64_t Size); Expected initialize(ExecutorAddr Reservation, diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.h index 7526a29dc1066..7ca2ff2a2fd94 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.h +++ 
b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorDylibManager.h @@ -37,7 +37,7 @@ namespace rt_bootstrap { /// Simple page-based allocator. class LLVM_ABI SimpleExecutorDylibManager : public ExecutorBootstrapService { public: - virtual ~SimpleExecutorDylibManager(); + ~SimpleExecutorDylibManager() override; Expected open(const std::string &Path, uint64_t Mode); diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h index 6224e92887147..45256ec42463f 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h @@ -32,7 +32,7 @@ namespace rt_bootstrap { /// Simple page-based allocator. class LLVM_ABI SimpleExecutorMemoryManager : public ExecutorBootstrapService { public: - virtual ~SimpleExecutorMemoryManager(); + ~SimpleExecutorMemoryManager() override; Expected reserve(uint64_t Size); Expected initialize(tpctypes::FinalizeRequest &FR); diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TaskDispatch.h b/llvm/include/llvm/ExecutionEngine/Orc/TaskDispatch.h index 9cf6e00ad7131..b73da194cd187 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/TaskDispatch.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/TaskDispatch.h @@ -37,7 +37,7 @@ class LLVM_ABI Task : public RTTIExtends { public: static char ID; - virtual ~Task() = default; + ~Task() override = default; /// Description of the task to be performed. Used for logging. virtual void printDescription(raw_ostream &OS) = 0; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/WaitingOnGraph.h b/llvm/include/llvm/ExecutionEngine/Orc/WaitingOnGraph.h index 45834f14e9bdf..d8506ce830bdc 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/WaitingOnGraph.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/WaitingOnGraph.h @@ -186,9 +186,7 @@ template class WaitingOnGraph { SmallVector SortedElems(ContainerElems.begin(), ContainerElems.end()); llvm::sort(SortedElems); - Hash = hash_combine( - Hash, Container, - hash_combine_range(SortedElems.begin(), SortedElems.end())); + Hash = hash_combine(Hash, Container, hash_combine_range(SortedElems)); } return Hash; } diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h index cdfee727387a5..eb60bee309cf5 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -178,6 +178,8 @@ struct alignas(8) GlobalValueSummaryInfo { /// only be called prior to index-based internalization and promotion. inline void verifyLocal() const; + bool hasLocal() const { return HasLocal; } + private: /// List of global value summary structures for a particular value held /// in the GlobalValueMap. Requires a vector in the case of multiple @@ -239,6 +241,8 @@ struct ValueInfo { void verifyLocal() const { getRef()->second.verifyLocal(); } + bool hasLocal() const { return getRef()->second.hasLocal(); } + // Even if the index is built with GVs available, we may not have one for // summary entries synthesized for profiled indirect call targets. 
bool hasName() const { return !haveGVs() || getValue(); } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index bf62623099a97..c47a1c1b23a37 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1001,13 +1001,25 @@ InstructionCost TargetTransformInfo::getShuffleCost( TargetTransformInfo::PartialReductionExtendKind TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) { - if (isa<SExtInst>(I)) - return PR_SignExtend; - if (isa<ZExtInst>(I)) - return PR_ZeroExtend; + if (auto *Cast = dyn_cast<CastInst>(I)) + return getPartialReductionExtendKind(Cast->getOpcode()); return PR_None; } +TargetTransformInfo::PartialReductionExtendKind +TargetTransformInfo::getPartialReductionExtendKind( + Instruction::CastOps CastOpc) { + switch (CastOpc) { + case Instruction::CastOps::ZExt: + return PR_ZeroExtend; + case Instruction::CastOps::SExt: + return PR_SignExtend; + default: + return PR_None; + } + llvm_unreachable("Unhandled cast opcode"); +} + TTI::CastContextHint TargetTransformInfo::getCastContextHint(const Instruction *I) { if (!I) diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp index 6c7e27e429849..fa04976f06b5a 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp @@ -247,7 +247,7 @@ class InProcessMemoryManager::IPInFlightAlloc StandardSegments(std::move(StandardSegments)), FinalizationSegments(std::move(FinalizationSegments)) {} - ~IPInFlightAlloc() { + ~IPInFlightAlloc() override { assert(!G && "InFlight alloc neither abandoned nor finalized"); } diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp index 75ae80f610a56..4ceff483ad799 100644 --- a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp @@ -38,7 +38,7 @@ class MachODebugObjectSynthesizerBase MachODebugObjectSynthesizerBase(LinkGraph &G, ExecutorAddr RegisterActionAddr) : G(G), RegisterActionAddr(RegisterActionAddr) {} - virtual ~MachODebugObjectSynthesizerBase() = default; + ~MachODebugObjectSynthesizerBase() override = default; Error preserveDebugSections() { if (G.findSectionByName(SynthDebugSectionName)) { diff --git a/llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp index d1a6eaf914a78..a2990ab3805ad 100644 --- a/llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp @@ -55,7 +55,7 @@ class LinkGraphLinkingLayer::JITLinkCtx final : public JITLinkContext { Plugins = Layer.Plugins; } - ~JITLinkCtx() { + ~JITLinkCtx() override { // If there is an object buffer return function then use it to // return ownership of the buffer.
if (Layer.ReturnObjectBuffer && ObjBuffer) diff --git a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp index fd805fbf01fb7..cdde733b3a817 100644 --- a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp +++ b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp @@ -92,7 +92,7 @@ class OrcCAPIMaterializationUnit : public llvm::orc::MaterializationUnit { Name(std::move(Name)), Ctx(Ctx), Materialize(Materialize), Discard(Discard), Destroy(Destroy) {} - ~OrcCAPIMaterializationUnit() { + ~OrcCAPIMaterializationUnit() override { if (Ctx) Destroy(Ctx); } @@ -264,7 +264,7 @@ class CAPIDefinitionGenerator final : public DefinitionGenerator { LLVMOrcCAPIDefinitionGeneratorTryToGenerateFunction TryToGenerate) : Dispose(Dispose), Ctx(Ctx), TryToGenerate(TryToGenerate) {} - ~CAPIDefinitionGenerator() { + ~CAPIDefinitionGenerator() override { if (Dispose) Dispose(Ctx); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index f788c7510f80c..92f260f408674 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4005,24 +4005,20 @@ def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; // load zero-extended i32, bitcast to f64 -def : Pat <(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), - (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; - +def : Pat<(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; // load zero-extended i16, bitcast to f64 -def : Pat <(f64 (bitconvert (i64 (zextloadi16 (am_indexed32 GPR64sp:$Rn, uimm12s2:$offset))))), - (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; - +def : Pat<(f64 (bitconvert (i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; // load zero-extended i8, bitcast to f64 -def : Pat <(f64 (bitconvert (i64 (zextloadi8 (am_indexed32 GPR64sp:$Rn, uimm12s1:$offset))))), - (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; - +def : Pat<(f64 (bitconvert (i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; // load zero-extended i16, bitcast to f32 -def : Pat <(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), - (SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; - +def : Pat<(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; // load zero-extended i8, bitcast to f32 -def : Pat <(f32 (bitconvert (i32 (zextloadi8 (am_indexed16 GPR64sp:$Rn, uimm12s1:$offset))))), - (SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; +def : Pat<(f32 (bitconvert (i32 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; // Pre-fetch. 
def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm", diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index e3370d31a0e39..2053fc45698f5 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1577,18 +1577,26 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) { } static bool isAllActivePredicate(Value *Pred) { - // Look through convert.from.svbool(convert.to.svbool(...) chain. Value *UncastedPred; + + // Look through predicate casts that only remove lanes. if (match(Pred, m_Intrinsic( - m_Intrinsic( - m_Value(UncastedPred))))) - // If the predicate has the same or less lanes than the uncasted - // predicate then we know the casting has no effect. - if (cast(Pred->getType())->getMinNumElements() <= - cast(UncastedPred->getType())->getMinNumElements()) - Pred = UncastedPred; + m_Value(UncastedPred)))) { + auto *OrigPredTy = cast(Pred->getType()); + Pred = UncastedPred; + + if (match(Pred, m_Intrinsic( + m_Value(UncastedPred)))) + // If the predicate has the same or less lanes than the uncasted predicate + // then we know the casting has no effect. + if (OrigPredTy->getMinNumElements() <= + cast(UncastedPred->getType()) + ->getMinNumElements()) + Pred = UncastedPred; + } + auto *C = dyn_cast(Pred); - return (C && C->isAllOnesValue()); + return C && C->isAllOnesValue(); } // Simplify `V` by only considering the operations that affect active lanes. diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index d91923b41ddd3..56a38bb49b7e7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -1499,18 +1499,25 @@ static bool generateKernelClockInst(const SPIRV::IncomingCall *Call, Register ResultReg = Call->ReturnRegister; - // Deduce the `Scope` operand from the builtin function name. - SPIRV::Scope::Scope ScopeArg = - StringSwitch(Builtin->Name) - .EndsWith("device", SPIRV::Scope::Scope::Device) - .EndsWith("work_group", SPIRV::Scope::Scope::Workgroup) - .EndsWith("sub_group", SPIRV::Scope::Scope::Subgroup); - Register ScopeReg = buildConstantIntReg32(ScopeArg, MIRBuilder, GR); - - MIRBuilder.buildInstr(SPIRV::OpReadClockKHR) - .addDef(ResultReg) - .addUse(GR->getSPIRVTypeID(Call->ReturnType)) - .addUse(ScopeReg); + if (Builtin->Name == "__spirv_ReadClockKHR") { + MIRBuilder.buildInstr(SPIRV::OpReadClockKHR) + .addDef(ResultReg) + .addUse(GR->getSPIRVTypeID(Call->ReturnType)) + .addUse(Call->Arguments[0]); + } else { + // Deduce the `Scope` operand from the builtin function name. 
+ SPIRV::Scope::Scope ScopeArg = + StringSwitch(Builtin->Name) + .EndsWith("device", SPIRV::Scope::Scope::Device) + .EndsWith("work_group", SPIRV::Scope::Scope::Workgroup) + .EndsWith("sub_group", SPIRV::Scope::Scope::Subgroup); + Register ScopeReg = buildConstantIntReg32(ScopeArg, MIRBuilder, GR); + + MIRBuilder.buildInstr(SPIRV::OpReadClockKHR) + .addDef(ResultReg) + .addUse(GR->getSPIRVTypeID(Call->ReturnType)) + .addUse(ScopeReg); + } return true; } diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index 3b8764a6401c6..c259ccee359b4 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -1174,6 +1174,7 @@ defm : DemangledNativeBuiltin<"clock_read_sub_group", OpenCL_std, KernelClock, 0 defm : DemangledNativeBuiltin<"clock_read_hilo_device", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; defm : DemangledNativeBuiltin<"clock_read_hilo_work_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; defm : DemangledNativeBuiltin<"clock_read_hilo_sub_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>; +defm : DemangledNativeBuiltin<"__spirv_ReadClockKHR", OpenCL_std, KernelClock, 1, 1, OpReadClockKHR>; //===----------------------------------------------------------------------===// // Class defining an atomic instruction on floating-point numbers. diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index a0f7ec6d5fae3..6909a282b3129 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -1161,14 +1161,10 @@ bool DevirtIndex::tryFindVirtualCallTargets( // and therefore the same GUID. This can happen if there isn't enough // distinguishing path when compiling the source file. In that case we // conservatively return false early. + if (P.VTableVI.hasLocal() && P.VTableVI.getSummaryList().size() > 1) + return false; const GlobalVarSummary *VS = nullptr; - bool LocalFound = false; for (const auto &S : P.VTableVI.getSummaryList()) { - if (GlobalValue::isLocalLinkage(S->linkage())) { - if (LocalFound) - return false; - LocalFound = true; - } auto *CurVS = cast(S->getBaseObject()); if (!CurVS->vTableFuncs().empty() || // Previously clang did not attach the necessary type metadata to @@ -1184,6 +1180,7 @@ bool DevirtIndex::tryFindVirtualCallTargets( // with public LTO visibility. if (VS->getVCallVisibility() == GlobalObject::VCallVisibilityPublic) return false; + break; } } // There will be no VS if all copies are available_externally having no @@ -2591,6 +2588,11 @@ void DevirtIndex::run() { if (ExportSummary.typeIdCompatibleVtableMap().empty()) return; + // Assert that we haven't made any changes that would affect the hasLocal() + // flag on the GUID summary info. 
+ assert(!ExportSummary.withInternalizeAndPromote() && + "Expect index-based WPD to run before internalization and promotion"); + DenseMap> NameByGUID; for (const auto &P : ExportSummary.typeIdCompatibleVtableMap()) { NameByGUID[GlobalValue::getGUIDAssumingExternalLinkage(P.first)].push_back( diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 4acc3f2d84690..d347cedb42988 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -614,6 +614,16 @@ static Decomposition decompose(Value *V, return {V, IsKnownNonNegative}; } + if (match(V, m_Add(m_Value(Op0), m_ConstantInt(CI))) && CI->isNegative() && + canUseSExt(CI)) { + Preconditions.emplace_back( + CmpInst::ICMP_UGE, Op0, + ConstantInt::get(Op0->getType(), CI->getSExtValue() * -1)); + if (auto Decomp = MergeResults(Op0, CI, true)) + return *Decomp; + return {V, IsKnownNonNegative}; + } + if (match(V, m_NSWAdd(m_Value(Op0), m_Value(Op1)))) { if (!isKnownNonNegative(Op0, DL)) Preconditions.emplace_back(CmpInst::ICMP_SGE, Op0, @@ -627,16 +637,6 @@ static Decomposition decompose(Value *V, return {V, IsKnownNonNegative}; } - if (match(V, m_Add(m_Value(Op0), m_ConstantInt(CI))) && CI->isNegative() && - canUseSExt(CI)) { - Preconditions.emplace_back( - CmpInst::ICMP_UGE, Op0, - ConstantInt::get(Op0->getType(), CI->getSExtValue() * -1)); - if (auto Decomp = MergeResults(Op0, CI, true)) - return *Decomp; - return {V, IsKnownNonNegative}; - } - // Decompose or as an add if there are no common bits between the operands. if (match(V, m_DisjointOr(m_Value(Op0), m_ConstantInt(CI)))) { if (auto Decomp = MergeResults(Op0, CI, IsSigned)) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 06bea2fef87aa..a1ad2dbcba8be 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2712,7 +2712,8 @@ class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags { static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPReductionSC || - R->getVPDefID() == VPRecipeBase::VPReductionEVLSC; + R->getVPDefID() == VPRecipeBase::VPReductionEVLSC || + R->getVPDefID() == VPRecipeBase::VPPartialReductionSC; } static inline bool classof(const VPUser *U) { @@ -2783,7 +2784,10 @@ class VPPartialReductionRecipe : public VPReductionRecipe { Opcode(Opcode), VFScaleFactor(ScaleFactor) { [[maybe_unused]] auto *AccumulatorRecipe = getChainOp()->getDefiningRecipe(); - assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) || + // When cloning as part of a VPExpressionRecipe the chain op could have + // been replaced by a temporary VPValue, so it doesn't have a defining + // recipe. + assert((!AccumulatorRecipe + || isa<VPReductionPHIRecipe>(AccumulatorRecipe) || isa<VPPartialReductionRecipe>(AccumulatorRecipe)) && "Unexpected operand order for partial reduction recipe"); } @@ -3093,6 +3097,11 @@ class VPExpressionRecipe : public VPSingleDefRecipe { /// removed before codegen. void decompose(); + unsigned getVFScaleFactor() const { + auto *PR = dyn_cast<VPPartialReductionRecipe>(ExpressionRecipes.back()); + return PR ? PR->getVFScaleFactor() : 1; + } + /// Method for generating code, must not be called as this recipe is abstract.
void execute(VPTransformState &State) override { llvm_unreachable("recipe must be removed before execute"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1f1b42bb9c19f..931a5b7582c4e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -168,6 +168,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { return cast(this)->mayHaveSideEffects(); case VPBlendSC: case VPReductionEVLSC: + case VPPartialReductionSC: case VPReductionSC: case VPScalarIVStepsSC: case VPVectorPointerSC: @@ -300,14 +301,23 @@ InstructionCost VPPartialReductionRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { std::optional Opcode; - VPValue *Op = getOperand(0); - VPRecipeBase *OpR = Op->getDefiningRecipe(); - - // If the partial reduction is predicated, a select will be operand 0 - if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) { - OpR = Op->getDefiningRecipe(); + VPValue *Op = getVecOp(); + uint64_t MulConst; + // If the partial reduction is predicated, a select will be operand 1. + // If it isn't predicated and the mul isn't operating on a constant, then it + // should have been turned into a VPExpressionRecipe. + // FIXME: Replace the entire function with this once all partial reduction + // variants are bundled into VPExpressionRecipe. + if (!match(Op, m_Select(m_VPValue(), m_VPValue(Op), m_VPValue())) && + !match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) { + auto *PhiType = Ctx.Types.inferScalarType(getChainOp()); + auto *InputType = Ctx.Types.inferScalarType(getVecOp()); + return Ctx.TTI.getPartialReductionCost(getOpcode(), InputType, InputType, + PhiType, VF, TTI::PR_None, + TTI::PR_None, {}, Ctx.CostKind); } + VPRecipeBase *OpR = Op->getDefiningRecipe(); Type *InputTypeA = nullptr, *InputTypeB = nullptr; TTI::PartialReductionExtendKind ExtAType = TTI::PR_None, ExtBType = TTI::PR_None; @@ -2856,11 +2866,19 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, cast(ExpressionRecipes.back())->getRecurrenceKind()); switch (ExpressionType) { case ExpressionTypes::ExtendedReduction: { - return Ctx.TTI.getExtendedReductionCost( - Opcode, - cast(ExpressionRecipes.front())->getOpcode() == - Instruction::ZExt, - RedTy, SrcVecTy, std::nullopt, Ctx.CostKind); + unsigned Opcode = RecurrenceDescriptor::getOpcode( + cast(ExpressionRecipes[1])->getRecurrenceKind()); + auto *ExtR = cast(ExpressionRecipes[0]); + return isa(ExpressionRecipes.back()) + ? 
Ctx.TTI.getPartialReductionCost( + Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr, + RedTy, VF, + TargetTransformInfo::getPartialReductionExtendKind( + ExtR->getOpcode()), + TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind) + : Ctx.TTI.getExtendedReductionCost( + Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy, + SrcVecTy, std::nullopt, Ctx.CostKind); } case ExpressionTypes::MulAccReduction: return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy, @@ -2871,6 +2889,19 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, Opcode = Instruction::Sub; [[fallthrough]]; case ExpressionTypes::ExtMulAccReduction: { + if (isa(ExpressionRecipes.back())) { + auto *Ext0R = cast(ExpressionRecipes[0]); + auto *Ext1R = cast(ExpressionRecipes[1]); + auto *Mul = cast(ExpressionRecipes[2]); + return Ctx.TTI.getPartialReductionCost( + Opcode, Ctx.Types.inferScalarType(getOperand(0)), + Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF, + TargetTransformInfo::getPartialReductionExtendKind( + Ext0R->getOpcode()), + TargetTransformInfo::getPartialReductionExtendKind( + Ext1R->getOpcode()), + Mul->getOpcode(), Ctx.CostKind); + } return Ctx.TTI.getMulAccReductionCost( cast(ExpressionRecipes.front())->getOpcode() == Instruction::ZExt, @@ -2910,12 +2941,13 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, O << " = "; auto *Red = cast(ExpressionRecipes.back()); unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); + bool IsPartialReduction = isa(Red); switch (ExpressionType) { case ExpressionTypes::ExtendedReduction: { getOperand(1)->printAsOperand(O, SlotTracker); - O << " +"; - O << " reduce." << Instruction::getOpcodeName(Opcode) << " ("; + O << " + " << (IsPartialReduction ? "partial." : "") << "reduce."; + O << Instruction::getOpcodeName(Opcode) << " ("; getOperand(0)->printAsOperand(O, SlotTracker); Red->printFlags(O); @@ -2931,8 +2963,8 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, } case ExpressionTypes::ExtNegatedMulAccReduction: { getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); - O << " + reduce." - << Instruction::getOpcodeName( + O << " + " << (IsPartialReduction ? "partial." : "") << "reduce."; + O << Instruction::getOpcodeName( RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) << " (sub (0, mul"; auto *Mul = cast(ExpressionRecipes[2]); @@ -2956,9 +2988,8 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, case ExpressionTypes::MulAccReduction: case ExpressionTypes::ExtMulAccReduction: { getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker); - O << " + "; - O << "reduce." - << Instruction::getOpcodeName( + O << " + " << (IsPartialReduction ? "partial." : "") << "reduce."; + O << Instruction::getOpcodeName( RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind())) << " ("; O << "mul"; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index f5a3af4e983f6..3e85e6ff514ef 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -3519,18 +3519,31 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VPValue *VecOp = Red->getVecOp(); // Clamp the range if using extended-reduction is profitable. 
- auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt, - Type *SrcTy) -> bool { + auto IsExtendedRedValidAndClampRange = + [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool { return LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { auto *SrcVecTy = cast(toVectorTy(SrcTy, VF)); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost( - Opcode, isZExt, RedTy, SrcVecTy, Red->getFastMathFlags(), - CostKind); + + InstructionCost ExtRedCost; InstructionCost ExtCost = cast(VecOp)->computeCost(VF, Ctx); InstructionCost RedCost = Red->computeCost(VF, Ctx); + + if (isa(Red)) { + TargetTransformInfo::PartialReductionExtendKind ExtKind = + TargetTransformInfo::getPartialReductionExtendKind(ExtOpc); + // FIXME: Move partial reduction creation, costing and clamping + // here from LoopVectorize.cpp. + ExtRedCost = Ctx.TTI.getPartialReductionCost( + Opcode, SrcTy, nullptr, RedTy, VF, ExtKind, + llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind); + } else { + ExtRedCost = Ctx.TTI.getExtendedReductionCost( + Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy, + Red->getFastMathFlags(), CostKind); + } return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost; }, Range); @@ -3541,8 +3554,7 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) && IsExtendedRedValidAndClampRange( RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()), - cast(VecOp)->getOpcode() == - Instruction::CastOps::ZExt, + cast(VecOp)->getOpcode(), Ctx.Types.inferScalarType(A))) return new VPExpressionRecipe(cast(VecOp), Red); @@ -3560,6 +3572,8 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range) { + bool IsPartialReduction = isa(Red); + unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); if (Opcode != Instruction::Add && Opcode != Instruction::Sub) return nullptr; @@ -3568,16 +3582,41 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, // Clamp the range if using multiply-accumulate-reduction is profitable. auto IsMulAccValidAndClampRange = - [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, - VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool { + [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1, + VPWidenCastRecipe *OuterExt) -> bool { return LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; Type *SrcTy = Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy; - auto *SrcVecTy = cast(toVectorTy(SrcTy, VF)); - InstructionCost MulAccCost = Ctx.TTI.getMulAccReductionCost( - isZExt, Opcode, RedTy, SrcVecTy, CostKind); + InstructionCost MulAccCost; + + if (IsPartialReduction) { + Type *SrcTy2 = + Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr; + // FIXME: Move partial reduction creation, costing and clamping + // here from LoopVectorize.cpp. + MulAccCost = Ctx.TTI.getPartialReductionCost( + Opcode, SrcTy, SrcTy2, RedTy, VF, + Ext0 ? TargetTransformInfo::getPartialReductionExtendKind( + Ext0->getOpcode()) + : TargetTransformInfo::PR_None, + Ext1 ? 
TargetTransformInfo::getPartialReductionExtendKind( + Ext1->getOpcode()) + : TargetTransformInfo::PR_None, + Mul->getOpcode(), CostKind); + } else { + // Only partial reductions support mixed extends at the moment. + if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode()) + return false; + + bool IsZExt = + !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt; + auto *SrcVecTy = cast(toVectorTy(SrcTy, VF)); + MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy, + SrcVecTy, CostKind); + } + InstructionCost MulCost = Mul->computeCost(VF, Ctx); InstructionCost RedCost = Red->computeCost(VF, Ctx); InstructionCost ExtCost = 0; @@ -3611,14 +3650,10 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, dyn_cast_if_present(B->getDefiningRecipe()); auto *Mul = cast(VecOp->getDefiningRecipe()); - // Match reduce.add(mul(ext, ext)). - if (RecipeA && RecipeB && - (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) && - match(RecipeA, m_ZExtOrSExt(m_VPValue())) && + // Match reduce.add/sub(mul(ext, ext)). + if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) && match(RecipeB, m_ZExtOrSExt(m_VPValue())) && - IsMulAccValidAndClampRange(RecipeA->getOpcode() == - Instruction::CastOps::ZExt, - Mul, RecipeA, RecipeB, nullptr)) { + IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) { if (Sub) return new VPExpressionRecipe(RecipeA, RecipeB, Mul, cast(Sub), Red); @@ -3626,8 +3661,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, } // Match reduce.add(mul). // TODO: Add an expression type for this variant with a negated mul - if (!Sub && - IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr)) + if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr)) return new VPExpressionRecipe(Mul, Red); } // TODO: Add an expression type for negated versions of other expression @@ -3647,9 +3681,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, cast(Mul->getOperand(1)->getDefiningRecipe()); if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) && Ext0->getOpcode() == Ext1->getOpcode() && - IsMulAccValidAndClampRange(Ext0->getOpcode() == - Instruction::CastOps::ZExt, - Mul, Ext0, Ext1, Ext)) { + IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) { auto *NewExt0 = new VPWidenCastRecipe( Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0, *Ext0, Ext0->getDebugLoc()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 32e4b8872b1df..06c3d75e5708c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -151,6 +151,8 @@ unsigned vputils::getVFScaleFactor(VPRecipeBase *R) { return RR->getVFScaleFactor(); if (auto *RR = dyn_cast(R)) return RR->getVFScaleFactor(); + if (auto *ER = dyn_cast(R)) + return ER->getVFScaleFactor(); assert( (!isa(R) || cast(R)->getOpcode() != VPInstruction::ReductionStartVector) && diff --git a/llvm/test/Analysis/CostModel/ARM/fparith.ll b/llvm/test/Analysis/CostModel/ARM/fparith.ll index f9424e8b86557..6f2626cd20588 100644 --- a/llvm/test/Analysis/CostModel/ARM/fparith.ll +++ b/llvm/test/Analysis/CostModel/ARM/fparith.ll @@ -1,21 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE -; RUN: opt -passes="print" 2>&1 -disable-output 
-mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVEFP +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVEFP target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" define void @f32() { ; CHECK-MVE-LABEL: 'f32' -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = fadd float undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = fsub float undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = fmul float undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c = fadd float undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %d = fsub float undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %e = fmul float undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEFP-LABEL: 'f32' -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = fadd float undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = fsub float undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = fmul float undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c = fadd float undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %d = fsub float undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %e = fmul float undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %c = fadd float undef, undef %d = fsub float undef, undef @@ -25,16 +25,16 @@ define void @f32() { define void @f16() { ; CHECK-MVE-LABEL: 'f16' -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = fadd half undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = fsub half undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = fmul half undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c = fadd half undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %d = fsub half undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %e = fmul half undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEFP-LABEL: 'f16' -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = fadd half undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = fsub half undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = fmul half undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c = fadd half undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %d = fsub half undef, 
undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %e = fmul half undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %c = fadd half undef, undef %d = fsub half undef, undef @@ -44,16 +44,16 @@ define void @f16() { define void @f64() { ; CHECK-MVE-LABEL: 'f64' -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = fadd double undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = fsub double undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = fmul double undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %c = fadd double undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %d = fsub double undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 1 for: %e = fmul double undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEFP-LABEL: 'f64' -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = fadd double undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = fsub double undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = fmul double undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %c = fadd double undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %d = fsub double undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %e = fmul double undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %c = fadd double undef, undef %d = fsub double undef, undef @@ -63,28 +63,28 @@ define void @f64() { define void @vf32() { ; CHECK-MVE-LABEL: 'vf32' -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = fadd <2 x float> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = fsub <2 x float> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = fmul <2 x float> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c4 = fadd <4 x float> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d4 = fsub <4 x float> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e4 = fmul <4 x float> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c8 = fadd <8 x float> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d8 = fsub <8 x float> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e8 = fmul <8 x float> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %c2 = fadd <2 x float> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %d2 = fsub <2 x float> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %e2 = fmul <2 x float> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %c4 = fadd <4 x float> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %d4 = fsub 
<4 x float> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %e4 = fmul <4 x float> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %c8 = fadd <8 x float> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %d8 = fsub <8 x float> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %e8 = fmul <8 x float> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEFP-LABEL: 'vf32' -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c2 = fadd <2 x float> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d2 = fsub <2 x float> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e2 = fmul <2 x float> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c4 = fadd <4 x float> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d4 = fsub <4 x float> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e4 = fmul <4 x float> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c8 = fadd <8 x float> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d8 = fsub <8 x float> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e8 = fmul <8 x float> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c2 = fadd <2 x float> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d2 = fsub <2 x float> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e2 = fmul <2 x float> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c4 = fadd <4 x float> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d4 = fsub <4 x float> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e4 = fmul <4 x float> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %c8 = fadd <8 x float> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %d8 = fsub <8 x float> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %e8 = fmul <8 x float> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %c2 = fadd <2 x float> undef, undef %d2 = fsub <2 x float> undef, undef @@ -100,28 +100,28 @@ define void @vf32() { define void @vf16() { ; CHECK-MVE-LABEL: 'vf16' -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = fadd <2 x half> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = fsub <2 x half> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = fmul <2 x half> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c4 = fadd <4 x half> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d4 = fsub <4 x 
half> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e4 = fmul <4 x half> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c8 = fadd <8 x half> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d8 = fsub <8 x half> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e8 = fmul <8 x half> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %c2 = fadd <2 x half> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %d2 = fsub <2 x half> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %e2 = fmul <2 x half> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %c4 = fadd <4 x half> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %d4 = fsub <4 x half> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %e4 = fmul <4 x half> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %c8 = fadd <8 x half> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %d8 = fsub <8 x half> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %e8 = fmul <8 x half> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEFP-LABEL: 'vf16' -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c2 = fadd <2 x half> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d2 = fsub <2 x half> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e2 = fmul <2 x half> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c4 = fadd <4 x half> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d4 = fsub <4 x half> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e4 = fmul <4 x half> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %c8 = fadd <8 x half> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %d8 = fsub <8 x half> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e8 = fmul <8 x half> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c2 = fadd <2 x half> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d2 = fsub <2 x half> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e2 = fmul <2 x half> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c4 = fadd <4 x half> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d4 = fsub <4 x half> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e4 = fmul <4 x half> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %c8 = fadd <8 x half> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %d8 = fsub <8 x 
half> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e8 = fmul <8 x half> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %c2 = fadd <2 x half> undef, undef %d2 = fsub <2 x half> undef, undef @@ -137,28 +137,28 @@ define void @vf16() { define void @vf64() { ; CHECK-MVE-LABEL: 'vf64' -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = fadd <2 x double> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = fsub <2 x double> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = fmul <2 x double> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c4 = fadd <4 x double> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d4 = fsub <4 x double> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e4 = fmul <4 x double> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c8 = fadd <8 x double> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d8 = fsub <8 x double> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e8 = fmul <8 x double> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %c2 = fadd <2 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %d2 = fsub <2 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 4 for: %e2 = fmul <2 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %c4 = fadd <4 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %d4 = fsub <4 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 8 for: %e4 = fmul <4 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %c8 = fadd <8 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %d8 = fsub <8 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %e8 = fmul <8 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEFP-LABEL: 'vf64' -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = fadd <2 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = fsub <2 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = fmul <2 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c4 = fadd <4 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d4 = fsub <4 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e4 = fmul <4 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c8 = fadd <8 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d8 = fsub <8 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e8 = fmul <8 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated 
cost of 0 for instruction: ret void +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 4 for: %c2 = fadd <2 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 4 for: %d2 = fsub <2 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 4 for: %e2 = fmul <2 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 8 for: %c4 = fadd <4 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 8 for: %d4 = fsub <4 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 8 for: %e4 = fmul <4 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 16 for: %c8 = fadd <8 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 16 for: %d8 = fsub <8 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 16 for: %e8 = fmul <8 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %c2 = fadd <2 x double> undef, undef %d2 = fsub <2 x double> undef, undef diff --git a/llvm/test/Analysis/CostModel/ARM/fptoi_sat.ll b/llvm/test/Analysis/CostModel/ARM/fptoi_sat.ll index aff7b19a9c87a..af548f6216370 100644 --- a/llvm/test/Analysis/CostModel/ARM/fptoi_sat.ll +++ b/llvm/test/Analysis/CostModel/ARM/fptoi_sat.ll @@ -1,213 +1,213 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp,+fp64 < %s | FileCheck %s --check-prefix=CHECK-MVEFP +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp,+fp64 < %s | FileCheck %s --check-prefix=CHECK-MVEFP define void @casts() { ; CHECK-MVE-LABEL: 'casts' -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f32s1 = call i1 @llvm.fptosi.sat.i1.f32(float undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f32u1 = call i1 @llvm.fptoui.sat.i1.f32(float undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f32s8 = call i8 @llvm.fptosi.sat.i8.f32(float undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f32u8 = call i8 @llvm.fptoui.sat.i8.f32(float undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f32s16 = call i16 @llvm.fptosi.sat.i16.f32(float undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f32u16 = call i16 @llvm.fptoui.sat.i16.f32(float undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f32s32 = call i32 @llvm.fptosi.sat.i32.f32(float undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f32u32 = call i32 @llvm.fptoui.sat.i32.f32(float undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %f32s64 = call i64 @llvm.fptosi.sat.i64.f32(float undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f32u64 = call i64 @llvm.fptoui.sat.i64.f32(float undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f64s1 = call i1 @llvm.fptosi.sat.i1.f64(double undef) -; CHECK-MVE-NEXT: 
Cost Model: Found an estimated cost of 21 for instruction: %f64u1 = call i1 @llvm.fptoui.sat.i1.f64(double undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f64s8 = call i8 @llvm.fptosi.sat.i8.f64(double undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f64u8 = call i8 @llvm.fptoui.sat.i8.f64(double undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f64s16 = call i16 @llvm.fptosi.sat.i16.f64(double undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f64u16 = call i16 @llvm.fptoui.sat.i16.f64(double undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f64s32 = call i32 @llvm.fptosi.sat.i32.f64(double undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f64u32 = call i32 @llvm.fptoui.sat.i32.f64(double undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %f64s64 = call i64 @llvm.fptosi.sat.i64.f64(double undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f64u64 = call i64 @llvm.fptoui.sat.i64.f64(double undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v2f32s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f32(<2 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f32u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f32(<2 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f32s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f32(<2 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f32u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f32(<2 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f32s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f32(<2 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f32u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f32s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f32u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v2f64s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f64u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f64s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f64u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f64s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f64(<2 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for 
instruction: %v2f64u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f64(<2 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f64s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f64u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v2f64s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v2f64u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %v4f32s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f32u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f32(<4 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v4f32s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f32u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v4f32s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f32(<4 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f32u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v4f32s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f32u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 406 for instruction: %v4f32s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 378 for instruction: %v4f32u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 314 for instruction: %v4f64s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f64(<4 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 250 for instruction: %v4f64u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f64(<4 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %v4f64s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f64(<4 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 250 for instruction: %v4f64u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f64(<4 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %v4f64s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f64(<4 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 250 for instruction: %v4f64u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f64(<4 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %v4f64s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 250 for instruction: %v4f64u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 404 for instruction: %v4f64s64 = 
call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %v4f64u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 626 for instruction: %v8f32s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f32(<8 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 498 for instruction: %v8f32u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f32(<8 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 548 for instruction: %v8f32s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f32(<8 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 498 for instruction: %v8f32u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f32(<8 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 548 for instruction: %v8f32s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f32(<8 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 498 for instruction: %v8f32u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f32(<8 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 548 for instruction: %v8f32s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 496 for instruction: %v8f32u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1360 for instruction: %v8f32s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1304 for instruction: %v8f32u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 922 for instruction: %v8f64s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f64(<8 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 794 for instruction: %v8f64u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f64(<8 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 844 for instruction: %v8f64s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 794 for instruction: %v8f64u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f64(<8 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 844 for instruction: %v8f64s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f64(<8 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 794 for instruction: %v8f64u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f64(<8 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 844 for instruction: %v8f64s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 792 for instruction: %v8f64u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1352 for instruction: %v8f64s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1296 for instruction: %v8f64u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1834 for instruction: %v16f32s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f32(<16 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1578 for instruction: %v16f32u1 = call <16 x i1> 
@llvm.fptoui.sat.v16i1.v16f32(<16 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1676 for instruction: %v16f32s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f32(<16 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1578 for instruction: %v16f32u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f32(<16 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1676 for instruction: %v16f32s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f32(<16 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1576 for instruction: %v16f32u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f32(<16 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1672 for instruction: %v16f32s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1568 for instruction: %v16f32u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4912 for instruction: %v16f32s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f32(<16 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4800 for instruction: %v16f32u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f32(<16 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 3018 for instruction: %v16f64s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f64(<16 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2762 for instruction: %v16f64u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f64(<16 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2860 for instruction: %v16f64s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f64(<16 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2762 for instruction: %v16f64u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f64(<16 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2860 for instruction: %v16f64s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f64(<16 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2760 for instruction: %v16f64u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f64(<16 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2856 for instruction: %v16f64s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f64(<16 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2752 for instruction: %v16f64u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f64(<16 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4880 for instruction: %v16f64s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f64(<16 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4768 for instruction: %v16f64u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f64(<16 x double> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:7 Lat:23 SizeLat:23 for: %f32s1 = call i1 @llvm.fptosi.sat.i1.f32(float undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f32u1 = call i1 @llvm.fptoui.sat.i1.f32(float undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f32s8 = call i8 @llvm.fptosi.sat.i8.f32(float undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f32u8 = call i8 
@llvm.fptoui.sat.i8.f32(float undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f32s16 = call i16 @llvm.fptosi.sat.i16.f32(float undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f32u16 = call i16 @llvm.fptoui.sat.i16.f32(float undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f32s32 = call i32 @llvm.fptosi.sat.i32.f32(float undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f32u32 = call i32 @llvm.fptoui.sat.i32.f32(float undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:7 Lat:24 SizeLat:24 for: %f32s64 = call i64 @llvm.fptosi.sat.i64.f32(float undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f32u64 = call i64 @llvm.fptoui.sat.i64.f32(float undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:7 Lat:23 SizeLat:23 for: %f64s1 = call i1 @llvm.fptosi.sat.i1.f64(double undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f64u1 = call i1 @llvm.fptoui.sat.i1.f64(double undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f64s8 = call i8 @llvm.fptosi.sat.i8.f64(double undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f64u8 = call i8 @llvm.fptoui.sat.i8.f64(double undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f64s16 = call i16 @llvm.fptosi.sat.i16.f64(double undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f64u16 = call i16 @llvm.fptoui.sat.i16.f64(double undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f64s32 = call i32 @llvm.fptosi.sat.i32.f64(double undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f64u32 = call i32 @llvm.fptoui.sat.i32.f64(double undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:7 Lat:24 SizeLat:24 for: %f64s64 = call i64 @llvm.fptosi.sat.i64.f64(double undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f64u64 = call i64 @llvm.fptoui.sat.i64.f64(double undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:120 CodeSize:43 Lat:85 SizeLat:85 for: %v2f32s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f32(<2 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f32u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f32(<2 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f32s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f32(<2 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f32u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f32(<2 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f32s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f32(<2 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f32u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f32s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found 
costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f32u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:134 CodeSize:30 Lat:67 SizeLat:67 for: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:120 CodeSize:17 Lat:53 SizeLat:53 for: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:120 CodeSize:43 Lat:85 SizeLat:85 for: %v2f64s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f64u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f64s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f64u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f64s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f64(<2 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f64u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f64(<2 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f64s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f64u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:134 CodeSize:30 Lat:67 SizeLat:67 for: %v2f64s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:120 CodeSize:17 Lat:53 SizeLat:53 for: %v2f64u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:240 CodeSize:85 Lat:169 SizeLat:169 for: %v4f32s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f32u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f32(<4 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:202 CodeSize:58 Lat:131 SizeLat:131 for: %v4f32s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f32u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:202 CodeSize:58 Lat:131 SizeLat:131 for: %v4f32s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f32(<4 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f32u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:202 CodeSize:58 Lat:131 SizeLat:131 for: %v4f32s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f32u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:406 CodeSize:59 Lat:133 SizeLat:133 for: %v4f32s64 = 
call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:378 CodeSize:33 Lat:105 SizeLat:105 for: %v4f32u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:314 CodeSize:85 Lat:169 SizeLat:169 for: %v4f64s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f64(<4 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:250 CodeSize:33 Lat:105 SizeLat:105 for: %v4f64u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f64(<4 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:276 CodeSize:58 Lat:131 SizeLat:131 for: %v4f64s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f64(<4 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:250 CodeSize:33 Lat:105 SizeLat:105 for: %v4f64u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f64(<4 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:276 CodeSize:58 Lat:131 SizeLat:131 for: %v4f64s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f64(<4 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:250 CodeSize:33 Lat:105 SizeLat:105 for: %v4f64u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f64(<4 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:276 CodeSize:58 Lat:131 SizeLat:131 for: %v4f64s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:250 CodeSize:33 Lat:105 SizeLat:105 for: %v4f64u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:404 CodeSize:59 Lat:133 SizeLat:133 for: %v4f64s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:376 CodeSize:33 Lat:105 SizeLat:105 for: %v4f64u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:626 CodeSize:169 Lat:337 SizeLat:337 for: %v8f32s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f32(<8 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:498 CodeSize:65 Lat:209 SizeLat:209 for: %v8f32u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f32(<8 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:548 CodeSize:114 Lat:259 SizeLat:259 for: %v8f32s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f32(<8 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:498 CodeSize:65 Lat:209 SizeLat:209 for: %v8f32u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f32(<8 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:548 CodeSize:114 Lat:259 SizeLat:259 for: %v8f32s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f32(<8 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:498 CodeSize:65 Lat:209 SizeLat:209 for: %v8f32u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f32(<8 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:548 CodeSize:115 Lat:261 SizeLat:261 for: %v8f32s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:496 CodeSize:65 Lat:209 SizeLat:209 for: %v8f32u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1360 CodeSize:117 Lat:265 SizeLat:265 for: %v8f32s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1304 CodeSize:65 Lat:209 SizeLat:209 for: %v8f32u64 = call <8 x i64> 
@llvm.fptoui.sat.v8i64.v8f32(<8 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:922 CodeSize:169 Lat:337 SizeLat:337 for: %v8f64s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f64(<8 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:794 CodeSize:65 Lat:209 SizeLat:209 for: %v8f64u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f64(<8 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:844 CodeSize:114 Lat:259 SizeLat:259 for: %v8f64s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:794 CodeSize:65 Lat:209 SizeLat:209 for: %v8f64u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f64(<8 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:844 CodeSize:114 Lat:259 SizeLat:259 for: %v8f64s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f64(<8 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:794 CodeSize:65 Lat:209 SizeLat:209 for: %v8f64u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f64(<8 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:844 CodeSize:115 Lat:261 SizeLat:261 for: %v8f64s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:792 CodeSize:65 Lat:209 SizeLat:209 for: %v8f64u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1352 CodeSize:117 Lat:265 SizeLat:265 for: %v8f64s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1296 CodeSize:65 Lat:209 SizeLat:209 for: %v8f64u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1834 CodeSize:337 Lat:673 SizeLat:673 for: %v16f32s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f32(<16 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1578 CodeSize:129 Lat:417 SizeLat:417 for: %v16f32u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f32(<16 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1676 CodeSize:226 Lat:515 SizeLat:515 for: %v16f32s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f32(<16 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1578 CodeSize:129 Lat:417 SizeLat:417 for: %v16f32u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f32(<16 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1676 CodeSize:227 Lat:517 SizeLat:517 for: %v16f32s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f32(<16 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1576 CodeSize:129 Lat:417 SizeLat:417 for: %v16f32u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f32(<16 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1672 CodeSize:229 Lat:521 SizeLat:521 for: %v16f32s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1568 CodeSize:129 Lat:417 SizeLat:417 for: %v16f32u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4912 CodeSize:233 Lat:529 SizeLat:529 for: %v16f32s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f32(<16 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4800 CodeSize:129 Lat:417 SizeLat:417 for: %v16f32u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f32(<16 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:3018 CodeSize:337 Lat:673 
SizeLat:673 for: %v16f64s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f64(<16 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2762 CodeSize:129 Lat:417 SizeLat:417 for: %v16f64u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f64(<16 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2860 CodeSize:226 Lat:515 SizeLat:515 for: %v16f64s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f64(<16 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2762 CodeSize:129 Lat:417 SizeLat:417 for: %v16f64u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f64(<16 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2860 CodeSize:227 Lat:517 SizeLat:517 for: %v16f64s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f64(<16 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2760 CodeSize:129 Lat:417 SizeLat:417 for: %v16f64u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f64(<16 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2856 CodeSize:229 Lat:521 SizeLat:521 for: %v16f64s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f64(<16 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2752 CodeSize:129 Lat:417 SizeLat:417 for: %v16f64u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f64(<16 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4880 CodeSize:233 Lat:529 SizeLat:529 for: %v16f64s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f64(<16 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4768 CodeSize:129 Lat:417 SizeLat:417 for: %v16f64u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f64(<16 x double> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEFP-LABEL: 'casts' -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f32s1 = call i1 @llvm.fptosi.sat.i1.f32(float undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f32u1 = call i1 @llvm.fptoui.sat.i1.f32(float undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f32s8 = call i8 @llvm.fptosi.sat.i8.f32(float undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f32u8 = call i8 @llvm.fptoui.sat.i8.f32(float undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f32s16 = call i16 @llvm.fptosi.sat.i16.f32(float undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f32u16 = call i16 @llvm.fptoui.sat.i16.f32(float undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32s32 = call i32 @llvm.fptosi.sat.i32.f32(float undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32u32 = call i32 @llvm.fptoui.sat.i32.f32(float undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f32s64 = call i64 @llvm.fptosi.sat.i64.f32(float undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f32u64 = call i64 @llvm.fptoui.sat.i64.f32(float undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64s1 = call i1 @llvm.fptosi.sat.i1.f64(double undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64u1 = call i1 @llvm.fptoui.sat.i1.f64(double undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64s8 = call i8 @llvm.fptosi.sat.i8.f64(double undef) -; 
CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64u8 = call i8 @llvm.fptoui.sat.i8.f64(double undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64s16 = call i16 @llvm.fptosi.sat.i16.f64(double undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64u16 = call i16 @llvm.fptoui.sat.i16.f64(double undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64s32 = call i32 @llvm.fptosi.sat.i32.f64(double undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64u32 = call i32 @llvm.fptoui.sat.i32.f64(double undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64s64 = call i64 @llvm.fptosi.sat.i64.f64(double undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f64u64 = call i64 @llvm.fptoui.sat.i64.f64(double undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f32(<2 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f32(<2 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f32(<2 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f32(<2 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f32(<2 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v2f64s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v2f64u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v2f64s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v2f64u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v2f64s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f64(<2 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v2f64u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f64(<2 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v2f64s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x 
double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v2f64u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %v2f64s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %v2f64u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f32(<4 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f32(<4 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f32u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 284 for instruction: %v4f32s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 278 for instruction: %v4f32u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 234 for instruction: %v4f64s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f64(<4 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %v4f64u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f64(<4 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %v4f64s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f64(<4 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %v4f64u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f64(<4 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %v4f64s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f64(<4 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %v4f64u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f64(<4 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %v4f64s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %v4f64u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %v4f64s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 304 for instruction: %v4f64u64 = call <4 x i64> 
@llvm.fptoui.sat.v4i64.v4f64(<4 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f32s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f32(<8 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f32u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f32(<8 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f32s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f32(<8 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f32u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f32(<8 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f32s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f32(<8 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f32u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f32(<8 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1148 for instruction: %v8f32s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1104 for instruction: %v8f32u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 762 for instruction: %v8f64s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f64(<8 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 650 for instruction: %v8f64u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f64(<8 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 684 for instruction: %v8f64s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 650 for instruction: %v8f64u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f64(<8 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 684 for instruction: %v8f64s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f64(<8 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 650 for instruction: %v8f64u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f64(<8 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 684 for instruction: %v8f64s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 648 for instruction: %v8f64u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1192 for instruction: %v8f64s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1152 for instruction: %v8f64u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f32(<16 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f32(<16 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32s8 = 
call <16 x i8> @llvm.fptosi.sat.v16i8.v16f32(<16 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f32(<16 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f32(<16 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f32(<16 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f32s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f32u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4488 for instruction: %v16f32s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f32(<16 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4400 for instruction: %v16f32u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f32(<16 x float> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2698 for instruction: %v16f64s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f64(<16 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2474 for instruction: %v16f64u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f64(<16 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2540 for instruction: %v16f64s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f64(<16 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2474 for instruction: %v16f64u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f64(<16 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2540 for instruction: %v16f64s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f64(<16 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2472 for instruction: %v16f64u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f64(<16 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2536 for instruction: %v16f64s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f64(<16 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2464 for instruction: %v16f64u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f64(<16 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4560 for instruction: %v16f64s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f64(<16 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4480 for instruction: %v16f64u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f64(<16 x double> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f32s1 = call i1 @llvm.fptosi.sat.i1.f32(float undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f32u1 = call i1 @llvm.fptoui.sat.i1.f32(float undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f32s8 = call i8 @llvm.fptosi.sat.i8.f32(float undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f32u8 = call i8 @llvm.fptoui.sat.i8.f32(float undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f32s16 
= call i16 @llvm.fptosi.sat.i16.f32(float undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f32u16 = call i16 @llvm.fptoui.sat.i16.f32(float undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %f32s32 = call i32 @llvm.fptosi.sat.i32.f32(float undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %f32u32 = call i32 @llvm.fptoui.sat.i32.f32(float undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %f32s64 = call i64 @llvm.fptosi.sat.i64.f32(float undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 3 for: %f32u64 = call i64 @llvm.fptoui.sat.i64.f32(float undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64s1 = call i1 @llvm.fptosi.sat.i1.f64(double undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64u1 = call i1 @llvm.fptoui.sat.i1.f64(double undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64s8 = call i8 @llvm.fptosi.sat.i8.f64(double undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64u8 = call i8 @llvm.fptoui.sat.i8.f64(double undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64s16 = call i16 @llvm.fptosi.sat.i16.f64(double undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64u16 = call i16 @llvm.fptoui.sat.i16.f64(double undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %f64s32 = call i32 @llvm.fptosi.sat.i32.f64(double undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %f64u32 = call i32 @llvm.fptoui.sat.i32.f64(double undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64s64 = call i64 @llvm.fptosi.sat.i64.f64(double undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:7 SizeLat:7 for: %f64u64 = call i64 @llvm.fptoui.sat.i64.f64(double undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f32s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f32(<2 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f32u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f32(<2 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f32s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f32(<2 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f32u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f32(<2 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f32s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f32(<2 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f32u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f32s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f32u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:76 CodeSize:5 Lat:9 SizeLat:9 for: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found 
costs of RThru:72 CodeSize:3 Lat:5 SizeLat:5 for: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:80 CodeSize:35 Lat:45 SizeLat:45 for: %v2f64s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:52 CodeSize:17 Lat:17 SizeLat:17 for: %v2f64u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:62 CodeSize:22 Lat:27 SizeLat:27 for: %v2f64s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:52 CodeSize:17 Lat:17 SizeLat:17 for: %v2f64u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:62 CodeSize:22 Lat:27 SizeLat:27 for: %v2f64s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f64(<2 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:52 CodeSize:17 Lat:17 SizeLat:17 for: %v2f64u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f64(<2 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:62 CodeSize:22 Lat:27 SizeLat:27 for: %v2f64s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:52 CodeSize:17 Lat:17 SizeLat:17 for: %v2f64u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:94 CodeSize:22 Lat:27 SizeLat:27 for: %v2f64s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:84 CodeSize:17 Lat:17 SizeLat:17 for: %v2f64u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f32s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f32u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f32(<4 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f32s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f32u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f32s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f32(<4 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f32u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f32s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f32u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:284 CodeSize:6 Lat:11 SizeLat:11 for: %v4f32s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:278 CodeSize:3 Lat:5 SizeLat:5 for: %v4f32u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:234 CodeSize:69 Lat:89 SizeLat:89 for: %v4f64s1 = call <4 x i1> 
@llvm.fptosi.sat.v4i1.v4f64(<4 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:178 CodeSize:33 Lat:33 SizeLat:33 for: %v4f64u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f64(<4 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:196 CodeSize:42 Lat:51 SizeLat:51 for: %v4f64s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f64(<4 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:178 CodeSize:33 Lat:33 SizeLat:33 for: %v4f64u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f64(<4 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:196 CodeSize:42 Lat:51 SizeLat:51 for: %v4f64s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f64(<4 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:178 CodeSize:33 Lat:33 SizeLat:33 for: %v4f64u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f64(<4 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:196 CodeSize:42 Lat:51 SizeLat:51 for: %v4f64s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f64(<4 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:178 CodeSize:33 Lat:33 SizeLat:33 for: %v4f64u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f64(<4 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:324 CodeSize:43 Lat:53 SizeLat:53 for: %v4f64s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:304 CodeSize:33 Lat:33 SizeLat:33 for: %v4f64u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f64(<4 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v8f32s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f32(<8 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v8f32u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f32(<8 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v8f32s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f32(<8 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v8f32u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f32(<8 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v8f32s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f32(<8 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v8f32u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f32(<8 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v8f32s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v8f32u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:1148 CodeSize:43 Lat:53 SizeLat:53 for: %v8f32s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:1104 CodeSize:5 Lat:9 SizeLat:9 for: %v8f32u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:762 CodeSize:137 Lat:177 SizeLat:177 for: %v8f64s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f64(<8 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:650 CodeSize:65 Lat:65 SizeLat:65 for: %v8f64u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f64(<8 x double> undef) +; 
CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:684 CodeSize:82 Lat:99 SizeLat:99 for: %v8f64s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:650 CodeSize:65 Lat:65 SizeLat:65 for: %v8f64u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f64(<8 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:684 CodeSize:82 Lat:99 SizeLat:99 for: %v8f64s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f64(<8 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:650 CodeSize:65 Lat:65 SizeLat:65 for: %v8f64u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f64(<8 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:684 CodeSize:83 Lat:101 SizeLat:101 for: %v8f64s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:648 CodeSize:65 Lat:65 SizeLat:65 for: %v8f64u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:1192 CodeSize:85 Lat:105 SizeLat:105 for: %v8f64s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:1152 CodeSize:65 Lat:65 SizeLat:65 for: %v8f64u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:24 CodeSize:28 Lat:24 SizeLat:24 for: %v16f32s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f32(<16 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:24 CodeSize:28 Lat:24 SizeLat:24 for: %v16f32u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f32(<16 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:24 CodeSize:28 Lat:24 SizeLat:24 for: %v16f32s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f32(<16 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:24 CodeSize:28 Lat:24 SizeLat:24 for: %v16f32u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f32(<16 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:24 CodeSize:28 Lat:24 SizeLat:24 for: %v16f32s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f32(<16 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:24 CodeSize:28 Lat:24 SizeLat:24 for: %v16f32u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f32(<16 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v16f32s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v16f32u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4488 CodeSize:85 Lat:105 SizeLat:105 for: %v16f32s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f32(<16 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4400 CodeSize:9 Lat:17 SizeLat:17 for: %v16f32u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f32(<16 x float> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2698 CodeSize:273 Lat:353 SizeLat:353 for: %v16f64s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f64(<16 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2474 CodeSize:129 Lat:129 SizeLat:129 for: %v16f64u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f64(<16 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2540 CodeSize:162 Lat:195 SizeLat:195 for: %v16f64s8 = call <16 x i8> 
@llvm.fptosi.sat.v16i8.v16f64(<16 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2474 CodeSize:129 Lat:129 SizeLat:129 for: %v16f64u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f64(<16 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2540 CodeSize:163 Lat:197 SizeLat:197 for: %v16f64s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f64(<16 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2472 CodeSize:129 Lat:129 SizeLat:129 for: %v16f64u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f64(<16 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2536 CodeSize:165 Lat:201 SizeLat:201 for: %v16f64s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f64(<16 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2464 CodeSize:129 Lat:129 SizeLat:129 for: %v16f64u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f64(<16 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4560 CodeSize:169 Lat:209 SizeLat:209 for: %v16f64s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f64(<16 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4480 CodeSize:129 Lat:129 SizeLat:129 for: %v16f64u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f64(<16 x double> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %f32s1 = call i1 @llvm.fptosi.sat.i1.f32(float undef) %f32u1 = call i1 @llvm.fptoui.sat.i1.f32(float undef) @@ -324,110 +324,110 @@ define void @casts() { define void @fp16() { ; CHECK-MVE-LABEL: 'fp16' -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef) -; CHECK-MVE-NEXT: Cost Model: 
Found an estimated cost of 102 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 406 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 378 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 402 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 402 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 550 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 498 for instruction: %v8f16u32 = 
call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1362 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1306 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1250 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 994 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1092 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 994 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1092 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 992 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1680 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1576 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4920 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4808 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:7 Lat:23 SizeLat:23 for: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:6 Lat:23 SizeLat:23 for: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:24 CodeSize:7 Lat:24 SizeLat:24 for: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:3 Lat:21 SizeLat:21 for: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:120 CodeSize:43 Lat:85 SizeLat:85 
for: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:102 CodeSize:30 Lat:67 SizeLat:67 for: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:88 CodeSize:17 Lat:53 SizeLat:53 for: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:134 CodeSize:30 Lat:67 SizeLat:67 for: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:120 CodeSize:17 Lat:53 SizeLat:53 for: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:240 CodeSize:85 Lat:169 SizeLat:169 for: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:202 CodeSize:58 Lat:131 SizeLat:131 for: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:202 CodeSize:58 Lat:131 SizeLat:131 for: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:202 CodeSize:58 Lat:131 SizeLat:131 for: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:176 CodeSize:33 Lat:105 SizeLat:105 for: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:406 CodeSize:59 Lat:133 SizeLat:133 for: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:378 CodeSize:33 Lat:105 SizeLat:105 for: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:480 CodeSize:169 Lat:337 SizeLat:337 for: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:352 CodeSize:65 Lat:209 SizeLat:209 for: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) +; CHECK-MVE-NEXT: 
Cost Model: Found costs of RThru:402 CodeSize:114 Lat:259 SizeLat:259 for: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:352 CodeSize:65 Lat:209 SizeLat:209 for: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:402 CodeSize:114 Lat:259 SizeLat:259 for: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:352 CodeSize:65 Lat:209 SizeLat:209 for: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:550 CodeSize:115 Lat:261 SizeLat:261 for: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:498 CodeSize:65 Lat:209 SizeLat:209 for: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1362 CodeSize:117 Lat:265 SizeLat:265 for: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1306 CodeSize:65 Lat:209 SizeLat:209 for: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1250 CodeSize:337 Lat:673 SizeLat:673 for: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:994 CodeSize:129 Lat:417 SizeLat:417 for: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1092 CodeSize:226 Lat:515 SizeLat:515 for: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:994 CodeSize:129 Lat:417 SizeLat:417 for: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1092 CodeSize:227 Lat:517 SizeLat:517 for: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:992 CodeSize:129 Lat:417 SizeLat:417 for: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1680 CodeSize:229 Lat:521 SizeLat:521 for: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:1576 CodeSize:129 Lat:417 SizeLat:417 for: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4920 CodeSize:233 Lat:529 SizeLat:529 for: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4808 CodeSize:129 Lat:417 SizeLat:417 for: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-MVEFP-LABEL: 'fp16' -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 
@llvm.fptosi.sat.i8.f16(half undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> 
undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 284 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 278 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1112 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1102 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4484 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4400 for instruction: %v16f16u64 = call <16 x i64> 
@llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef) -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:5 SizeLat:5 for: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 1 for: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of 3 for: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:44 CodeSize:5 Lat:9 SizeLat:9 for: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:40 CodeSize:3 Lat:5 SizeLat:5 for: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:76 CodeSize:5 Lat:9 SizeLat:9 for: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:72 CodeSize:3 Lat:5 SizeLat:5 for: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> 
undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:10 CodeSize:5 Lat:9 SizeLat:9 for: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:5 SizeLat:5 for: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:284 CodeSize:6 Lat:11 SizeLat:11 for: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:278 CodeSize:3 Lat:5 SizeLat:5 for: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:6 CodeSize:7 Lat:6 SizeLat:6 for: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:20 CodeSize:6 Lat:11 SizeLat:11 for: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:14 CodeSize:3 Lat:5 SizeLat:5 for: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:1112 CodeSize:8 Lat:15 SizeLat:15 for: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:1102 CodeSize:3 Lat:5 SizeLat:5 for: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:12 CodeSize:14 Lat:12 SizeLat:12 for: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef) +; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v16f16s16 = call <16 x i16> 
@llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:124 CodeSize:75 Lat:85 SizeLat:85 for: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:48 CodeSize:5 Lat:9 SizeLat:9 for: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4484 CodeSize:79 Lat:93 SizeLat:93 for: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:4400 CodeSize:5 Lat:9 SizeLat:9 for: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
 %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
 %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
diff --git a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
deleted file mode 100644
index 6377437c8ca4b..0000000000000
--- a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
+++ /dev/null
@@ -1,371 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -mtriple=armv8.1m.main -mattr=+mve.fp -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput < %s | FileCheck %s --check-prefix=THRU
-; RUN: opt -mtriple=armv8.1m.main -mattr=+mve.fp -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency < %s | FileCheck %s --check-prefix=LATE
-; RUN: opt -mtriple=armv8.1m.main -mattr=+mve.fp -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size < %s | FileCheck %s --check-prefix=SIZE
-; RUN: opt -mtriple=armv8.1m.main -mattr=+mve.fp -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency < %s | FileCheck %s --check-prefix=SIZE_LATE
-
-target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
-
-; Test a cross-section of intrinsics for various cost-kinds.
-; Other test files may check for accuracy of a particular intrinsic
-; across subtargets or types. This is just a basic correctness check using an
-; ARM target and a legal scalar type (i32/float) and/or an
-; illegal vector type (16 x i32/float).
- -declare i32 @llvm.smax.i32(i32, i32) -declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>) - -declare float @llvm.fmuladd.f32(float, float, float) -declare <16 x float> @llvm.fmuladd.v16f32(<16 x float>, <16 x float>, <16 x float>) - -declare float @llvm.log2.f32(float) -declare <16 x float> @llvm.log2.v16f32(<16 x float>) - -declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata) -declare <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float>, <16 x float>, metadata, metadata) - -declare float @llvm.maximum.f32(float, float) -declare <16 x float> @llvm.maximum.v16f32(<16 x float>, <16 x float>) - -declare i32 @llvm.cttz.i32(i32, i1) -declare <16 x i32> @llvm.cttz.v16i32(<16 x i32>, i1) - -declare i32 @llvm.ctlz.i32(i32, i1) -declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1) - -declare i32 @llvm.fshl.i32(i32, i32, i32) -declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>) - -declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>) -declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>) -declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>) - -declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1) - -declare i32 @llvm.ssa.copy.i32(i32) -declare float @llvm.ssa.copy.f32(float) -declare ptr @llvm.ssa.copy.p0(ptr) - -define void @smax(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) { -; THRU-LABEL: 'smax' -; THRU-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b) -; THRU-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb) -; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; LATE-LABEL: 'smax' -; LATE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b) -; LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb) -; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE-LABEL: 'smax' -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b) -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb) -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE_LATE-LABEL: 'smax' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; - %s = call i32 @llvm.smax.i32(i32 %a, i32 %b) - %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb) - ret void -} - -define void @fmuladd(float %a, float %b, float %c, <16 x float> %va, <16 x float> %vb, <16 x float> %vc) { -; THRU-LABEL: 'fmuladd' -; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c) -; THRU-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc) -; THRU-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: ret void -; -; LATE-LABEL: 'fmuladd' -; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c) -; LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc) -; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE-LABEL: 'fmuladd' -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c) -; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc) -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE_LATE-LABEL: 'fmuladd' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; - %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c) - %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc) - ret void -} - -define void @log2(float %a, <16 x float> %va) { -; THRU-LABEL: 'log2' -; THRU-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.log2.f32(float %a) -; THRU-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va) -; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; LATE-LABEL: 'log2' -; LATE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.log2.f32(float %a) -; LATE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va) -; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE-LABEL: 'log2' -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.log2.f32(float %a) -; SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va) -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE_LATE-LABEL: 'log2' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.log2.f32(float %a) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; - %s = call float @llvm.log2.f32(float %a) - %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va) - ret void -} - -define void @constrained_fadd(float %a, <16 x float> %va) strictfp { -; THRU-LABEL: 'constrained_fadd' -; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") -; THRU-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> 
%va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore") -; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; LATE-LABEL: 'constrained_fadd' -; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") -; LATE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore") -; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE-LABEL: 'constrained_fadd' -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") -; SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore") -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE_LATE-LABEL: 'constrained_fadd' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore") -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; - %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") - %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore") - ret void -} - -define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) { -; THRU-LABEL: 'fmaximum' -; THRU-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b) -; THRU-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) -; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; LATE-LABEL: 'fmaximum' -; LATE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b) -; LATE-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) -; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE-LABEL: 'fmaximum' -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b) -; SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE_LATE-LABEL: 'fmaximum' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.maximum.f32(float 
%a, float %b) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; - %s = call float @llvm.maximum.f32(float %a, float %b) - %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) - ret void -} - -define void @cttz(i32 %a, <16 x i32> %va) { -; THRU-LABEL: 'cttz' -; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) -; THRU-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false) -; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; LATE-LABEL: 'cttz' -; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) -; LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false) -; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE-LABEL: 'cttz' -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) -; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false) -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE_LATE-LABEL: 'cttz' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; - %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) - %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false) - ret void -} - -define void @ctlz(i32 %a, <16 x i32> %va) { -; THRU-LABEL: 'ctlz' -; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true) -; THRU-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true) -; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; LATE-LABEL: 'ctlz' -; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true) -; LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true) -; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE-LABEL: 'ctlz' -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true) -; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true) -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE_LATE-LABEL: 'ctlz' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
ret void -; - %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true) - %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true) - ret void -} - -define void @fshl(i32 %a, i32 %b, i32 %c, <16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) { -; THRU-LABEL: 'fshl' -; THRU-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) -; THRU-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) -; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; LATE-LABEL: 'fshl' -; LATE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) -; LATE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) -; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE-LABEL: 'fshl' -; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) -; SIZE-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE_LATE-LABEL: 'fshl' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; - %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) - %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) - ret void -} - -define void @maskedgather(<16 x ptr> %va, <16 x i1> %vb, <16 x float> %vc) { -; THRU-LABEL: 'maskedgather' -; THRU-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc) -; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; LATE-LABEL: 'maskedgather' -; LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc) -; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE-LABEL: 'maskedgather' -; SIZE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc) -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE_LATE-LABEL: 'maskedgather' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; - %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) - ret void -} - -define void @maskedscatter(<16 x float> %va, <16 x ptr> %vb, <16 x i1> %vc) { -; THRU-LABEL: 'maskedscatter' -; THRU-NEXT: Cost Model: Found an 
estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc) -; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; LATE-LABEL: 'maskedscatter' -; LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc) -; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE-LABEL: 'maskedscatter' -; SIZE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc) -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE_LATE-LABEL: 'maskedscatter' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; - call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc) - ret void -} - -define void @reduce_fmax(<16 x float> %va) { -; THRU-LABEL: 'reduce_fmax' -; THRU-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va) -; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; LATE-LABEL: 'reduce_fmax' -; LATE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va) -; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE-LABEL: 'reduce_fmax' -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va) -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE_LATE-LABEL: 'reduce_fmax' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; - %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va) - ret void -} - -define void @memcpy(ptr %a, ptr %b, i32 %c) { -; THRU-LABEL: 'memcpy' -; THRU-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false) -; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; LATE-LABEL: 'memcpy' -; LATE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false) -; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE-LABEL: 'memcpy' -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false) -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; -; SIZE_LATE-LABEL: 'memcpy' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void -; - call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 
%b, i32 32, i1 false)
- ret void
-}
-
-define void @ssa_copy() {
-; THRU-LABEL: 'ssa_copy'
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i = call i32 @llvm.ssa.copy.i32(i32 undef)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f = call float @llvm.ssa.copy.f32(float undef)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p = call ptr @llvm.ssa.copy.p0(ptr undef)
-; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'ssa_copy'
-; LATE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i = call i32 @llvm.ssa.copy.i32(i32 undef)
-; LATE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f = call float @llvm.ssa.copy.f32(float undef)
-; LATE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p = call ptr @llvm.ssa.copy.p0(ptr undef)
-; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'ssa_copy'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i = call i32 @llvm.ssa.copy.i32(i32 undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f = call float @llvm.ssa.copy.f32(float undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p = call ptr @llvm.ssa.copy.p0(ptr undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'ssa_copy'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %i = call i32 @llvm.ssa.copy.i32(i32 undef)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f = call float @llvm.ssa.copy.f32(float undef)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %p = call ptr @llvm.ssa.copy.p0(ptr undef)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %i = call i32 @llvm.ssa.copy.i32(i32 undef)
- %f = call float @llvm.ssa.copy.f32(float undef)
- %p = call ptr @llvm.ssa.copy.p0(ptr undef)
- ret void
-}
diff --git a/llvm/test/Analysis/CostModel/ARM/mve-intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/ARM/mve-intrinsic-cost-kinds.ll
new file mode 100644
index 0000000000000..b3ad818df394e
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/mve-intrinsic-cost-kinds.ll
@@ -0,0 +1,181 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -mtriple=armv8.1m.main -mattr=+mve.fp -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; Test a cross-section of intrinsics for various cost-kinds.
+; Other test files may check for accuracy of a particular intrinsic
+; across subtargets or types. This is just a basic correctness check using an
+; ARM target and a legal scalar type (i32/float) and/or an
+; illegal vector type (16 x i32/float).
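+;
+; Note on the output format checked below (a reading aid, inferred from the
+; check lines themselves): with -cost-kind=all each line reports all four
+; cost kinds at once (RThru = reciprocal throughput, CodeSize, Lat = latency,
+; SizeLat = size-latency), and when all four kinds agree the printer
+; collapses them to a single "Found costs of N" value.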
+
+declare i32 @llvm.smax.i32(i32, i32)
+declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>)
+
+declare float @llvm.fmuladd.f32(float, float, float)
+declare <16 x float> @llvm.fmuladd.v16f32(<16 x float>, <16 x float>, <16 x float>)
+
+declare float @llvm.log2.f32(float)
+declare <16 x float> @llvm.log2.v16f32(<16 x float>)
+
+declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
+declare <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float>, <16 x float>, metadata, metadata)
+
+declare float @llvm.maximum.f32(float, float)
+declare <16 x float> @llvm.maximum.v16f32(<16 x float>, <16 x float>)
+
+declare i32 @llvm.cttz.i32(i32, i1)
+declare <16 x i32> @llvm.cttz.v16i32(<16 x i32>, i1)
+
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1)
+
+declare i32 @llvm.fshl.i32(i32, i32, i32)
+declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
+
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
+declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>)
+declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
+
+declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)
+
+declare i32 @llvm.ssa.copy.i32(i32)
+declare float @llvm.ssa.copy.f32(float)
+declare ptr @llvm.ssa.copy.p0(ptr)
+
+define void @smax(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
+; CHECK-LABEL: 'smax'
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:2 SizeLat:2 for: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+  %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
+  ret void
+}
+
+define void @fmuladd(float %a, float %b, float %c, <16 x float> %va, <16 x float> %vb, <16 x float> %vc) {
+; CHECK-LABEL: 'fmuladd'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+  %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
+  ret void
+}
+
+define void @log2(float %a, <16 x float> %va) {
+; CHECK-LABEL: 'log2'
+; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %s = call float @llvm.log2.f32(float %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:192 CodeSize:48 Lat:192 SizeLat:192 for: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %s = call float @llvm.log2.f32(float %a)
+  %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
+  ret void
+}
+
+define void @constrained_fadd(float %a, <16 x float> %va) strictfp {
+; CHECK-LABEL: 'constrained_fadd'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+; CHECK-NEXT: Cost Model: Found costs of 48 for: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+  %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+  ret void
+}
+
+define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) {
+; CHECK-LABEL: 'fmaximum'
+; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %s = call float @llvm.maximum.f32(float %a, float %b)
+; CHECK-NEXT: Cost Model: Found costs of RThru:208 CodeSize:64 Lat:208 SizeLat:208 for: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %s = call float @llvm.maximum.f32(float %a, float %b)
+  %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb)
+  ret void
+}
+
+define void @cttz(i32 %a, <16 x i32> %va) {
+; CHECK-LABEL: 'cttz'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+  %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
+  ret void
+}
+
+define void @ctlz(i32 %a, <16 x i32> %va) {
+; CHECK-LABEL: 'ctlz'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %s = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
+  %v = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %va, i1 true)
+  ret void
+}
+
+define void @fshl(i32 %a, i32 %b, i32 %c, <16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) {
+; CHECK-LABEL: 'fshl'
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:7 for: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
+; CHECK-NEXT: Cost Model: Found costs of RThru:120 CodeSize:92 Lat:120 SizeLat:120 for: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
+  %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc)
+  ret void
+}
+
+define void @maskedgather(<16 x ptr> %va, <16 x i1> %vb, <16 x float> %vc) {
+; CHECK-LABEL: 'maskedgather'
+; CHECK-NEXT: Cost Model: Found costs of 176 for: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc)
+  ret void
+}
+
+define void @maskedscatter(<16 x float> %va, <16 x ptr> %vb, <16 x i1> %vc) {
+; CHECK-LABEL: 'maskedscatter'
+; CHECK-NEXT: Cost Model: Found costs of 176 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc)
+  ret void
+}
+
+define void @reduce_fmax(<16 x float> %va) {
+; CHECK-LABEL: 'reduce_fmax'
+; CHECK-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:9 SizeLat:9 for: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+  ret void
+}
+
+define void @memcpy(ptr %a, ptr %b, i32 %c) {
+; CHECK-LABEL: 'memcpy'
+; CHECK-NEXT: Cost Model: Found costs of 4 for: call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  call void @llvm.memcpy.p0.p0.i32(ptr align 1 %a, ptr align 1 %b, i32 32, i1 false)
+  ret void
+}
+
+define void @ssa_copy() {
+; CHECK-LABEL: 'ssa_copy'
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %i = call i32 @llvm.ssa.copy.i32(i32 undef)
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %f = call float @llvm.ssa.copy.f32(float undef)
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %p = call ptr @llvm.ssa.copy.p0(ptr undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %i = call i32 @llvm.ssa.copy.i32(i32 undef)
+  %f = call float @llvm.ssa.copy.f32(float undef)
+  %p = call ptr @llvm.ssa.copy.p0(ptr undef)
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/ARM/mve-target-intrinsics.ll b/llvm/test/Analysis/CostModel/ARM/mve-target-intrinsics.ll
new file mode 100644
index 0000000000000..d09216a41485f
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/mve-target-intrinsics.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=thumbv8.1m.main -mattr=+mve | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
+define void @intrinsics() {
+; CHECK-LABEL: 'intrinsics'
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr undef)
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
+  %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr undef)
+  %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
+  %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
+  ret void
+}
+
+declare i32 @llvm.arm.ssat(i32, i32)
+declare { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr)
+declare { i32, i32 } @llvm.arm.mve.sqrshrl(i32, i32, i32, i32)
+declare { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32, i32, i32, i32, i32, <8 x i16>, <8 x i16>)
diff --git a/llvm/test/Analysis/CostModel/ARM/target-intrinsics.ll b/llvm/test/Analysis/CostModel/ARM/target-intrinsics.ll
deleted file mode 100644
index 2d6b3184ccde0..0000000000000
--- a/llvm/test/Analysis/CostModel/ARM/target-intrinsics.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main -mattr=+mve | FileCheck %s --check-prefix=CHECK-THUMB2-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=throughput -mtriple=thumbv8.1m.main -mattr=+mve | FileCheck %s --check-prefix=CHECK-THUMB2-RECIP
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mtriple=thumbv8.1m.main -mattr=+mve | FileCheck %s --check-prefix=CHECK-THUMB2-LAT
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main -mattr=+mve | FileCheck %s --check-prefix=CHECK-THUMB2-SIZE
-
-target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
-
-define void @intrinsics() {
-; CHECK-THUMB2-RECIP-LABEL: 'intrinsics'
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr undef)
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
-; CHECK-THUMB2-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; CHECK-THUMB2-LAT-LABEL: 'intrinsics'
-; CHECK-THUMB2-LAT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
-; CHECK-THUMB2-LAT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr undef)
-; CHECK-THUMB2-LAT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
-; CHECK-THUMB2-LAT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
-; CHECK-THUMB2-LAT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; CHECK-THUMB2-SIZE-LABEL: 'intrinsics'
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr undef)
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
-; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-  %t1 = call i32 @llvm.arm.ssat(i32 undef, i32 undef)
-  %t2 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr undef)
-  %t3 = call { i32, i32 } @llvm.arm.mve.sqrshrl(i32 undef, i32 undef, i32 undef, i32 48)
-  %t4 = tail call { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32 0, i32 0, i32 0, i32 0, i32 0, <8 x i16> undef, <8 x i16> undef)
-  ret void
-}
-
-declare i32 @llvm.arm.ssat(i32, i32)
-declare { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0(ptr)
-declare { i32, i32 } @llvm.arm.mve.sqrshrl(i32, i32, i32, i32)
-declare { i32, i32 } @llvm.arm.mve.vmlldava.v8i16(i32, i32, i32, i32, i32, <8 x i16>, <8 x i16>)
diff --git a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
index 9193025264cc0..6177ae5f05020 100644
--- a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
@@ -112,8 +112,7 @@ entry:
 define double @load_u64_from_u8_off1(ptr %n){
 ; CHECK-LABEL: load_u64_from_u8_off1:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #1]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr b0, [x0, #1]
 ; CHECK-NEXT: ret
 entry:
   %p = getelementptr i8, ptr %n, i64 1
@@ -140,8 +139,7 @@ entry:
 define float @load_u32_from_u8_off1(ptr %n){
 ; CHECK-LABEL: load_u32_from_u8_off1:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #1]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #1]
 ; CHECK-NEXT: ret
 entry:
   %p = getelementptr i8, ptr %n, i64 1
@@ -154,8 +152,7 @@ entry:
 define half @load_u16_from_u8_off1(ptr %n){
 ; CHECK-LABEL: load_u16_from_u8_off1:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #1]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #1]
 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
 ; CHECK-NEXT: ret
 entry:
@@ -185,8 +182,7 @@ entry:
 define double @load_u64_from_u16_off2(ptr %n){
 ; CHECK-LABEL: load_u64_from_u16_off2:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrh w8, [x0, #2]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr h0, [x0, #2]
 ; CHECK-NEXT: ret
 entry:
   %p = getelementptr i8, ptr %n, i64 2
@@ -199,8 +195,7 @@ entry:
 define double @load_u64_from_u8_off2(ptr %n){
 ; CHECK-LABEL: load_u64_from_u8_off2:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #2]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr b0, [x0, #2]
 ; CHECK-NEXT: ret
 entry:
   %p = getelementptr i8, ptr %n, i64 2
@@ -226,7 +221,7 @@ entry:
 define float @load_u32_from_u8_off2(ptr %n){
 ; CHECK-LABEL: load_u32_from_u8_off2:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #1]
+; CHECK-NEXT: ldr b0, [x0, #2]
 ; CHECK-NEXT: ret
 entry:
   %p = getelementptr i8, ptr %n, i64 2
@@ -239,7 +234,7 @@ entry:
 define half @load_u16_from_u8_off2(ptr %n){
 ; CHECK-LABEL: load_u16_from_u8_off2:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #1]
+; CHECK-NEXT: ldr b0, [x0, #2]
 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
 ; CHECK-NEXT: ret
 entry:
@@ -283,8 +278,7 @@ entry:
 define double @load_u64_from_u8_off255(ptr %n){
 ; CHECK-LABEL: load_u64_from_u8_off255:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #255]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr b0, [x0, #255]
 ; CHECK-NEXT: ret
 entry:
   %p = getelementptr i8, ptr %n, i64 255
@@ -311,8 +305,7 @@ entry:
 define float @load_u32_from_u8_off255(ptr %n){
 ; CHECK-LABEL: load_u32_from_u8_off255:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #255]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #255]
 ; CHECK-NEXT: ret
 entry:
   %p = getelementptr i8, ptr %n, i64 255
@@ -325,8 +318,7 @@ entry:
 define half @load_u16_from_u8_off255(ptr %n){
 ; CHECK-LABEL: load_u16_from_u8_off255:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #255]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #255]
 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
 ; CHECK-NEXT: ret
 entry:
@@ -354,7 +346,7 @@ entry:
 define double @load_u64_from_u16_off256(ptr %n){
 ; CHECK-LABEL: load_u64_from_u16_off256:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr h0, [x0, #128]
+; CHECK-NEXT: ldr h0, [x0, #256]
 ; CHECK-NEXT: ret
 entry:
   %p = getelementptr i8, ptr %n, i64 256
@@ -367,7 +359,7 @@ entry:
 define double @load_u64_from_u8_off256(ptr %n){
 ; CHECK-LABEL: load_u64_from_u8_off256:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #64]
+; CHECK-NEXT: ldr b0, [x0, #256]
 ; CHECK-NEXT: ret
 entry:
   %p = getelementptr i8, ptr %n, i64 256
@@ -393,7 +385,7 @@ entry:
 define float @load_u32_from_u8_off256(ptr %n){
 ; CHECK-LABEL: load_u32_from_u8_off256:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #128]
+; CHECK-NEXT: ldr b0, [x0, #256]
 ; CHECK-NEXT: ret
 entry:
   %p = getelementptr i8, ptr %n, i64 256
@@ -406,7 +398,7 @@ entry:
 define half @load_u16_from_u8_off256(ptr %n){
 ; CHECK-LABEL: load_u16_from_u8_off256:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #128]
+; CHECK-NEXT: ldr b0, [x0, #256]
 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
 ; CHECK-NEXT: ret
 entry:
@@ -435,8 +427,7 @@ entry:
 define double @load_u64_from_u16_offn(ptr %n){
 ; CHECK-LABEL: load_u64_from_u16_offn:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #8190 // =0x1ffe
-; CHECK-NEXT: ldr h0, [x0, x8]
+; CHECK-NEXT: ldr h0, [x0, #8190]
 ; CHECK-NEXT: ret
 entry:
   %p = getelementptr i8, ptr %n, i64 8190
@@ -517,7 +508,8 @@ entry:
 define double @load_u64_from_u16_offnp1(ptr %n){
 ; CHECK-LABEL: load_u64_from_u16_offnp1:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr h0, [x0, #4096]
+; CHECK-NEXT: add x8, x0, #2, lsl #12 // =8192
+; CHECK-NEXT: ldr h0, [x8]
 ; CHECK-NEXT: ret
 entry:
   %p = getelementptr i8, ptr %n, i64 8192
@@ -530,7 +522,8 @@ entry:
 define double @load_u64_from_u8_offnp1(ptr %n){
 ; CHECK-LABEL: load_u64_from_u8_offnp1:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #1024]
+; CHECK-NEXT: add x8, x0, #1, lsl #12 // =4096
+; CHECK-NEXT: ldr b0, [x8]
 ; CHECK-NEXT: ret
 entry:
   %p = getelementptr i8, ptr %n, i64 4096
@@ -557,7 +550,8 @@ entry:
 define float @load_u32_from_u8_offnp1(ptr %n){
 ; CHECK-LABEL: load_u32_from_u8_offnp1:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #2048]
+; CHECK-NEXT: add x8, x0, #1, lsl #12 // =4096
+; CHECK-NEXT: ldr b0, [x8]
 ; CHECK-NEXT: ret
 entry:
   %p = getelementptr i8, ptr %n, i64 4096
@@ -570,7 +564,8 @@ entry:
 define half @load_u16_from_u8_offnp1(ptr %n){
 ; CHECK-LABEL: load_u16_from_u8_offnp1:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #2048]
+; CHECK-NEXT: add x8, x0, #1, lsl #12 // =4096
+; CHECK-NEXT: ldr b0, [x8]
 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
 ; CHECK-NEXT: ret
 entry:
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
index bd07ba1316ece..eb4cf76c96da6 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
@@ -20,6 +20,9 @@
 ; CHECK: OpReadClockKHR [[v2uint]] [[uint_1]]
 ; CHECK: OpReadClockKHR [[v2uint]] [[uint_2]]
 ; CHECK: OpReadClockKHR [[v2uint]] [[uint_3]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_1]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_2]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_3]]
 
 define dso_local spir_kernel void @test_clocks(ptr addrspace(1) nocapture noundef writeonly align 8 %out64, ptr addrspace(1) nocapture noundef writeonly align 8 %outv2) {
 entry:
@@ -39,6 +42,9 @@ entry:
   %call9 = tail call spir_func <2 x i32> @_Z25clock_read_hilo_sub_groupv()
   %arrayidx10 = getelementptr inbounds i8, ptr addrspace(1) %outv2, i32 16
   store <2 x i32> %call9, ptr addrspace(1) %arrayidx10, align 8
+  %call10 = call spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32 1)
+  %call11 = call spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32 2)
+  %call12 = call spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32 3)
   ret void
 }
 
@@ -59,3 +65,6 @@ declare spir_func <2 x i32> @_Z26clock_read_hilo_work_groupv() local_unnamed_add
 
 ; Function Attrs: convergent nounwind
 declare spir_func <2 x i32> @_Z25clock_read_hilo_sub_groupv() local_unnamed_addr
+
+; Function Attrs: nounwind
+declare spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32)
diff --git a/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll b/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll
index 1f0737b719254..d0b8d14777f52 100644
--- a/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll
+++ b/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll
@@ -24,9 +24,9 @@
 ; RUN: llvm-dis %t5.1.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR1
 ; RUN: llvm-dis %t5.2.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR2
 
-; PRINT-DAG: Devirtualized call to {{.*}} (_ZN1A1nEi)
+; PRINT-DAG: Devirtualized call to {{.*}} (_ZN1B1nEi)
 
-; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi
+; REMARK-DAG: single-impl: devirtualized a call to _ZN1B1nEi
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-grtev4-linux-gnu"
@@ -55,7 +55,7 @@ entry:
   ret i32 0
 }
 
-; CHECK-IR1: define i32 @test(
+; CHECK-IR1: define noundef i32 @test(
 define i32 @test(ptr %obj, i32 %a) {
 entry:
   %vtable = load ptr, ptr %obj
@@ -65,7 +65,7 @@ entry:
   %fptr1 = load ptr, ptr %fptrptr, align 8
 
   ; Check that the call was devirtualized.
-  ; CHECK-IR1: tail call i32 {{.*}}@_ZN1A1nEi
+  ; CHECK-IR1: tail call i32 {{.*}}@_ZN1B1nEi
   %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a)
 
   ret i32 %call
@@ -73,7 +73,7 @@ entry:
 
 ; CHECK-IR2: define i32 @test2
 ; Check that the call was devirtualized.
-; CHECK-IR2: tail call i32 @_ZN1A1nEi
+; CHECK-IR2: tail call i32 @_ZN1B1nEi
 
 declare i1 @llvm.type.test(ptr, metadata)
 declare void @llvm.assume(i1)
diff --git a/llvm/test/Transforms/ConstraintElimination/add-nsw.ll b/llvm/test/Transforms/ConstraintElimination/add-nsw.ll
index 5127e92a7c345..4b8ac09801ab2 100644
--- a/llvm/test/Transforms/ConstraintElimination/add-nsw.ll
+++ b/llvm/test/Transforms/ConstraintElimination/add-nsw.ll
@@ -757,8 +757,7 @@ define i1 @add_neg_1_known_sge_ult_1(i32 %a) {
 ; CHECK-NEXT: [[A_SGE:%.*]] = icmp sge i32 [[A:%.*]], 1
 ; CHECK-NEXT: call void @llvm.assume(i1 [[A_SGE]])
 ; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[A]], -1
-; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[SUB]], [[A]]
-; CHECK-NEXT: ret i1 [[C]]
+; CHECK-NEXT: ret i1 true
 ;
 entry:
   %a.sge = icmp sge i32 %a, 1
@@ -823,8 +822,7 @@ define i1 @add_neg_3_known_sge_ult_1(i32 %a) {
 ; CHECK-NEXT: [[A_SGE:%.*]] = icmp sge i32 [[A:%.*]], 3
 ; CHECK-NEXT: call void @llvm.assume(i1 [[A_SGE]])
 ; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[A]], -3
-; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[SUB]], [[A]]
-; CHECK-NEXT: ret i1 [[C]]
+; CHECK-NEXT: ret i1 true
 ;
 entry:
   %a.sge = icmp sge i32 %a, 3
diff --git a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll
index 52adc78b4e159..8dcac7832179a 100644
--- a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll
+++ b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll
@@ -389,8 +389,7 @@ define i1 @gep_count_add_1_sge_known_ult_1(i32 %count, ptr %p) {
 ; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[COUNT]], -1
 ; CHECK-NEXT: [[SUB_EXT:%.*]] = zext i32 [[SUB]] to i64
 ; CHECK-NEXT: [[GEP_SUB:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[SUB_EXT]]
-; CHECK-NEXT: [[C:%.*]] = icmp ult ptr [[GEP_SUB]], [[GEP_COUNT]]
-; CHECK-NEXT: ret i1 [[C]]
+; CHECK-NEXT: ret i1 true
 ;
 entry:
   %sge = icmp sge i32 %count, 1
@@ -415,8 +414,7 @@ define i1 @gep_count_add_1_sge_known_uge_1(i32 %count, ptr %p) {
 ; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[COUNT]], -1
 ; CHECK-NEXT: [[SUB_EXT:%.*]] = zext i32 [[SUB]] to i64
 ; CHECK-NEXT: [[GEP_SUB:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[SUB_EXT]]
-; CHECK-NEXT: [[C:%.*]] = icmp uge ptr [[GEP_SUB]], [[P]]
-; CHECK-NEXT: ret i1 [[C]]
+; CHECK-NEXT: ret i1 true
 ;
 entry:
   %sge = icmp sge i32 %count, 1
diff --git a/llvm/test/Transforms/DeadArgElim/dbginfo.ll b/llvm/test/Transforms/DeadArgElim/dbginfo.ll
index a27ca9dd70c24..f6313c7957f88 100644
--- a/llvm/test/Transforms/DeadArgElim/dbginfo.ll
+++ b/llvm/test/Transforms/DeadArgElim/dbginfo.ll
@@ -21,14 +21,14 @@
 ; updated LLVM functions.
 
 ; Function Attrs: uwtable
-define void @_Z2f2v() #0 !dbg !4 {
+define void @_Z2f2v() !dbg !4 {
 entry:
   call void (i32, ...) @_ZL2f1iz(i32 1), !dbg !15
   ret void, !dbg !16
 }
 
 ; Function Attrs: nounwind uwtable
-define internal void @_ZL2f1iz(i32, ...) #1 !dbg !8 {
+define internal void @_ZL2f1iz(i32, ...) !dbg !8 {
 entry:
   call void @llvm.dbg.value(metadata i32 %0, metadata !17, metadata !18), !dbg !19
   ret void, !dbg !20
@@ -40,8 +40,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.value(metadata, metadata, metadata) #2
 
-attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!0}
diff --git a/llvm/test/Transforms/DeadStoreElimination/mda-with-dbg-values.ll b/llvm/test/Transforms/DeadStoreElimination/mda-with-dbg-values.ll
index f66aa0f81ec75..2cec6b5d805b7 100644
--- a/llvm/test/Transforms/DeadStoreElimination/mda-with-dbg-values.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/mda-with-dbg-values.ll
@@ -14,7 +14,7 @@ target triple = "x86_64-unknown-linux-gnu"
 @g = common global [1 x i8] zeroinitializer, align 1, !dbg !0
 
 ; Function Attrs: noinline nounwind uwtable
-define void @foo() #0 !dbg !14 {
+define void @foo() !dbg !14 {
 entry:
   %i = alloca i8, align 1
   store i8 1, ptr %i, align 1, !dbg !19
@@ -37,7 +37,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 ; Function Attrs: argmemonly nounwind
 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1) #2
 
-attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone speculatable }
 attributes #2 = { argmemonly nounwind }
diff --git a/llvm/test/Transforms/FunctionImport/Inputs/funcimport_debug.ll b/llvm/test/Transforms/FunctionImport/Inputs/funcimport_debug.ll
index 4d3a241352630..628669d6f4260 100644
--- a/llvm/test/Transforms/FunctionImport/Inputs/funcimport_debug.ll
+++ b/llvm/test/Transforms/FunctionImport/Inputs/funcimport_debug.ll
@@ -3,13 +3,11 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: nounwind uwtable
-define void @func() #0 !dbg !4 {
+define void @func() !dbg !4 {
 entry:
   ret void, !dbg !10
 }
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
diff --git a/llvm/test/Transforms/FunctionImport/funcimport_debug.ll b/llvm/test/Transforms/FunctionImport/funcimport_debug.ll
index f449047096ed3..713dc4ebf8e89 100644
--- a/llvm/test/Transforms/FunctionImport/funcimport_debug.ll
+++ b/llvm/test/Transforms/FunctionImport/funcimport_debug.ll
@@ -24,16 +24,13 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: nounwind uwtable
-define i32 @main() #0 !dbg !4 {
+define i32 @main() !dbg !4 {
 entry:
   call void (...) @func(), !dbg !11
   ret i32 0, !dbg !12
 }
 
-declare void @func(...) #1
-
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @func(...)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!8, !9}
diff --git a/llvm/test/Transforms/GCOVProfiling/exit-block.ll b/llvm/test/Transforms/GCOVProfiling/exit-block.ll
index 1840f045b3ffe..543edac243def 100644
--- a/llvm/test/Transforms/GCOVProfiling/exit-block.ll
+++ b/llvm/test/Transforms/GCOVProfiling/exit-block.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = common global i32 0, align 4, !dbg !9
 
 ; Function Attrs: nounwind uwtable
-define void @test() #0 !dbg !4 {
+define void @test() !dbg !4 {
 entry:
   tail call void (...) @f() #2, !dbg !14
   %0 = load i32, ptr @A, align 4, !dbg !15
@@ -28,12 +28,10 @@ if.end: ; preds = %entry, %if.then
   ret void, !dbg !18
 }
 
-declare void @f(...) #1
+declare void @f(...)
 
-declare void @g(...) #1
+declare void @g(...)
 
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind }
 
 !llvm.gcov = !{!19}
diff --git a/llvm/test/Transforms/GCOVProfiling/linezero.ll b/llvm/test/Transforms/GCOVProfiling/linezero.ll
index 8bfeabdf906a1..1eae41303f5d4 100644
--- a/llvm/test/Transforms/GCOVProfiling/linezero.ll
+++ b/llvm/test/Transforms/GCOVProfiling/linezero.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-unknown-linux-gnu"
 %struct.vector = type { i8 }
 
 ; Function Attrs: nounwind
-define i32 @_Z4testv() #0 !dbg !15 {
+define i32 @_Z4testv() !dbg !15 {
 entry:
   %retval = alloca i32, align 4
   %__range = alloca ptr, align 8
@@ -63,19 +63,19 @@ return: ; No predecessors!
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
-declare void @_Z13TagFieldSpecsv() #2
+declare void @_Z13TagFieldSpecsv()
 
-declare ptr @_ZN6vector5beginEv(ptr) #2
+declare ptr @_ZN6vector5beginEv(ptr)
 
-declare ptr @_ZN6vector3endEv(ptr) #2
+declare ptr @_ZN6vector3endEv(ptr)
 
 ; Function Attrs: noreturn nounwind
-declare void @llvm.trap() #3
+declare void @llvm.trap()
 
 ; Function Attrs: nounwind
-define void @_Z2f1v() #0 !dbg !20 {
+define void @_Z2f1v() !dbg !20 {
 entry:
   br label %0
 
@@ -83,11 +83,6 @@ entry:
   ret void, !dbg !45
 }
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noreturn nounwind }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!23, !24}
 !llvm.gcov = !{!25}
diff --git a/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll b/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll
index 7f169e2d1ebc3..98bdd748ed03c 100644
--- a/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll
+++ b/llvm/test/Transforms/GCOVProfiling/split-indirectbr-critical-edges.ll
@@ -10,7 +10,7 @@
 ; CHECK-NEXT: load {{.*}} @__llvm_gcov_ctr
 ; CHECK-NOT: load {{.*}} @__llvm_gcov_ctr
 
-define dso_local i32 @cannot_split(ptr nocapture readonly %p) #0 !dbg !7 {
+define dso_local i32 @cannot_split(ptr nocapture readonly %p) !dbg !7 {
 entry:
   %targets = alloca <2 x ptr>, align 16
   store <2 x ptr> , ptr %targets, align 16, !dbg !9
@@ -42,8 +42,6 @@ end: ; preds = %indirect
   ret i32 0, !dbg !22
 }
 
-attributes #0 = { norecurse nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/GVN/cond_br2.ll b/llvm/test/Transforms/GVN/cond_br2.ll
index 6ceec95845b7a..4811ad021b109 100644
--- a/llvm/test/Transforms/GVN/cond_br2.ll
+++ b/llvm/test/Transforms/GVN/cond_br2.ll
@@ -11,12 +11,12 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 %"union.llvm::SmallVectorBase::U" = type { x86_fp80 }
 
 ; Function Attrs: ssp uwtable
-define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
+define void @_Z4testv() personality ptr @__gxx_personality_v0 {
 ; CHECK-LABEL: define void @_Z4testv(
-; CHECK-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 {
+; CHECK-SAME: ) personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT: [[ENTRY:.*:]]
 ; CHECK-NEXT: [[SV:%.*]] = alloca %"class.llvm::SmallVector", align 16
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[SV]]) #[[ATTR4:[0-9]+]]
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[SV]])
 ; CHECK-NEXT: [[FIRSTEL_I_I_I_I_I_I:%.*]] = getelementptr inbounds %"class.llvm::SmallVector", ptr [[SV]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 3
 ; CHECK-NEXT: store ptr [[FIRSTEL_I_I_I_I_I_I]], ptr [[SV]], align 16, !tbaa [[ANYPTR_TBAA0:![0-9]+]]
 ; CHECK-NEXT: [[ENDX_I_I_I_I_I_I:%.*]] = getelementptr inbounds %"class.llvm::SmallVector", ptr [[SV]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 1
@@ -66,10 +66,10 @@ define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT: [[CMP_I_I_I_I19:%.*]] = icmp eq ptr [[TMP0]], [[FIRSTEL_I_I_I_I_I_I]]
 ; CHECK-NEXT: br i1 [[CMP_I_I_I_I19]], label %[[_ZN4LLVM11SMALLVECTORIILJ8EED1EV_EXIT21:.*]], label %[[IF_THEN_I_I_I20:.*]]
 ; CHECK: [[IF_THEN_I_I_I20]]:
-; CHECK-NEXT: call void @free(ptr [[TMP0]]) #[[ATTR4]]
+; CHECK-NEXT: call void @free(ptr [[TMP0]])
 ; CHECK-NEXT: br label %[[_ZN4LLVM11SMALLVECTORIILJ8EED1EV_EXIT21]]
 ; CHECK: [[_ZN4LLVM11SMALLVECTORIILJ8EED1EV_EXIT21]]:
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[SV]]) #[[ATTR4]]
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[SV]])
 ; CHECK-NEXT: ret void
 ; CHECK: [[LPAD]]:
 ; CHECK-NEXT: [[TMP1:%.*]] = landingpad { ptr, i32 }
@@ -78,7 +78,7 @@ define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT: [[CMP_I_I_I_I:%.*]] = icmp eq ptr [[TMP2]], [[FIRSTEL_I_I_I_I_I_I]]
 ; CHECK-NEXT: br i1 [[CMP_I_I_I_I]], label %[[EH_RESUME:.*]], label %[[IF_THEN_I_I_I:.*]]
 ; CHECK: [[IF_THEN_I_I_I]]:
-; CHECK-NEXT: call void @free(ptr [[TMP2]]) #[[ATTR4]]
+; CHECK-NEXT: call void @free(ptr [[TMP2]])
 ; CHECK-NEXT: br label %[[EH_RESUME]]
 ; CHECK: [[EH_RESUME]]:
 ; CHECK-NEXT: resume { ptr, i32 } [[TMP1]]
@@ -86,7 +86,7 @@ define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
 ;
 entry:
   %sv = alloca %"class.llvm::SmallVector", align 16
-  call void @llvm.lifetime.start.p0(ptr %sv) #1
+  call void @llvm.lifetime.start.p0(ptr %sv)
   %FirstEl.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", ptr %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 3
   store ptr %FirstEl.i.i.i.i.i.i, ptr %sv, align 16, !tbaa !4
   %EndX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", ptr %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 1
@@ -151,11 +151,11 @@ invoke.cont3: ; preds = %invoke.cont2
   br i1 %cmp.i.i.i.i19, label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21, label %if.then.i.i.i20
 
 if.then.i.i.i20: ; preds = %invoke.cont3
-  call void @free(ptr %5) #1
+  call void @free(ptr %5)
   br label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21
 
 _ZN4llvm11SmallVectorIiLj8EED1Ev.exit21: ; preds = %invoke.cont3, %if.then.i.i.i20
-  call void @llvm.lifetime.end.p0(ptr %sv) #1
+  call void @llvm.lifetime.end.p0(ptr %sv)
   ret void
 
 lpad: ; preds = %if.end.i14, %if.end
@@ -166,7 +166,7 @@ lpad: ; preds = %if.end.i14, %if.end
   br i1 %cmp.i.i.i.i, label %eh.resume, label %if.then.i.i.i
 
 if.then.i.i.i: ; preds = %lpad
-  call void @free(ptr %7) #1
+  call void @free(ptr %7)
   br label %eh.resume
 
 eh.resume: ; preds = %if.then.i.i.i, %lpad
@@ -174,24 +174,19 @@ eh.resume: ; preds = %if.then.i.i.i, %lpa
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
 
 declare i32 @__gxx_personality_v0(...)
-declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(ptr) #2
+declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(ptr)
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(ptr nocapture)
 
-declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(ptr, i64, i64) #2
+declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(ptr, i64, i64)
 
 ; Function Attrs: nounwind
-declare void @free(ptr nocapture) #3
-
-attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @free(ptr nocapture)
 
 !0 = !{!"any pointer", !1}
 !1 = !{!"omnipotent char", !2}
diff --git a/llvm/test/Transforms/GVN/pr33549.ll b/llvm/test/Transforms/GVN/pr33549.ll
index a8ce37c4f86a6..a1e28f760a193 100644
--- a/llvm/test/Transforms/GVN/pr33549.ll
+++ b/llvm/test/Transforms/GVN/pr33549.ll
@@ -4,9 +4,9 @@
 @Data = common local_unnamed_addr global [32 x i32] zeroinitializer, align 4
 
 ; Function Attrs: norecurse nounwind
-define void @testshl() local_unnamed_addr #0 {
+define void @testshl() local_unnamed_addr {
 ; CHECK-LABEL: define void @testshl(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr {
 ; CHECK-NEXT: [[ENTRY:.*]]:
 ; CHECK-NEXT: br label %[[FOR_BODY:.*]]
 ; CHECK: [[FOR_BODY]]:
@@ -78,8 +78,6 @@ for.end10: ; preds = %for.inc8
   ret void
 }
 
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m7" "target-features"="+d16,+dsp,+fp-armv8,+hwdiv,+strict-align,+thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.module.flags = !{!0, !1}
 !llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/GVN/pr42605.ll b/llvm/test/Transforms/GVN/pr42605.ll
index 3e6241c35ac60..98b447f8add5a 100644
--- a/llvm/test/Transforms/GVN/pr42605.ll
+++ b/llvm/test/Transforms/GVN/pr42605.ll
@@ -114,7 +114,7 @@ if.end: ; preds = %if.then, %entry
   ret void
 }
 
-attributes #0 = { noinline norecurse nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline norecurse nounwind readonly uwtable }
 
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll b/llvm/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll
index b9c8537781a82..082d01682ad4c 100644
--- a/llvm/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll
+++ b/llvm/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll
@@ -14,7 +14,7 @@
 @res = common global i32 0, align 4
 
 ; Function Attrs:
-define i64 @func() #0 {
+define i64 @func() {
 entry:
   ret i64 1
 }
@@ -27,7 +27,7 @@ entry:
   %2 = load volatile i32, ptr @g_x_u, align 4
   %3 = load volatile i32, ptr @g_z_u, align 4
   %4 = load volatile i32, ptr @g_m, align 4
-  %call = call i64 @func() #4
+  %call = call i64 @func()
   %conv = sext i32 %1 to i64
   %cmp = icmp ne i64 %call, %conv
   br i1 %cmp, label %if.end, label %lor.lhs.false
@@ -42,7 +42,7 @@ if.then:
   br label %cleanup
 
 if.end:
-  %call4 = call i64 @func() #4
+  %call4 = call i64 @func()
   %conv5 = zext i32 %3 to i64
   %cmp6 = icmp ne i64 %call4, %conv5
   br i1 %cmp6, label %if.end14, label %lor.lhs.false8
@@ -57,7 +57,7 @@ if.then13:
   br label %cleanup
 
 if.end14:
-  %call15 = call i64 @func() #4
+  %call15 = call i64 @func()
   %cmp17 = icmp ne i64 %call15, %conv
   br i1 %cmp17, label %if.end25, label %lor.lhs.false19
 
@@ -77,5 +77,3 @@ cleanup:
   %retval.0 = phi i32 [ 0, %if.end25 ], [ 1, %if.then24 ], [ 1, %if.then13 ], [ 1, %if.then ]
   ret i32 %retval.0
 }
-
-attributes #0 = { minsize noinline nounwind optsize uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/GVNHoist/pr30499.ll b/llvm/test/Transforms/GVNHoist/pr30499.ll
index bd6f98c0f8a22..6df9484b2471b 100644
--- a/llvm/test/Transforms/GVNHoist/pr30499.ll
+++ b/llvm/test/Transforms/GVNHoist/pr30499.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -S -passes=gvn-hoist < %s
 
-define void @_Z3fn2v() #0 {
+define void @_Z3fn2v() {
 entry:
   %a = alloca ptr, align 8
   %b = alloca i32, align 4
@@ -11,7 +11,7 @@ entry:
   br i1 %tobool, label %if.then, label %if.end
 
 if.then: ; preds = %entry
-  %call = call i64 @_Z3fn1v() #2
+  %call = call i64 @_Z3fn1v()
   %conv = trunc i64 %call to i32
   store i32 %conv, ptr %b, align 4
   br label %if.end
@@ -23,8 +23,4 @@ if.end: ; preds = %if.then, %entry
 }
 
 ; Function Attrs: nounwind readonly
-declare i64 @_Z3fn1v() #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readonly }
+declare i64 @_Z3fn1v()
diff --git a/llvm/test/Transforms/IndVarSimplify/X86/widen-nsw.ll b/llvm/test/Transforms/IndVarSimplify/X86/widen-nsw.ll
index 7cba30dd8a1f4..81a93232b4350 100644
--- a/llvm/test/Transforms/IndVarSimplify/X86/widen-nsw.ll
+++ b/llvm/test/Transforms/IndVarSimplify/X86/widen-nsw.ll
@@ -4,7 +4,7 @@ target triple = "x86_64-apple-macosx"
 
 ; CHECK-LABEL: @test1
 ; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-define i32 @test1(ptr %a) #0 {
+define i32 @test1(ptr %a) {
 entry:
   br label %for.cond
 
@@ -25,5 +25,3 @@ for.body: ; preds = %for.cond
 for.end: ; preds = %for.cond
   ret i32 %sum.0
 }
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/Inline/always-inline-attr.ll b/llvm/test/Transforms/Inline/always-inline-attr.ll
index 08ea3079d3ff2..4e698229a1ee1 100644
--- a/llvm/test/Transforms/Inline/always-inline-attr.ll
+++ b/llvm/test/Transforms/Inline/always-inline-attr.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-grtev4-linux-gnu"
 
 ; After AlwaysInline the callee's attributes should be merged into caller's attibutes.
 
-; CHECK: define dso_local <2 x i64> @foo(ptr byval(<8 x i64>) align 64 %0) #0
+; CHECK: define dso_local <2 x i64> @foo(ptr byval(<8 x i64>) align 64 %0)
 ; CHECK: attributes #0 = { mustprogress uwtable "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="512"
 
 ; Function Attrs: uwtable mustprogress
@@ -36,12 +36,10 @@ entry:
 }
 
 ; Function Attrs: nounwind readnone
-declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16) #2
-
-attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+aes,+avx,+avx2,+avx512bw,+avx512dq,+avx512f,+avx512vl,+bmi2,+cx16,+cx8,+f16c,+fma,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { alwaysinline nounwind uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="512" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+aes,+avx,+avx2,+avx512f,+cx16,+cx8,+f16c,+fma,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
 
+attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "prefer-vector-width"="128" }
+attributes #1 = { alwaysinline nounwind uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="512" "prefer-vector-width"="128" }
 
 !2 = !{!3, !3, i64 0}
 !3 = !{!"omnipotent char", !4, i64 0}
diff --git a/llvm/test/Transforms/Inline/debug-info-duplicate-calls.ll b/llvm/test/Transforms/Inline/debug-info-duplicate-calls.ll
index a4c41341df951..bf0dfa2c9add0 100644
--- a/llvm/test/Transforms/Inline/debug-info-duplicate-calls.ll
+++ b/llvm/test/Transforms/Inline/debug-info-duplicate-calls.ll
@@ -89,11 +89,10 @@ entry:
   ret void, !dbg !20
 }
 
-declare void @_Z2f1v() #2
+attributes #0 = { uwtable }
+attributes #1 = { alwaysinline inlinehint uwtable }
 
-attributes #0 = { uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { alwaysinline inlinehint uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @_Z2f1v()
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!10, !11}
diff --git a/llvm/test/Transforms/Inline/inline-vla.ll b/llvm/test/Transforms/Inline/inline-vla.ll
index 8e4bb3d140bca..1e3c2351e6ed3 100644
--- a/llvm/test/Transforms/Inline/inline-vla.ll
+++ b/llvm/test/Transforms/Inline/inline-vla.ll
@@ -9,7 +9,7 @@
 @.str1 = private unnamed_addr constant [3 x i8] c"ab\00", align 1
 
 ; Function Attrs: nounwind ssp uwtable
-define i32 @main(i32 %argc, ptr nocapture readnone %argv) #0 {
+define i32 @main(i32 %argc, ptr nocapture readnone %argv) {
 entry:
   %data = alloca [2 x i8], align 1
   call fastcc void @memcpy2(ptr %data, ptr @.str, i64 1)
@@ -18,7 +18,7 @@ entry:
 }
 
 ; Function Attrs: inlinehint nounwind ssp uwtable
-define internal fastcc void @memcpy2(ptr nocapture %dst, ptr nocapture readonly %src, i64 %size) #1 {
+define internal fastcc void @memcpy2(ptr nocapture %dst, ptr nocapture readonly %src, i64 %size) {
 entry:
   %vla = alloca i64, i64 %size, align 16
   call void @llvm.memcpy.p0.p0.i64(ptr %vla, ptr %src, i64 %size, i1 false)
@@ -27,11 +27,7 @@ entry:
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) #2
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { inlinehint nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1)
 
 !llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/Inline/optimization-remarks-hotness-threshold.ll b/llvm/test/Transforms/Inline/optimization-remarks-hotness-threshold.ll
index 3021935cdd025..2b4aedd7331e8 100644
--- a/llvm/test/Transforms/Inline/optimization-remarks-hotness-threshold.ll
+++ b/llvm/test/Transforms/Inline/optimization-remarks-hotness-threshold.ll
@@ -27,20 +27,18 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.11.0"
 
 ; Function Attrs: nounwind ssp uwtable
-define i32 @foo() #0 !dbg !7 {
+define i32 @foo() !dbg !7 {
 entry:
   ret i32 1, !dbg !9
 }
 
 ; Function Attrs: nounwind ssp uwtable
-define i32 @bar() #0 !dbg !10 {
+define i32 @bar() !dbg !10 {
 entry:
   %call = call i32 @foo(), !dbg !11
   ret i32 %call, !dbg !12
 }
 
-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5}
 !llvm.ident = !{!6}
diff --git a/llvm/test/Transforms/Inline/optimization-remarks-passed-deleted-callee-yaml.ll b/llvm/test/Transforms/Inline/optimization-remarks-passed-deleted-callee-yaml.ll
index d3947137b4d80..f547c37259314 100644
--- a/llvm/test/Transforms/Inline/optimization-remarks-passed-deleted-callee-yaml.ll
+++ b/llvm/test/Transforms/Inline/optimization-remarks-passed-deleted-callee-yaml.ll
@@ -63,20 +63,18 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.11.0"
 
 ; Function Attrs: nounwind ssp uwtable
-define internal i32 @foo() #0 !dbg !7 {
+define internal i32 @foo() !dbg !7 {
 entry:
   ret i32 1, !dbg !9
 }
 
 ; Function Attrs: nounwind ssp uwtable
-define i32 @bar() #0 !dbg !10 !prof !13 {
+define i32 @bar() !dbg !10 !prof !13 {
 entry:
   %call = call i32 @foo(), !dbg !11
   ret i32 %call, !dbg !12
 }
 
-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5}
 !llvm.ident = !{!6}
diff --git a/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll b/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
index b0a238ff8efee..64b305b0d4749 100644
--- a/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
+++ b/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
@@ -72,20 +72,18 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.11.0"
 
 ; Function Attrs: nounwind ssp uwtable
-define i32 @foo() #0 !dbg !7 {
+define i32 @foo() !dbg !7 {
 entry:
   ret i32 1, !dbg !9
 }
 
 ; Function Attrs: nounwind ssp uwtable
-define i32 @bar() #0 !dbg !10 !prof !13 {
+define i32 @bar() !dbg !10 !prof !13 {
 entry:
   %call = call i32 @foo(), !dbg !11
   ret i32 %call, !dbg !12
 }
 
-attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5}
 !llvm.ident = !{!6}
diff --git a/llvm/test/Transforms/Inline/optimization-remarks.ll b/llvm/test/Transforms/Inline/optimization-remarks.ll
index bc1e690ee61fe..135d835431e59 100644
--- a/llvm/test/Transforms/Inline/optimization-remarks.ll
+++ b/llvm/test/Transforms/Inline/optimization-remarks.ll
@@ -81,9 +81,9 @@ entry:
   ret i32 %add
 }
 
-attributes #0 = { alwaysinline nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noinline nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { alwaysinline nounwind uwtable }
+attributes #1 = { noinline nounwind uwtable }
+attributes #2 = { nounwind uwtable }
 
 !llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-to-svbool-binops.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-to-svbool-binops.ll
index ecedbdb3522db..abe1ed0fa37b8 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-to-svbool-binops.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-to-svbool-binops.ll
@@ -124,6 +124,39 @@ define <vscale x 16 x i1> @try_combine_svbool_binop_orr(<vscale x 16 x i1> %a,
   ret <vscale x 16 x i1> %t3
 }
 
+; Verify predicate cast does not hinder "isAllActive" knowledge.
+define <vscale x 8 x half> @try_combine_svbool_binop_fadd(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: @try_combine_svbool_binop_fadd(
+; CHECK-NEXT: [[T2:%.*]] = fadd <vscale x 8 x half> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: ret <vscale x 8 x half> [[T2]]
+;
+  %t1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> splat (i1 true))
+  %t2 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %t1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 8 x half> %t2
+}
+
+; Verify predicate cast does not hinder "isAllActive" knowledge.
+define <vscale x 4 x float> @try_combine_svbool_binop_fmul(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: @try_combine_svbool_binop_fmul(
+; CHECK-NEXT: [[T2:%.*]] = fmul <vscale x 4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: ret <vscale x 4 x float> [[T2]]
+;
+  %t1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> splat (i1 true))
+  %t2 = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %t1, <vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  ret <vscale x 4 x float> %t2
+}
+
+; Verify predicate cast does not hinder "isAllActive" knowledge.
+define <vscale x 2 x double> @try_combine_svbool_binop_fsub(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: @try_combine_svbool_binop_fsub(
+; CHECK-NEXT: [[T2:%.*]] = fsub <vscale x 2 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: ret <vscale x 2 x double> [[T2]]
+;
+  %t1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> splat (i1 true))
+  %t2 = tail call <vscale x 2 x double> @llvm.aarch64.sve.fsub.nxv2f64(<vscale x 2 x i1> %t1, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %t2
+}
+
 declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
 declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
 declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
diff --git a/llvm/test/Transforms/InstCombine/bitreverse-hang.ll b/llvm/test/Transforms/InstCombine/bitreverse-hang.ll
index bb0129926799f..3f29509cc3525 100644
--- a/llvm/test/Transforms/InstCombine/bitreverse-hang.ll
+++ b/llvm/test/Transforms/InstCombine/bitreverse-hang.ll
@@ -21,7 +21,7 @@
 @b = common global i32 0, align 4
 
 ; CHECK: define i32 @fn1
-define i32 @fn1() #0 {
+define i32 @fn1() {
 entry:
   %b.promoted = load i32, ptr @b, align 4, !tbaa !2
   br label %for.body
@@ -40,8 +40,6 @@ for.end: ; preds = %for.body
   ret i32 undef
 }
 
-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.module.flags = !{!0}
 !llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll b/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll
index 2348490265e1b..2d06f4298baed 100644
--- a/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/select-extractelement-inseltpoison.ll
@@ -208,6 +208,3 @@ define <4 x i32> @extract_cond_type_mismatch(<4 x i32> %x, <4 x i32> %y, <5 x i1
   %r = select i1 %cond, <4 x i32> %x, <4 x i32> %y
   ret <4 x i32> %r
 }
-
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/InstCombine/select-extractelement.ll b/llvm/test/Transforms/InstCombine/select-extractelement.ll
index 621d278dee6e5..6f80b7d021715 100644
--- a/llvm/test/Transforms/InstCombine/select-extractelement.ll
+++ b/llvm/test/Transforms/InstCombine/select-extractelement.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -passes=instcombine < %s | FileCheck %s
 
-declare void @v4float_user(<4 x float>) #0
+declare void @v4float_user(<4 x float>)
 
-define float @extract_one_select(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
+define float @extract_one_select(<4 x float> %a, <4 x float> %b, i32 %c) {
 ; CHECK-LABEL: @extract_one_select(
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -17,7 +17,7 @@ define float @extract_one_select(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
 }
 
 ; Multiple extractelements
-define <2 x float> @extract_two_select(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
+define <2 x float> @extract_two_select(<4 x float> %a, <4 x float> %b, i32 %c) {
 ; CHECK-LABEL: @extract_two_select(
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -34,7 +34,7 @@ define <2 x float> @extract_two_select(<4 x float> %a, <4 x float> %b, i32 %c) #
 }
 
 ; Select has an extra non-extractelement user, don't change it
-define float @extract_one_select_user(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
+define float @extract_one_select_user(<4 x float> %a, <4 x float> %b, i32 %c) {
 ; CHECK-LABEL: @extract_one_select_user(
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -49,7 +49,7 @@ define float @extract_one_select_user(<4 x float> %a, <4 x float> %b, i32 %c) #0
   ret float %extract
 }
 
-define float @extract_one_vselect_user(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define float @extract_one_vselect_user(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
 ; CHECK-LABEL: @extract_one_vselect_user(
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq <4 x i32> [[C:%.*]], zeroinitializer
 ; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -67,7 +67,7 @@ define float @extract_one_vselect_user(<4 x float> %a, <4 x float> %b, <4 x i32>
 ; Do not convert the vector select into a scalar select. That would increase
 ; the instruction count and potentially obfuscate a vector min/max idiom.
 
-define float @extract_one_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define float @extract_one_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
 ; CHECK-LABEL: @extract_one_vselect(
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq <4 x i32> [[C:%.*]], zeroinitializer
 ; CHECK-NEXT: [[SELECT:%.*]] = select <4 x i1> [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -81,7 +81,7 @@ define float @extract_one_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c)
 }
 
 ; Multiple extractelements from a vector select
-define <2 x float> @extract_two_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <2 x float> @extract_two_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
 ; CHECK-LABEL: @extract_two_vselect(
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq <4 x i32> [[C:%.*]], zeroinitializer
 ; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[CMP_NOT]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]]
@@ -100,7 +100,7 @@ define <2 x float> @extract_two_vselect(<4 x float> %a, <4 x float> %b, <4 x i32
 ; The vector selects are not decomposed into scalar selects because that would increase
 ; the instruction count. Extract+insert is converted to non-lane-crossing shuffles.
 ; Test multiple extractelements
-define <4 x float> @simple_vector_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+define <4 x float> @simple_vector_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
 ; CHECK-LABEL: @simple_vector_select(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[C:%.*]], i64 0
@@ -232,5 +232,3 @@ define i32 @inf_loop_partial_undef(<2 x i1> %a, <2 x i1> %b, <2 x i32> %x, <2 x
   %t11 = extractelement <2 x i32> %p, i32 0
   ret i32 %t11
 }
-
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/JumpThreading/ddt-crash3.ll b/llvm/test/Transforms/JumpThreading/ddt-crash3.ll
index edaade329e9ce..26ba85755523d 100644
--- a/llvm/test/Transforms/JumpThreading/ddt-crash3.ll
+++ b/llvm/test/Transforms/JumpThreading/ddt-crash3.ll
@@ -9,9 +9,9 @@ target triple = "x86_64-unknown-linux-gnu"
 @global.2 = external local_unnamed_addr global i64, align 8
 
 ; Function Attrs: norecurse noreturn nounwind uwtable
-define void @hoge() local_unnamed_addr #0 {
+define void @hoge() local_unnamed_addr {
 ; CHECK-LABEL: define void @hoge(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr {
 ; CHECK-NEXT: [[BB:.*:]]
 ; CHECK-NEXT: br label %[[BB1:.*]]
 ; CHECK: [[BB1]]:
@@ -48,8 +48,6 @@ bb27: ; preds = %bb1
   br label %bb26
 }
 
-attributes #0 = { norecurse noreturn nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 7.0.0 "}
diff --git a/llvm/test/Transforms/LICM/volatile-alias.ll b/llvm/test/Transforms/LICM/volatile-alias.ll
index 410f3becc5bac..35e88445ff759 100644
--- a/llvm/test/Transforms/LICM/volatile-alias.ll
+++ b/llvm/test/Transforms/LICM/volatile-alias.ll
@@ -10,7 +10,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Function Attrs: nounwind uwtable
-define i32 @foo(ptr dereferenceable(4) nonnull %p, ptr %q, i32 %n) #0 {
+define i32 @foo(ptr dereferenceable(4) nonnull %p, ptr %q, i32 %n) {
 entry:
   %p.addr = alloca ptr, align 8
   %q.addr = alloca ptr, align 8
@@ -51,5 +51,3 @@ for.end: ; preds = %for.cond
   %8 = load i32, ptr %s, align 4
   ret i32 %8
 }
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopRotate/noalias.ll b/llvm/test/Transforms/LoopRotate/noalias.ll
index f709bba1cb4b1..d6b2414b426d8 100644
--- a/llvm/test/Transforms/LoopRotate/noalias.ll
+++ b/llvm/test/Transforms/LoopRotate/noalias.ll
@@ -146,11 +146,7 @@ for.end: ; preds = %for.cond
 }
 
 ; Function Attrs: inaccessiblememonly nounwind
-declare void @llvm.experimental.noalias.scope.decl(metadata) #1
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" 
"disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { inaccessiblememonly nounwind } -attributes #2 = { nounwind readnone speculatable } +declare void @llvm.experimental.noalias.scope.decl(metadata) !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll index 54233686feb87..25ded4bc64c6f 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll @@ -157,4 +157,4 @@ bb: br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph } -attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hawaii" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hawaii" "use-soft-float"="false" } diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/pr17473.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/pr17473.ll index 6c34d1b390cce..a878fc2c39e82 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/X86/pr17473.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/X86/pr17473.ll @@ -62,6 +62,6 @@ for.end: ; preds = %fn3.exit ; Function Attrs: nounwind optsize declare i32 @printf(ptr nocapture readonly, ...) #1 -attributes #0 = { nounwind optsize ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind optsize ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } +attributes #1 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } attributes #2 = { nounwind optsize } diff --git a/llvm/test/Transforms/LoopStrengthReduce/pr18165.ll b/llvm/test/Transforms/LoopStrengthReduce/pr18165.ll index 914ea7aabe0d1..9c20685669374 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/pr18165.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/pr18165.ll @@ -71,8 +71,8 @@ fn1.exit: ; preds = %lor.end.i ; Function Attrs: nounwind optsize declare i32 @printf(ptr nocapture readonly, ...) 
#1 -attributes #0 = { nounwind optsize ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind optsize ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } +attributes #1 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" } attributes #2 = { nounwind optsize } !llvm.ident = !{!0} diff --git a/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll b/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll index 3364465898643..37e2f68e89a6f 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll @@ -46,7 +46,7 @@ for.body3: ; preds = %for.body3, %for.bod br i1 %exitcond, label %for.cond.loopexit, label %for.body3 } -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" } !0 = !{!1, !2, i64 16} !1 = !{!"planet", !2, i64 0, !2, i64 8, !2, i64 16, !2, i64 24, !2, i64 32, !2, i64 40, !2, i64 48} diff --git a/llvm/test/Transforms/LoopUnroll/runtime-epilog-debuginfo.ll b/llvm/test/Transforms/LoopUnroll/runtime-epilog-debuginfo.ll index ee28aa1b0008b..df4dfa6a2101b 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-epilog-debuginfo.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-epilog-debuginfo.ll @@ -76,7 +76,7 @@ lee1.exit: ; preds = %lee1.exit.loopexit, ; Function Attrs: nounwind readnone declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 -attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+neon,+strict-align,+vfp3,-crypto,-fp-armv8,-fp16,-vfp4" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" 
"no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+neon,+strict-align,+vfp3,-crypto,-fp-armv8,-fp16,-vfp4" "use-soft-float"="false" } attributes #1 = { nounwind readnone } !llvm.dbg.cu = !{!0} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll index 27ca4143b5be5..199203a9f5cb0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll @@ -86,7 +86,8 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 { ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 16: 41 +; CHECK: Cost of 1 for VF 16: EXPRESSION vp<%11> = ir<%sum> + partial.reduce.add (mul nuw nsw (ir<%1> zext to i64), (ir<%0> zext to i64)) +; CHECK: Cost for VF 16: 3 ; CHECK: LV: Selecting VF: 16 entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll index 2d1543185098f..8109d0683fe71 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll @@ -6,8 +6,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-none-unknown-elf" define i32 @dotp(ptr %a, ptr %b) #0 { -; CHECK-REGS-VP-NOT: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers -; CHECK-REGS-VP: LV: Selecting VF: vscale x 8. +; CHECK-REGS-VP: LV: Selecting VF: vscale x 16. 
; ; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 8 because it uses too many registers ; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll index 3dfa6df3313a5..287226f14b753 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll @@ -31,9 +31,9 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] ; CHECK-NEON-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]]) @@ -52,37 +52,34 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP13:%.*]] 
= sext [[WIDE_LOAD]] to -; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to -; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-NEXT: [[TMP17:%.*]] = add [[VEC_PHI]], [[TMP16]] -; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] -; CHECK-SVE-NEXT: [[TMP19]] = sub [[TMP17]], [[TMP18]] -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) +; CHECK-SVE-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP6]] +; CHECK-SVE-NEXT: [[TMP12:%.*]] = sub <16 x i32> zeroinitializer, [[TMP11]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) +; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: @@ -114,10 +111,10 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP16]]) -; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP13]], [[TMP11]] ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP18]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] @@ -184,9 +181,9 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; 
CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -221,9 +218,9 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 ; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) +; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-SVE-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP5]] ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP10]]) ; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -262,10 +259,10 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP16]]) -; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP13]], [[TMP11]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP17]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -331,11 +328,11 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) -; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> 
[[TMP7]], [[TMP13]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -352,37 +349,34 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to -; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] -; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] -; CHECK-SVE-NEXT: [[TMP19]] = add [[TMP17]], [[TMP18]] -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP5]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) +; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> 
[[TMP3]], [[TMP11]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) +; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: @@ -414,11 +408,11 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] ; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw zeroinitializer, [[TMP16]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP17]]) -; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP12]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP18]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -486,11 +480,11 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) -; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP16]] ; CHECK-NEON-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -508,37 +502,35 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; 
CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to -; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] -; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] -; CHECK-SVE-NEXT: [[TMP19]] = sub [[TMP17]], [[TMP18]] -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP5]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) +; CHECK-SVE-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP10]] +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 
@llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: @@ -570,11 +562,11 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] ; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw zeroinitializer, [[TMP16]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP17]]) -; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP12]] ; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = sub zeroinitializer, [[TMP18]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP19]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] @@ -644,12 +636,12 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) -; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP15]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]]) -; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP15]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP12]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -683,9 +675,9 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 ; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) +; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-SVE-NEXT: 
[[TMP11:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP5]] ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]]) ; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP4]], [[TMP5]] @@ -726,12 +718,12 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP16]]) -; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP13]], [[TMP11]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP17]]) -; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP14]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP14]], [[TMP11]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE3]], [[TMP18]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -802,13 +794,13 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) -; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP13]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]]) -; CHECK-NEON-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP13]] ; CHECK-NEON-NEXT: [[TMP15:%.*]] = sub <16 x i32> zeroinitializer, [[TMP14]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP15]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -826,39 +818,37 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 
[[TMP0]], 2 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to -; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] -; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] -; CHECK-SVE-NEXT: [[TMP19:%.*]] = add [[TMP17]], [[TMP18]] -; CHECK-SVE-NEXT: [[TMP20:%.*]] = mul nsw [[TMP14]], [[TMP15]] -; CHECK-SVE-NEXT: [[TMP21]] = sub [[TMP19]], [[TMP20]] -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP5]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) +; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP11]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]]) +; CHECK-SVE-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP4]], [[TMP11]] +; CHECK-SVE-NEXT: [[TMP10:%.*]] = sub <16 x i32> zeroinitializer, [[TMP14]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP10]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq 
i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP21]]) +; CHECK-SVE-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: @@ -890,13 +880,13 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] ; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw zeroinitializer, [[TMP16]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP17]]) -; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP12]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP18]]) -; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = mul nsw [[TMP14]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = mul nsw [[TMP14]], [[TMP12]] ; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE3]], [[TMP20]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] @@ -969,9 +959,9 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP9]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1005,9 +995,9 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 ; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], 
<16 x i32> [[TMP6]]) +; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP5]]) ; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-SVE-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1045,9 +1035,9 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP16]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP15]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1110,8 +1100,8 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 { ; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-NEON-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-NEON-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) +; CHECK-NEON-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1142,8 +1132,8 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 { ; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 ; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 ; CHECK-SVE-NEXT: [[TMP2:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]]) +; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP3]]) ; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-SVE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1178,8 +1168,8 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP8]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD1]] to ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = 
call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP11]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD1]] to ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE2]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP12]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1240,10 +1230,10 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]]) +; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP10]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -1276,10 +1266,10 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 ; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 ; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) +; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]]) ; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -1316,10 +1306,10 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP8]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP15]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = 
mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]] ; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP16]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll index b033f6051f812..b430efc9e5283 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll @@ -467,3 +467,83 @@ loop: exit: ret i32 %red.next } + +define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) { +; CHECK-LABEL: define i64 @partial_reduction_mul_two_users( +; CHECK-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i16 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT]] to <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], [[TMP1]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v8i64(<4 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i32> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[PARTIAL_REDUCE]]) +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP6]], i32 7 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: 
[[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RES1:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[LOAD_EXT_EXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RES2:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[LOAD:%.*]] = load i16, ptr [[A]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[B]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[CONV]], [[CONV]] +; CHECK-NEXT: [[MUL_EXT:%.*]] = zext i32 [[MUL]] to i64 +; CHECK-NEXT: [[ADD]] = add i64 [[RES2]], [[MUL_EXT]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[MUL]], [[C]] +; CHECK-NEXT: [[LOAD_EXT:%.*]] = sext i16 [[LOAD]] to i32 +; CHECK-NEXT: [[LOAD_EXT_EXT]] = sext i32 [[LOAD_EXT]] to i64 +; CHECK-NEXT: [[EXITCOND740_NOT:%.*]] = icmp eq i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[ADD_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %res1 = phi i64 [ 0, %entry ], [ %load.ext.ext, %loop ] + %res2 = phi i64 [ 0, %entry ], [ %add, %loop ] + %load = load i16, ptr %a, align 2 + %iv.next = add i64 %iv, 1 + %conv = sext i16 %b to i32 + %mul = mul i32 %conv, %conv + %mul.ext = zext i32 %mul to i64 + %add = add i64 %res2, %mul.ext + %second_use = or i32 %mul, %c ; this value is otherwise unused, but that's sufficient for the test + %load.ext = sext i16 %load to i32 + %load.ext.ext = sext i32 %load.ext to i64 + %exitcond740.not = icmp eq i64 %iv, %n + br i1 %exitcond740.not, label %exit, label %loop + +exit: + ret i64 %add +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll index 8ece59aef90a9..d8f1a86c9ebda 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -16,10 +16,10 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 ; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]] ; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -68,7 +68,6 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[IV_NEXT:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> 
zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -77,11 +76,12 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i8> poison, i8 [[TMP2]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT2]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT3]] to <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = sext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[IV_NEXT]] @@ -89,7 +89,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IDX_NEG]], [[IV_NEXT]] ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF4:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ] @@ -112,7 +112,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[TMP13]] = add <4 x i32> [[TMP14]], [[VEC_PHI9]] ; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX9]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) ; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] @@ -136,7 +136,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[CMP_IV_NEG:%.*]] = icmp ugt i64 [[IV_NEG]], 0 ; CHECK-NEXT: [[CMP_IV:%.*]] = icmp ne i64 [[ACCUM1]], -1 ; CHECK-NEXT: [[EXITCOND:%.*]] = and i1 [[CMP_IV_NEG]], [[CMP_IV]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: while.end.loopexit: ; CHECK-NEXT: [[RESULT:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY1]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], 
[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret void @@ -495,7 +495,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) ; CHECK-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-NEXT: br label [[EXIT:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll index 09b41fb551775..26e630f969ef3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll @@ -26,26 +26,26 @@ define i32 @sudot(ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 1 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP10]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP7:%.*]] = zext [[WIDE_LOAD2]] to ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]] ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP13]], align 1 ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP17]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD3]] to -; CHECK-NEXT: [[TMP18:%.*]] = sext [[WIDE_LOAD4]] to -; CHECK-NEXT: [[TMP14:%.*]] = mul [[TMP12]], [[TMP11]] -; CHECK-NEXT: [[TMP19:%.*]] = mul [[TMP18]], [[TMP7]] -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP14]]) -; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI1]], [[TMP19]]) +; CHECK-NEXT: [[TMP18:%.*]] = sext [[WIDE_LOAD3]] to +; CHECK-NEXT: [[TMP11:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-NEXT: [[TMP12:%.*]] = mul [[TMP18]], [[TMP11]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP12]]) +; CHECK-NEXT: [[TMP19:%.*]] = sext [[WIDE_LOAD4]] to +; CHECK-NEXT: [[TMP14:%.*]] = zext [[WIDE_LOAD2]] to +; CHECK-NEXT: [[TMP20:%.*]] = mul [[TMP19]], [[TMP14]] +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI1]], [[TMP20]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -62,8 +62,8 @@ 
define i32 @sudot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NOI8MM: vector.body: ; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] ; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 @@ -71,25 +71,25 @@ define i32 @sudot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 1 ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP10]], align 1 ; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD2]] to +; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = zext [[WIDE_LOAD2]] to ; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3 ; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]] ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP13]], align 1 ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP17]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP18:%.*]] = sext [[WIDE_LOAD3]] to -; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = sext [[WIDE_LOAD4]] to -; CHECK-NOI8MM-NEXT: [[TMP20:%.*]] = mul [[TMP18]], [[TMP11]] -; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = mul [[TMP19]], [[TMP12]] -; CHECK-NOI8MM-NEXT: [[TMP22]] = add [[TMP20]], [[VEC_PHI]] -; CHECK-NOI8MM-NEXT: [[TMP23]] = add [[TMP21]], [[VEC_PHI1]] +; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD3]] to +; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = sext [[WIDE_LOAD4]] to +; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul [[TMP12]], [[TMP11]] +; CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul [[TMP21]], [[TMP7]] +; CHECK-NOI8MM-NEXT: [[TMP18]] = add [[TMP14]], [[VEC_PHI]] +; CHECK-NOI8MM-NEXT: [[TMP20]] = add [[TMP22]], [[VEC_PHI1]] ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-NOI8MM: middle.block: -; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add [[TMP23]], [[TMP22]] -; CHECK-NOI8MM-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add [[TMP20]], [[TMP18]] +; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[BIN_RDX]]) ; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK-NOI8MM: scalar.ph: @@ -137,26 +137,26 @@ define i32 @usdot(ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 1 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP10]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = sext [[WIDE_LOAD]] to -; 
CHECK-NEXT: [[TMP7:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]] ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP13]], align 1 ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP17]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD3]] to -; CHECK-NEXT: [[TMP18:%.*]] = zext [[WIDE_LOAD4]] to -; CHECK-NEXT: [[TMP14:%.*]] = mul [[TMP12]], [[TMP11]] -; CHECK-NEXT: [[TMP19:%.*]] = mul [[TMP18]], [[TMP7]] -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP14]]) -; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI1]], [[TMP19]]) +; CHECK-NEXT: [[TMP18:%.*]] = zext [[WIDE_LOAD3]] to +; CHECK-NEXT: [[TMP11:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-NEXT: [[TMP12:%.*]] = mul [[TMP18]], [[TMP11]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP12]]) +; CHECK-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD4]] to +; CHECK-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-NEXT: [[TMP20:%.*]] = mul [[TMP19]], [[TMP14]] +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI1]], [[TMP20]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -173,8 +173,8 @@ define i32 @usdot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NOI8MM: vector.body: ; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] ; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 @@ -182,25 +182,25 @@ define i32 @usdot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 1 ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP10]], align 1 ; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 
@llvm.vscale.i64() ; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3 ; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]] ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP13]], align 1 ; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP17]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP18:%.*]] = zext [[WIDE_LOAD3]] to -; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD4]] to -; CHECK-NOI8MM-NEXT: [[TMP20:%.*]] = mul [[TMP18]], [[TMP11]] -; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = mul [[TMP19]], [[TMP12]] -; CHECK-NOI8MM-NEXT: [[TMP22]] = add [[TMP20]], [[VEC_PHI]] -; CHECK-NOI8MM-NEXT: [[TMP23]] = add [[TMP21]], [[VEC_PHI1]] +; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD3]] to +; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = zext [[WIDE_LOAD4]] to +; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul [[TMP12]], [[TMP11]] +; CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul [[TMP21]], [[TMP7]] +; CHECK-NOI8MM-NEXT: [[TMP18]] = add [[TMP14]], [[VEC_PHI]] +; CHECK-NOI8MM-NEXT: [[TMP20]] = add [[TMP22]], [[VEC_PHI1]] ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-NOI8MM: middle.block: -; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add [[TMP23]], [[TMP22]] -; CHECK-NOI8MM-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add [[TMP20]], [[TMP18]] +; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[BIN_RDX]]) ; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK-NOI8MM: scalar.ph: @@ -242,18 +242,18 @@ define i32 @sudot_neon(ptr %a, ptr %b) #1 { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 16 ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[TMP8]], [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) -; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]]) +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; 
CHECK-NEXT: [[TMP14:%.*]] = mul <16 x i32> [[TMP10]], [[TMP8]] +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP14]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -337,18 +337,18 @@ define i32 @usdot_neon(ptr %a, ptr %b) #1 { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 16 ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[TMP8]], [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) -; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]]) +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = mul <16 x i32> [[TMP10]], [[TMP8]] +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP14]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index 801eb810d8625..b84763142b686 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -18,10 +18,10 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 
x i8>, ptr [[TMP4]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -47,17 +47,17 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = mul <16 x i32> [[TMP9]], [[TMP4]] -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP10]], [[TMP5]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP7]], [[TMP10]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -80,10 +80,10 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 ; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -720,26 +720,26 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: 
[[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) @@ -788,59 +788,59 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVED-NEXT: 
[[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP14]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP19]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP20]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP20]]) +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP17]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP21]]) ; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP26]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP27]] +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP28]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP29]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP30]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP30]]) +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul nsw <16 x i32> [[TMP27]], [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> 
[[TMP29]]) ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP32]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP36]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <16 x i32> [[TMP33]], [[TMP37]] +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul nsw <16 x i32> [[TMP34]], [[TMP56]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP39]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP40]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP40]]) +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP33]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP37]]) ; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP42]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP46]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = mul nsw <16 x i32> [[TMP43]], [[TMP47]] +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = mul nsw <16 x i32> [[TMP44]], [[TMP48]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP49]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> 
@llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP50]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP50]]) +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = mul nsw <16 x i32> [[TMP43]], [[TMP41]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP45]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -884,26 +884,26 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) ; CHECK-MAXBW-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 -; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) ; CHECK-MAXBW-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) ; CHECK-MAXBW-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 -; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] ; 
CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) @@ -2025,7 +2025,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 @@ -2062,7 +2062,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]] ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) @@ -2091,7 +2091,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 1ace7d44125b9..4636c1b63da82 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -18,11 +18,11 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX1]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]] +; 
CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP2]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1024 @@ -47,17 +47,17 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP20]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]] -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP28]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP28]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP28]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP8]]) +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP9]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1024 @@ -75,26 +75,26 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, 
[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1 ; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1 -; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP20]], [[TMP13]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP22]]) +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul <vscale x 16 x i32> [[TMP4]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP6]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE5]]) +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK-MAXBW: scalar.ph: @@ -134,10 +134,10 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b ; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP1]], [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP0]], [[TMP1]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -157,54 +157,78 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +;
CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE6:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE14:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 32 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 48 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP1]], [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]]) -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 32 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 48 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP18]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul nuw nsw <16 x i64> [[TMP2]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP4]]) +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <16 x i64> [[TMP5]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE6]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP7]]) +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD10]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <16 x i8> 
[[WIDE_LOAD6]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nuw nsw <16 x i64> [[TMP12]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI2]], <16 x i64> [[TMP14]]) +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD11]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <16 x i8> [[WIDE_LOAD7]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nuw nsw <16 x i64> [[TMP15]], [[TMP16]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE14]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI3]], <16 x i64> [[TMP17]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX15:%.*]] = add <2 x i64> [[PARTIAL_REDUCE13]], [[BIN_RDX]] +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX16:%.*]] = add <2 x i64> [[PARTIAL_REDUCE14]], [[BIN_RDX15]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX16]]) ; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] ; CHECK-INTERLEAVED: for.exit: -; CHECK-INTERLEAVED-NEXT: ret i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: ret i64 [[TMP9]] ; ; CHECK-MAXBW-LABEL: define i64 @not_dotp_i8_to_i64_has_neon_dotprod( ; CHECK-MAXBW-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]] -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]] ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64> -; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP1]], align 1 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i64> -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP13]], [[TMP11]] -; CHECK-MAXBW-NEXT: [[TMP15]] = add <vscale x 8 x i64> [[TMP14]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label
[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1 +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i64> [[TMP0]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP2]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP15]]) -; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: ret i64 [[TMP4]] ; entry: br label %for.body @@ -245,10 +269,10 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly % ; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX1]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[NEXT_GEP2]], align 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i64> -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP1]], [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP0]], [[TMP1]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP2]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -269,30 +293,50 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly % ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE14:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE15:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64
[[OFFSET_IDX]] ; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX2]] ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 24 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[WIDE_LOAD4]] to <8 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 24 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[NEXT_GEP3]], align 2 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD5]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i16>, ptr [[TMP18]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i16>, ptr [[TMP19]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[WIDE_LOAD5]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul nuw nsw <8 x i64> [[TMP2]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP4]]) ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD6]] to <8 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul nuw nsw <8 x i64> [[TMP4]], [[TMP1]] -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[TMP5]], [[TMP2]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP6]]) +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <8 x i16> [[WIDE_LOAD4]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw nsw <8 x i64> [[TMP5]], [[TMP6]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI1]], <8 x i64> [[TMP7]]) -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <8 x i16> [[WIDE_LOAD11]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <8 x i16> [[WIDE_LOAD7]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nuw nsw <8 x i64> [[TMP12]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE14]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI2]], <8 x i64> [[TMP14]]) +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <8 x i16> [[WIDE_LOAD12]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: 
[[TMP16:%.*]] = zext <8 x i16> [[WIDE_LOAD8]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nuw nsw <8 x i64> [[TMP15]], [[TMP16]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE15]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI3]], <8 x i64> [[TMP17]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE7]], [[PARTIAL_REDUCE]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX16:%.*]] = add <2 x i64> [[PARTIAL_REDUCE14]], [[BIN_RDX]] +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX17:%.*]] = add <2 x i64> [[PARTIAL_REDUCE15]], [[BIN_RDX16]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX17]]) ; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] ; CHECK-INTERLEAVED: for.exit: ; CHECK-INTERLEAVED-NEXT: ret i64 [[TMP9]] @@ -302,36 +346,28 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly % ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 2 -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = mul i64 [[N_VEC]], 2 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]] ; CHECK-MAXBW-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-MAXBW-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX1]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[NEXT_GEP]], align 2 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64> -; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i16>, ptr [[NEXT_GEP2]], align 2 -; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD3]] to <vscale x 4 x i64> -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP15]], [[TMP13]] -; CHECK-MAXBW-NEXT: [[TMP17]] = add <vscale x 4 x i64> [[TMP16]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[NEXT_GEP2]], align 2 +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext
<8 x i16> [[WIDE_LOAD3]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP0]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP2]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP17]]) -; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: ret i64 [[TMP4]] ; entry: br label %for.body @@ -687,7 +723,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP138]]) ; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] @@ -835,7 +871,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8 @@ -964,7 +1000,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8 @@ -1024,26 +1060,26 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-INTERLEAVE1-NEXT:
[[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP12]], [[TMP23]] +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP12]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP13]]) ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP15]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP16]]) ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP18]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP19]]) ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP21]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP22]]) @@ -1092,58 +1128,58 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP43]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVED-NEXT: 
[[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP15]]) ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP17]]) +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP17]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP18]]) ; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP23]] -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP24]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP25]]) +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP22]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP23]]) +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP25]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> 
[[VEC_PHI5]], <16 x i32> [[TMP26]]) ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP30]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP31]] -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP48]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE22]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP33]]) +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP28]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP30]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE22]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP31]]) +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP48]], [[TMP33]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE23]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP35]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = mul nsw <16 x i32> [[TMP36]], [[TMP39]] -; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul nsw <16 x i32> [[TMP37]], [[TMP40]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE28]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP41]]) +; 
CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP36]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <16 x i32> [[TMP37]], [[TMP38]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE28]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP39]]) +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul nsw <16 x i32> [[TMP40]], [[TMP41]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE29]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP42]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1165,21 +1201,21 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 ; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP1]] ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI4:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI7:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 1 @@ -1191,38 +1227,38 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT:
[[TMP15:%.*]] = or disjoint i64 [[INDEX]], 3 ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1 -; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1 -; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD9]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 8 x i32> [[TMP29]], [[TMP23]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE11]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI7]], <vscale x 8 x i32> [[TMP31]]) -; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD12]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[WIDE_LOAD14:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1 -; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD14]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = mul nsw <vscale x 8 x i32> [[TMP37]], [[TMP43]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI6]], <vscale x 8 x i32> [[TMP45]]) -; CHECK-MAXBW-NEXT: [[WIDE_LOAD18:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1 -; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD18]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[WIDE_LOAD20:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1 -; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD20]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = mul nsw <vscale x 8 x i32> [[TMP51]], [[TMP57]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI5]], <vscale x 8 x i32> [[TMP59]]) -; CHECK-MAXBW-NEXT: [[WIDE_LOAD24:%.*]] = load <vscale x 8 x i8>, ptr [[TMP16]], align 1 -; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD24]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[WIDE_LOAD26:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD26]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[TMP73:%.*]] = mul nsw <vscale x 8 x i32> [[TMP65]], [[TMP71]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI4]], <vscale x 8 x i32> [[TMP73]]) +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = mul nsw <vscale x 16 x i32> [[TMP27]], [[TMP32]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI3]], <vscale x 16 x i32> [[TMP33]]) +; CHECK-MAXBW-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1 +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD5]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD6]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul nsw <vscale x 16 x i32> [[TMP18]], [[TMP19]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE7]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI2]], <vscale x 16 x i32> [[TMP20]]) +; CHECK-MAXBW-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1 +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD8]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD9]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = mul nsw <vscale x 16 x i32> [[TMP21]], [[TMP22]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE10]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP23]]) +; CHECK-MAXBW-NEXT: [[WIDE_LOAD11:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1 +;
CHECK-MAXBW-NEXT: [[TMP24:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD11]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD12]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <vscale x 16 x i32> [[TMP24]], [[TMP25]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP26]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE16]]) -; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE17]]) -; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]]) -; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE11]]) +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE13]]) +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE10]]) +; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE7]]) +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: @@ -1390,7 +1426,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true -; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: br label [[EXIT:%.*]] @@ -1525,7 +1561,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP24]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP24]]) ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() @@ -1565,110 +1601,93 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]] -;
CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDEX]], 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD1]] to <vscale x 2 x i64> -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP13]], [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add <vscale x 2 x i64> [[VEC_PHI]], [[TMP14]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul nuw nsw <16 x i64> [[TMP3]], [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP15]]) -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK-INTERLEAVE1: scalar.ph: ; ; CHECK-INTERLEAVED-LABEL: define i64 @dotp_cost_disagreement( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 41, [[TMP1]] -; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -;
CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP9]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[INDEX]], 1 ; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP14]] -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i64 [[TMP18]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP15]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP20]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP21]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 2 x i64> [[VEC_PHI]], [[TMP22]] -; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add <vscale x 2 x i64> [[VEC_PHI1]], [[TMP23]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP15]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul nuw nsw <16 x i64> [[TMP5]], [[TMP6]] +;
CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP13]]) +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP8]], [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP10]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]] -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK-INTERLEAVED: scalar.ph: ; ; CHECK-MAXBW-LABEL: define i64 @dotp_cost_disagreement( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 ; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 41, [[TMP1]] ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]] ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64> +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1 ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDEX]], 1 ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1 -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i64> -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP8]], [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP14]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP13]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] =
zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i64> +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i64> +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = mul nuw nsw <vscale x 16 x i64> [[TMP12]], [[TMP8]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP9]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP14]]) +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: @@ -1971,7 +1990,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2104,7 +2123,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2160,10 +2179,10 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP6]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP5]], [[TMP6]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64>
[[VEC_PHI]], <16 x i64> [[TMP10]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -2181,10 +2200,10 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]] ; CHECK-INTERLEAVED: for.body.preheader: ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 ; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 32 ; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]] @@ -2194,19 +2213,29 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE6:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP2]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP6]], [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]]) -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[NEXT_GEP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nuw nsw <16 x i64> [[TMP13]], [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP16]]) +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: 
[[TMP12:%.*]] = mul nuw nsw <16 x i64> [[TMP10]], [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE6]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP12]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: @@ -2219,35 +2248,35 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-MAXBW: for.body.preheader: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 ; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 16 ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32 ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]] ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]] -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = insertelement <vscale x 8 x i64> zeroinitializer, i64 [[COST]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0 ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP12]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64> -; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP1]], align 1 -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i64> -; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP16]], [[TMP14]] -; CHECK-MAXBW-NEXT: [[TMP20]] = add <vscale x 8 x i64> [[TMP17]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[NEXT_GEP1]], align 1 +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i64> +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] =
zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = mul nuw nsw [[TMP14]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64( [[VEC_PHI]], [[TMP11]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64( [[TMP20]]) +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: @@ -2319,17 +2348,16 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ [[TMP1]], [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE21:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ [[TMP1]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE20:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE19:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE18:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE15:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; 
CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3 ; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_VEC:%.*]] = load <128 x i8>, ptr [[TMP12]], align 1 @@ -2341,42 +2369,43 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP44]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP13]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP44]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[STRIDED_VEC9]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP31]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP24]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP27]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP34]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: 
[[TMP26:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP33]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP36]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP44]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP19]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP44]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP22]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP44]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP25]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP26]], [[TMP44]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP28]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP44]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP31]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP44]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP36]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP33]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP30]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP27]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP24]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP21]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP18]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]]) +; 
CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: @@ -2429,7 +2458,6 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3 ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]] ; CHECK-INTERLEAVED-NEXT: [[WIDE_VEC:%.*]] = load <128 x i8>, ptr [[TMP12]], align 1 @@ -2441,42 +2469,43 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]]) -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP13]]) +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP44]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> 
[[VEC_PHI6]], <16 x i32> [[TMP16]]) ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[STRIDED_VEC9]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP18]]) -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]]) -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]]) -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP24]]) -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP27]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP19]]) +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP22]]) +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP25]]) +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP26]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP28]]) +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP45]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> 
[[TMP31]]) +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) -; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]]) -; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]]) -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]]) -; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]]) -; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]]) -; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]]) +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]]) +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]]) +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]]) +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]]) +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: @@ -2529,7 +2558,6 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]] ; CHECK-MAXBW-NEXT: [[WIDE_VEC:%.*]] = load <128 x i8>, ptr [[TMP12]], align 1 @@ -2541,42 +2569,43 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: [[STRIDED_VEC12:%.*]] 
= shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> ; CHECK-MAXBW-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> ; CHECK-MAXBW-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]]) -; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP44]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP13]]) +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP44]] ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE15]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]]) ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[STRIDED_VEC9]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP18]]) -; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]]) -; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]]) -; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP24]]) -; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) -; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32> -; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP27]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP44]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP19]]) +; 
CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP44]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP22]]) +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP44]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP25]]) +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP26]], [[TMP44]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP28]]) +; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP45]], [[TMP44]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP31]]) +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP44]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) -; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) -; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]]) -; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]]) -; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]]) -; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]]) -; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]]) -; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) +; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) +; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]]) +; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]]) +; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]]) +; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]]) +; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> 
[[PARTIAL_REDUCE15]]) +; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll index b308b925181b1..bd9fae6cd610b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll @@ -23,12 +23,12 @@ define i32 @partial_reduce_with_non_constant_start_value(ptr %src, i32 %rdx.star ; IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16 ; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; IC2-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; IC2-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; IC2-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; IC2-NEXT: [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP4]], [[TMP4]] +; IC2-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; IC2-NEXT: [[TMP7:%.*]] = mul nuw nsw <16 x i32> [[TMP5]], [[TMP5]] -; IC2-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) -; IC2-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP7]]) +; IC2-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) +; IC2-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; IC2-NEXT: [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP8]], [[TMP8]] +; IC2-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP6]]) ; IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; IC2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IC2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -80,18 +80,18 @@ define i32 @partial_reduce_with_non_constant_start_value(ptr %src, i32 %rdx.star ; IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 ; IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; IC4-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; IC4-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; IC4-NEXT: [[TMP13:%.*]] = mul nuw nsw <16 x i32> [[TMP9]], [[TMP9]] +; IC4-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP13]]) ; IC4-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; IC4-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> -; IC4-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> -; IC4-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i32> [[TMP6]], [[TMP6]] ; IC4-NEXT: [[TMP11:%.*]] = mul nuw nsw <16 x i32> [[TMP7]], [[TMP7]] -; IC4-NEXT: [[TMP12:%.*]] = mul nuw nsw <16 x i32> [[TMP8]], [[TMP8]] -; IC4-NEXT: [[TMP13:%.*]] = mul nuw nsw <16 x i32> [[TMP9]], [[TMP9]] -; IC4-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> 
@llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) ; IC4-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]]) +; IC4-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> +; IC4-NEXT: [[TMP12:%.*]] = mul nuw nsw <16 x i32> [[TMP10]], [[TMP10]] ; IC4-NEXT: [[PARTIAL_REDUCE8]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP12]]) -; IC4-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP13]]) +; IC4-NEXT: [[TMP14:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> +; IC4-NEXT: [[TMP16:%.*]] = mul nuw nsw <16 x i32> [[TMP14]], [[TMP14]] +; IC4-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) ; IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; IC4-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IC4-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll index 7bb47157d7cf2..6dae09ef97e1c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll @@ -18,11 +18,11 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP2]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = sub <16 x i32> zeroinitializer, [[TMP4]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -48,19 +48,19 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP14]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP14]], 
i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP15]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sub <16 x i32> zeroinitializer, [[TMP8]] -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub <16 x i32> zeroinitializer, [[TMP9]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sub <16 x i32> zeroinitializer, [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP15]]) +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub <16 x i32> zeroinitializer, [[TMP10]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -78,27 +78,27 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: br label [[ENTRY:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[ENTRY]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[ENTRY]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP10]], align 1 -; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD1]] to -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul [[TMP12]], [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = sub zeroinitializer, [[TMP13]] -; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call 
@llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP14]]) +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul [[TMP4]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub zeroinitializer, [[TMP6]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP8]]) ; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK-MAXBW: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll index 70532ad6586c7..46ec858d7455c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll @@ -45,8 +45,8 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 { ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP4]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -138,8 +138,8 @@ define i32 @zext_add_reduc_i8_i32_neon(ptr %a) #2 { ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP4]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -227,8 +227,8 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 { ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = 
load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP4]]) +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP5]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -321,8 +321,8 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 { ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP4]]) +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI1]], <8 x i64> [[TMP5]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -421,12 +421,12 @@ define i32 @zext_add_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 { ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP5]]) +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE8]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP10]]) +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP7]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -811,8 +811,8 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 { ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-INTERLEAVED-NEXT: 
[[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP5]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -1000,11 +1000,10 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 16 ; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] -; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 -; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] @@ -1012,7 +1011,8 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] ; CHECK-INTERLEAVE1-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] @@ -1031,23 +1031,23 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 32 ; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0 ; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 ; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> 
[[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16 ; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1 ; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP6]], align 1 -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP3]]) +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP5]]) ; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add nuw i32 [[VEC_PHI1]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP22]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] @@ -1071,11 +1071,10 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 16 ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[C]], i64 0 -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i32 [[D]], [[N_VEC]] ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = insertelement zeroinitializer, i32 [[A]], i32 0 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[BROADCAST_SPLAT]] to +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[C]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] @@ -1083,6 +1082,7 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = add i32 
[[D]], [[INDEX]] ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] ; CHECK-MAXBW-NEXT: store zeroinitializer, ptr [[TMP10]], align 1 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[BROADCAST_SPLAT]] to ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP9]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -1164,12 +1164,12 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 { ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 ; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD4]] to <4 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP15]]) +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD4]] to <4 x i64> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI1]], <4 x i64> [[TMP5]]) +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE8]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI2]], <4 x i64> [[TMP6]]) +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64> ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE9]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI3]], <4 x i64> [[TMP7]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll index ebf4a4fabb605..ab1486da794f7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll @@ -37,7 +37,7 @@ for.end: ; preds = %for.body ret i32 %conv27 } -attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" } !llvm.ident = !{!0} !0 = !{!"clang"} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll index 25ee10077fb47..70685c1c3fe12 100644 --- 
a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll @@ -192,7 +192,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK: LV(REG): VF = 16 ; CHECK-NEXT: LV(REG): Found max usage: 2 item ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 9 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 12 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 6 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index 786a2aab6d0e7..28d2a278de498 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -1630,4 +1630,4 @@ for.body: ; preds = %for.body, %entry } attributes #1 = { "target-features"="+sve" vscale_range(1, 16) } -attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) } +attributes #0 = { "target-features"="+sve" vscale_range(1, 16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll index d4e5dea3d4aab..49f663f5703b6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll @@ -23,18 +23,15 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi vp<[[RDX_START]]>, ir<[[REDUCE:%.+]]> (VF scaled by 1/4) +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi vp<[[RDX_START]]>, vp<[[REDUCE:%.+]]> (VF scaled by 1/4) ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a> ; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]> -; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 ; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b> ; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]> -; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 -; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> -; CHECK-NEXT: PARTIAL-REDUCE ir<[[REDUCE]]> = add ir<[[ACC]]>, ir<%mul> +; CHECK-NEXT: EXPRESSION vp<[[REDUCE]]> = ir<[[ACC]]> + partial.reduce.add (mul (ir<%load.b> zext to i32), (ir<%load.a> zext to i32)) ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors @@ -42,7 +39,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]> +; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, vp<[[REDUCE]]> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<[[VEC_TC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph @@ -89,10 +86,10 @@ define i32 
; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<[[RDX_START]]>, ir<%add> (VF scaled by 1/4)
; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[EP_IV]]>
; CHECK-NEXT: WIDEN ir<%load.a> = load ir<%gep.a>
-; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[EP_IV]]>
; CHECK-NEXT: WIDEN ir<%load.b> = load ir<%gep.b>
; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
+; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%accum>, ir<%mul>
; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16>
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
index 0f398a69a4bc7..25799183b1a02 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
@@ -327,5 +327,5 @@ for.end: ; preds = %for.body, %entry
declare float @fabsf(float)
-attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
index 0a9b1e0af48bc..61e3a1848ceed 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
@@ -60,10 +60,10 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
-; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; ZVQDOTQ-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -125,17 +125,17 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
-; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
+; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP12]]
; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -222,10 +222,10 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
-; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; ZVQDOTQ-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -287,17 +287,17 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
-; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
+; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP12]]
; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -384,10 +384,10 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 {
; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
-; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; ZVQDOTQ-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -449,18 +449,18 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
+; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
-; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
-; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
+; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP11]])
+; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = mul <8 x i32> [[TMP10]], [[TMP8]]
+; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP14]])
; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -545,10 +545,10 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
-; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; ZVQDOTQ-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
@@ -610,18 +610,18 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
-; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
-; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
+; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
-; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
-; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
+; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP11]])
+; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = mul <8 x i32> [[TMP10]], [[TMP8]]
+; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP14]])
; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/partial-reduce-accumulate.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/partial-reduce-accumulate.ll
new file mode 100644
index 0000000000000..2338da5339354
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/partial-reduce-accumulate.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s -S | FileCheck %s
+; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s -S | FileCheck %s --check-prefix=CHECK-MAX-BANDWIDTH
+
+target triple = "wasm32"
+
+define hidden i32 @accumulate_add_u8_u8(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) {
+; CHECK-LABEL: define hidden i32 @accumulate_add_u8_u8(
+; CHECK-SAME: ptr noundef readonly [[A:%.*]], ptr noundef readonly [[B:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[WIDE_LOAD1]] to <4 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[VEC_PHI]], [[TMP3]]
+; CHECK-NEXT: [[TMP8]] = add <4 x i32> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[ADD3:%.*]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[ADD3]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i32
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i32 [[IV]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP12]] to i32
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[RED]], [[CONV]]
+; CHECK-NEXT: [[ADD3]] = add i32 [[ADD]], [[CONV2]]
+; CHECK-NEXT: [[INC]] = add nuw i32 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+; CHECK-MAX-BANDWIDTH-LABEL: define hidden i32 @accumulate_add_u8_u8(
+; CHECK-MAX-BANDWIDTH-SAME: ptr noundef readonly [[A:%.*]], ptr noundef readonly [[B:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-MAX-BANDWIDTH-NEXT: [[ENTRY:.*]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16
+; CHECK-MAX-BANDWIDTH-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-MAX-BANDWIDTH: [[VECTOR_PH]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 16
+; CHECK-MAX-BANDWIDTH-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; CHECK-MAX-BANDWIDTH-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-MAX-BANDWIDTH: [[VECTOR_BODY]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i32 [[INDEX]]
+; CHECK-MAX-BANDWIDTH-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i32 [[INDEX]]
+; CHECK-MAX-BANDWIDTH-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-MAX-BANDWIDTH-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-MAX-BANDWIDTH-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]])
+; CHECK-MAX-BANDWIDTH-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAX-BANDWIDTH-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-MAX-BANDWIDTH: [[MIDDLE_BLOCK]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE2]])
+; CHECK-MAX-BANDWIDTH-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; CHECK-MAX-BANDWIDTH-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; CHECK-MAX-BANDWIDTH: [[SCALAR_PH]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-MAX-BANDWIDTH: [[FOR_COND_CLEANUP]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[ADD3:%.*]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: ret i32 [[RESULT_0_LCSSA]]
+; CHECK-MAX-BANDWIDTH: [[FOR_BODY]]:
+; CHECK-MAX-BANDWIDTH-NEXT: [[IV:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: [[RED:%.*]] = phi i32 [ [[ADD3]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-MAX-BANDWIDTH-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i32 [[IV]]
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-MAX-BANDWIDTH-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i32
+; CHECK-MAX-BANDWIDTH-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i32 [[IV]]
+; CHECK-MAX-BANDWIDTH-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-MAX-BANDWIDTH-NEXT: [[CONV2:%.*]] = zext i8 [[TMP12]] to i32
+; CHECK-MAX-BANDWIDTH-NEXT: [[ADD:%.*]] = add i32 [[RED]], [[CONV]]
+; CHECK-MAX-BANDWIDTH-NEXT: [[ADD3]] = add i32 [[ADD]], [[CONV2]]
+; CHECK-MAX-BANDWIDTH-NEXT: [[INC]] = add nuw i32 [[IV]], 1
+; CHECK-MAX-BANDWIDTH-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-MAX-BANDWIDTH-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret i32 %add3
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %red = phi i32 [ %add3, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ %arrayidx1 = getelementptr inbounds nuw i8, ptr %b, i32 %iv
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv2 = zext i8 %1 to i32
+ %add = add i32 %red, %conv
+ %add3 = add i32 %add, %conv2
+ %inc = add nuw i32 %iv, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index 9168ebf6335ab..08d39ea038586 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -214,7 +214,7 @@ for.end15: ; preds = %for.end.us, %entry
ret void
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
!3 = !{!4, !5}
!4 = !{!4}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll b/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
index 44e9d3e426f45..b7f8be1802de1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
@@ -71,6 +71,6 @@ declare i32 @printf(ptr, ...) #1
; Function Attrs: nounwind
declare i32 @puts(ptr nocapture readonly) #2
-attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" }
+attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "use-soft-float"="false" }
attributes #2 = { nounwind }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
index f066000fe9f66..52e90e4475208 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
@@ -362,4 +362,4 @@ for.body:
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
-attributes #0 = { "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "unsafe-fp-math"="false" }
+attributes #0 = { "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index 005696af67b5a..e405fe7c6f764 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -335,4 +335,4 @@ for.body: ; preds = %for.body.preheader,
br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit99
}
-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll
index b98a2ea54ffe1..c7550ca016e76 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll
@@ -143,7 +143,7 @@ for.inc:
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !11
}
-attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
index 2c187caaf93c7..9471e4a71e8c8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
@@ -72,7 +72,7 @@ for.end: ; preds = %for.body, %entry
ret void, !dbg !27
}
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
diff --git a/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll b/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll
index 8ce2a979b1e46..0656de476f482 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll
@@ -50,7 +50,7 @@ for.body: ; preds = %for.body.preheader,
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !9, !llvm.loop !18
}
-attributes #0 = { norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
index 65bd36cfa0853..5e51624f28eba 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
@@ -134,7 +134,7 @@ for.cond.cleanup:
ret void, !dbg !44
}
-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
index 4b7b714a25628..77ec95a3d8ef9 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
@@ -145,7 +145,7 @@ for.body: ; preds = %entry, %for.body
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !43, !llvm.loop !55
}
-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
diff --git a/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll b/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll
index 5cf99b8998603..34873319176d1 100644
--- a/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll
@@ -135,7 +135,7 @@ thread-pre-split5: ; preds = %.lr.ph
ret void
}
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
!llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 8efe29a54d913..fc2e2337e0569 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -783,7 +783,7 @@ for.body: ; preds = %for.body, %entry
@SA = common global i32 0, align 4
@SB = common global float 0.000000e+00, align 4
-define void @int_float_struct(ptr nocapture readonly %A) #0 {
+define void @int_float_struct(ptr nocapture readonly %A) {
; CHECK-LABEL: @int_float_struct(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
@@ -1546,5 +1546,3 @@ loop:
end:
ret void
}
-
-attributes #0 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/Transforms/LoopVectorize/metadata-width.ll b/llvm/test/Transforms/LoopVectorize/metadata-width.ll
index ddf90294d5bfc..22243d96d1b42 100644
--- a/llvm/test/Transforms/LoopVectorize/metadata-width.ll
+++ b/llvm/test/Transforms/LoopVectorize/metadata-width.ll
@@ -89,7 +89,7 @@ for.end: ; preds = %for.body, %entry
ret void
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
!0 = !{!0, !1, !5}
!1 = !{!"llvm.loop.vectorize.width", i32 8}
diff --git a/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll b/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll
index a1fc1b8f34ff3..87b5a0b3553e4 100644
--- a/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll
+++ b/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll
@@ -56,4 +56,4 @@ for.end: ; preds = %for.body
ret i32 0
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll
index 705bbabfcf1df..64ae730eb938e 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll
@@ -218,4 +218,4 @@ for.end: ; preds = %for.body, %entry
ret void
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
index 1effb1061a3dd..872a39298c0f2 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
@@ -129,4 +129,4 @@ for.end: ; preds = %for.body
ret void
}
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll b/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
index 8224d6b5df9f4..137e098b91260 100644
--- a/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
@@ -104,8 +104,8 @@ for.end26: ; preds = %for.cond4.for.end26
}
declare i32 @fn2(double) #1
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "use-soft-float"="false" }
!0 = !{!"int", !1}
!1 = !{!"omnipotent char", !2}
diff --git a/llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll b/llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll
index e2f500753bbdc..cc187dec37709 100644
--- a/llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll
+++ b/llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll
@@ -35,11 +35,7 @@ if.end5: ; preds = %if.then, %entry
ret i1 %rez.0
}
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind }
+declare void @llvm.lifetime.end.p0(ptr nocapture)
diff --git a/llvm/test/Transforms/MergeICmps/X86/pr41917.ll b/llvm/test/Transforms/MergeICmps/X86/pr41917.ll
index d4fd34afc1452..d8065e869ba9c 100644
--- a/llvm/test/Transforms/MergeICmps/X86/pr41917.ll
+++ b/llvm/test/Transforms/MergeICmps/X86/pr41917.ll
@@ -7,13 +7,13 @@ target triple = "i386-pc-windows-msvc19.11.0"
%class.a = type { i32, i32, i32, i32, i32 }
; Function Attrs: nounwind optsize
-define dso_local zeroext i1 @pr41917(ptr byval(%class.a) nocapture readonly align 4 %g, ptr byval(%class.a) nocapture readonly align 4 %p2) local_unnamed_addr #0 {
+define dso_local zeroext i1 @pr41917(ptr byval(%class.a) nocapture readonly align 4 %g, ptr byval(%class.a) nocapture readonly align 4 %p2) local_unnamed_addr {
; CHECK-LABEL: @pr41917(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call zeroext i1 @f2() #[[ATTR3:[0-9]+]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call zeroext i1 @f2()
; CHECK-NEXT: br i1 [[CALL]], label [[LAND_RHS:%.*]], label %"land.end+land.rhs3"
; CHECK: land.rhs:
-; CHECK-NEXT: [[CALL1:%.*]] = tail call zeroext i1 @f2() #[[ATTR3]]
+; CHECK-NEXT: [[CALL1:%.*]] = tail call zeroext i1 @f2()
; CHECK-NEXT: br label %"land.end+land.rhs3"
; CHECK: "land.end+land.rhs3":
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[CLASS_A:%.*]], ptr [[G:%.*]], i32 0, i32 1
@@ -25,11 +25,11 @@ define dso_local zeroext i1 @pr41917(ptr byval(%class.a) nocapture readonly alig
; CHECK-NEXT: ret i1 [[TMP2]]
;
entry:
- %call = tail call zeroext i1 @f2() #2
+ %call = tail call zeroext i1 @f2()
br i1 %call, label %land.rhs, label %land.end
land.rhs: ; preds = %entry
- %call1 = tail call zeroext i1 @f2() #2
+ %call1 = tail call zeroext i1 @f2()
br label %land.end
land.end: ; preds = %land.rhs, %entry
@@ -53,11 +53,7 @@ land.end6: ; preds = %land.rhs3, %land.en
ret i1 %4
}
-declare dso_local zeroext i1 @f2() local_unnamed_addr #1
-
-attributes #0 = { nounwind optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind optsize }
+declare dso_local zeroext i1 @f2() local_unnamed_addr
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll b/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll
index 1cf9fd9009564..12b41845f26a8 100644
--- a/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll
+++ b/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll
@@ -4,7 +4,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
;; Function Attrs: nounwind ssp uwtable
;; We should eliminate the sub, and one of the phi nodes
-define void @vnum_test1(ptr %data) #0 {
+define void @vnum_test1(ptr %data) {
; CHECK-LABEL: @vnum_test1(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 3
@@ -74,7 +74,7 @@ bb19: ; preds = %bb4
;; We should eliminate the sub, one of the phi nodes, prove the store of the sub
;; and the load of data are equivalent, that the load always produces constant 0, and
;; delete the load replacing it with constant 0.
-define i32 @vnum_test2(ptr %data) #0 {
+define i32 @vnum_test2(ptr %data) {
; CHECK-LABEL: @vnum_test2(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 3
@@ -142,11 +142,10 @@ bb21: ; preds = %bb4
ret i32 %p.0
}
-
; Function Attrs: nounwind ssp uwtable
;; Same as test 2, with a conditional store of m-n, so it has to also discover
;; that data ends up with the same value no matter what branch is taken.
-define i32 @vnum_test3(ptr %data) #0 {
+define i32 @vnum_test3(ptr %data) {
; CHECK-LABEL: @vnum_test3(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 3
@@ -305,7 +304,6 @@ bb3: ; preds = %bb2
%tmp3 = sub i32 %tmp, %phi2
ret i32 %tmp3
}
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0, !0, !0}
diff --git a/llvm/test/Transforms/NewGVN/cond_br2-xfail.ll b/llvm/test/Transforms/NewGVN/cond_br2-xfail.ll
index 8b2d662e484fc..b6da86bd430f8 100644
--- a/llvm/test/Transforms/NewGVN/cond_br2-xfail.ll
+++ b/llvm/test/Transforms/NewGVN/cond_br2-xfail.ll
@@ -10,7 +10,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
%"union.llvm::SmallVectorBase::U" = type { x86_fp80 }
; Function Attrs: ssp uwtable
-define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
+define void @_Z4testv() personality ptr @__gxx_personality_v0 {
; CHECK: @_Z4testv()
; CHECK: invoke.cont:
; CHECK: br i1 true, label %new.notnull.i11, label %if.end.i14
@@ -18,7 +18,7 @@ define void @_Z4testv() #0 personality ptr @__gxx_personality_v0 {
entry:
 %sv = alloca %"class.llvm::SmallVector", align 16
- call void @llvm.lifetime.start.p0(ptr %sv) #1
+ call void @llvm.lifetime.start.p0(ptr %sv)
 %FirstEl.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", ptr %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 3
 store ptr %FirstEl.i.i.i.i.i.i, ptr %sv, align 16, !tbaa !4
 %EndX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", ptr %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 1
@@ -83,11 +83,11 @@ invoke.cont3: ; preds = %invoke.cont2
 br i1 %cmp.i.i.i.i19, label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21, label %if.then.i.i.i20
if.then.i.i.i20: ; preds = %invoke.cont3
- call void @free(ptr %5) #1
+ call void @free(ptr %5)
 br label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21
_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21: ; preds = %invoke.cont3, %if.then.i.i.i20
- call void @llvm.lifetime.end.p0(ptr %sv) #1
+ call void @llvm.lifetime.end.p0(ptr %sv)
 ret void
lpad: ; preds = %if.end.i14, %if.end.i, %invoke.cont2
@@ -98,7 +98,7 @@ lpad: ; preds = %if.end
 br i1 %cmp.i.i.i.i, label %eh.resume, label %if.then.i.i.i
if.then.i.i.i: ; preds = %lpad
- call void @free(ptr %7) #1
+ call void @free(ptr %7)
 br label %eh.resume
eh.resume: ; preds = %if.then.i.i.i, %lpad
@@ -106,24 +106,19 @@ eh.resume: ; preds = %if.then.i.i.i, %lpa
}
; Function Attrs: nounwind
-declare void @llvm.lifetime.start.p0(ptr nocapture) #1
+declare void @llvm.lifetime.start.p0(ptr nocapture)
declare i32 @__gxx_personality_v0(...)
-declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(ptr) #2
+declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(ptr)
; Function Attrs: nounwind
-declare void @llvm.lifetime.end.p0(ptr nocapture) #1
+declare void @llvm.lifetime.end.p0(ptr nocapture)
-declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(ptr, i64, i64) #2
+declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(ptr, i64, i64)
; Function Attrs: nounwind
-declare void @free(ptr nocapture) #3
-
-attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-attributes #2 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+declare void @free(ptr nocapture)
!0 = !{!"any pointer", !1}
!1 = !{!"omnipotent char", !2}
diff --git a/llvm/test/Transforms/NewGVN/equivalent-phi.ll b/llvm/test/Transforms/NewGVN/equivalent-phi.ll
index ba4fc14fa2feb..388b94125f560 100644
--- a/llvm/test/Transforms/NewGVN/equivalent-phi.ll
+++ b/llvm/test/Transforms/NewGVN/equivalent-phi.ll
@@ -8,7 +8,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
;; one set of indexing calculations and a load
; Function Attrs: nounwind ssp uwtable
-define i32 @bar(i32 %arg, i32 %arg1, i32 %arg2) #0 {
+define i32 @bar(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-LABEL: @bar(
; CHECK-NEXT: bb:
; CHECK-NEXT: br label [[BB3:%.*]]
@@ -59,8 +59,6 @@ bb20: ; preds = %bb17
ret i32 %tmp14
}
-attributes #0 = { nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/NewGVN/memory-handling.ll b/llvm/test/Transforms/NewGVN/memory-handling.ll
index f83d145167c75..71e90419f0bf5 100644
--- a/llvm/test/Transforms/NewGVN/memory-handling.ll
+++ b/llvm/test/Transforms/NewGVN/memory-handling.ll
@@ -282,10 +282,10 @@ declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1) #2
; Function Attrs: inlinehint nounwind readonly uwtable
declare i32 @tolower(i32) local_unnamed_addr #3
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind readnone }
attributes #2 = { argmemonly nounwind }
-attributes #3 = { inlinehint nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { inlinehint nounwind readonly uwtable }
attributes #4 = { nounwind readnone }
attributes #5 = { nounwind readonly }
diff --git a/llvm/test/Transforms/NewGVN/pr31483.ll b/llvm/test/Transforms/NewGVN/pr31483.ll
index 82e9a2ab286ee..c1fb8366c15e5 100644
--- a/llvm/test/Transforms/NewGVN/pr31483.ll
+++ b/llvm/test/Transforms/NewGVN/pr31483.ll
@@ -6,7 +6,7 @@ target datalayout = "E-m:e-i64:64-n32:64"
;; Ensure we do not believe the indexing increments are unreachable due to incorrect memory
;; equivalence detection. In PR31483, we were deleting those blocks as unreachable
; Function Attrs: nounwind
-define signext i32 @ham(ptr %arg, ptr %arg1) #0 {
+define signext i32 @ham(ptr %arg, ptr %arg1) {
; CHECK-LABEL: @ham(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8
@@ -89,12 +89,7 @@ bb23: ; preds = %bb2
ret i32 undef
}
-declare signext i32 @zot(ptr, ...) #1
+declare signext i32 @zot(ptr, ...)
; Function Attrs: nounwind
-declare void @llvm.va_end(ptr) #2
-
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
+declare void @llvm.va_end(ptr)
diff --git a/llvm/test/Transforms/NewGVN/pr31501.ll b/llvm/test/Transforms/NewGVN/pr31501.ll
index 353c693f2a29b..b2ba42b25ad86 100644
--- a/llvm/test/Transforms/NewGVN/pr31501.ll
+++ b/llvm/test/Transforms/NewGVN/pr31501.ll
@@ -49,9 +49,9 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
%struct.wombat.28 = type <{ ptr, i8, i8, [6 x i8] }>
; Function Attrs: norecurse nounwind ssp uwtable
-define weak_odr hidden ptr @quux(ptr %arg, ptr %arg1) local_unnamed_addr #0 align 2 {
+define weak_odr hidden ptr @quux(ptr %arg, ptr %arg1) local_unnamed_addr align 2 {
; CHECK-LABEL: define weak_odr hidden ptr @quux(
-; CHECK-SAME: ptr [[ARG:%.*]], ptr [[ARG1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 {
+; CHECK-SAME: ptr [[ARG:%.*]], ptr [[ARG1:%.*]]) local_unnamed_addr align 2 {
; CHECK-NEXT: [[BB:.*]]:
; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_BARNEY:%.*]], ptr [[ARG]], i64 0, i32 3, i32 0, i32 0, i32 0
; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !tbaa [[ANYPTR_TBAA2:![0-9]+]]
@@ -112,8 +112,6 @@ bb21: ; preds = %bb19, %bb
ret ptr %tmp22
}
-attributes #0 = { norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/Transforms/NewGVN/pr33187.ll b/llvm/test/Transforms/NewGVN/pr33187.ll
index 969f172cba956..61eb7a526e6db 100644
--- a/llvm/test/Transforms/NewGVN/pr33187.ll
+++ b/llvm/test/Transforms/NewGVN/pr33187.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
;; Ensure we don't change after value numbering by accidentally deleting the wrong expression.
; RUN: opt -passes=newgvn -S %s | FileCheck %s
-define void @fn1(i1 %arg) local_unnamed_addr #0 {
+define void @fn1(i1 %arg) local_unnamed_addr {
; CHECK-LABEL: @fn1(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_COND_PREHEADER:%.*]]
@@ -89,8 +89,7 @@ if.end18: ; preds = %L, %while.body12
br label %while.cond10
}
-
-define void @hoge() local_unnamed_addr #0 {
+define void @hoge() local_unnamed_addr {
; CHECK-LABEL: @hoge(
; CHECK-NEXT: bb:
; CHECK-NEXT: br label [[BB1:%.*]]
@@ -108,9 +107,6 @@ bb1: ; preds = %bb1, %bb
br label %bb1
}
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-
define void @a(i1 %arg) {
; CHECK-LABEL: @a(
; CHECK-NEXT: b:
@@ -143,4 +139,3 @@ f: ; preds = %d
%e = getelementptr i8, ptr %i, i64 1
br label %d
}
-
diff --git a/llvm/test/Transforms/NewGVN/pr33305.ll b/llvm/test/Transforms/NewGVN/pr33305.ll
index e742f14249c7c..ff645f8804a28 100644
--- a/llvm/test/Transforms/NewGVN/pr33305.ll
+++ b/llvm/test/Transforms/NewGVN/pr33305.ll
@@ -16,9 +16,9 @@ target triple = "x86_64-apple-macosx10.12.0"
@str.2 = private unnamed_addr constant [8 x i8] c"Screwed\00"
; Function Attrs: nounwind optsize ssp uwtable
-define i32 @main() local_unnamed_addr #0 {
+define i32 @main() local_unnamed_addr {
; CHECK-LABEL: define i32 @main(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[DOTPR_I:%.*]] = load i32, ptr @c, align 4, !tbaa [[INT_TBAA3:![0-9]+]]
; CHECK-NEXT: [[CMP13_I:%.*]] = icmp slt i32 [[DOTPR_I]], 1
@@ -77,7 +77,7 @@ define i32 @main() local_unnamed_addr #0 {
; CHECK-NEXT: br i1 [[TOBOOL]], label %[[IF_END:.*]], label %[[IF_THEN:.*]]
; CHECK: [[IF_THEN]]:
; CHECK-NEXT: [[PUTS2:%.*]] = tail call i32 @puts(ptr @str.2)
-; CHECK-NEXT: tail call void @abort() #[[ATTR3:[0-9]+]]
+; CHECK-NEXT: tail call void @abort()
; CHECK-NEXT: unreachable
; CHECK: [[IF_END]]:
; CHECK-NEXT: [[PUTS:%.*]] = tail call i32 @puts(ptr @str)
@@ -153,7 +153,7 @@ fn1.exit: ; preds = %if.then.i, %for.end
if.then: ; preds = %fn1.exit
 %puts2 = tail call i32 @puts(ptr @str.2)
- tail call void @abort() #3
+ tail call void @abort()
 unreachable
if.end: ; preds = %fn1.exit
@@ -162,15 +162,10 @@ if.end: ; preds = %fn1.exit
}
; Function Attrs: noreturn nounwind optsize
-declare void @abort() local_unnamed_addr #1
+declare void @abort() local_unnamed_addr
; Function Attrs: nounwind
-declare i32 @puts(ptr nocapture readonly) local_unnamed_addr #2
-
-attributes #0 = { nounwind optsize ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { noreturn nounwind optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-attributes #3 = { noreturn nounwind optsize }
+declare i32 @puts(ptr nocapture readonly) local_unnamed_addr
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/Transforms/NewGVN/pr34430.ll b/llvm/test/Transforms/NewGVN/pr34430.ll
index 490ba433305d5..62d5770167f95 100644
--- a/llvm/test/Transforms/NewGVN/pr34430.ll
+++ b/llvm/test/Transforms/NewGVN/pr34430.ll
@@ -4,7 +4,7 @@ source_filename = "bugpoint-output-e4c7d0f.bc"
; Make sure we still properly resolve phi cycles when they involve predicateinfo copies of phis.
-define void @hoge(i1 %arg) local_unnamed_addr #0 {
+define void @hoge(i1 %arg) local_unnamed_addr {
; CHECK-LABEL: @hoge(
; CHECK-NEXT: bb:
; CHECK-NEXT: br i1 %arg, label [[BB6:%.*]], label [[BB1:%.*]]
@@ -41,8 +41,6 @@ bb6: ; preds = %bb4, %bb2, %bb1, %b
br label %bb4
}
-attributes #0 = { norecurse noreturn nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
!llvm.ident = !{!0}
!0 = !{!"clang version 6.0.0"}
diff --git a/llvm/test/Transforms/NewGVN/pr34452.ll b/llvm/test/Transforms/NewGVN/pr34452.ll
index 48bdd88e9591a..7c38147c1ac9a 100644
--- a/llvm/test/Transforms/NewGVN/pr34452.ll
+++ b/llvm/test/Transforms/NewGVN/pr34452.ll
@@ -6,9 +6,9 @@ source_filename = "bugpoint-output-09f7a24.bc"
@WHOLELINE = external local_unnamed_addr global i32, align 4
; Function Attrs: nounwind uwtable
-define void @sgrep() local_unnamed_addr #0 {
+define void @sgrep() local_unnamed_addr {
; CHECK-LABEL: define void @sgrep(
-; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @WHOLELINE, align 4, !tbaa [[INT_TBAA1:![0-9]+]]
; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP0]], 0
@@ -36,10 +36,7 @@ while.body.us: ; preds = %while.body.us, %ent
}
; Function Attrs: nounwind readnone speculatable
-declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) #1
-
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone speculatable }
+declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64)
!llvm.ident = !{!0}
diff --git a/llvm/test/Transforms/OpenMP/dead_use.ll b/llvm/test/Transforms/OpenMP/dead_use.ll
index 1c4b2c6fe27a6..ad0f91ccdedb4 100644
--- a/llvm/test/Transforms/OpenMP/dead_use.ll
+++ b/llvm/test/Transforms/OpenMP/dead_use.ll
@@ -6,9 +6,9 @@
@0 = private unnamed_addr global
%struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @.str }, align 8 ; Function Attrs: nounwind uwtable -define dso_local i32 @b() #0 { +define dso_local i32 @b() { ; CHECK-LABEL: define dso_local i32 @b( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: ) { ; CHECK-NEXT: [[TMP1:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @a() ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 @@ -21,9 +21,9 @@ define dso_local i32 @b() #0 { } ; Function Attrs: nounwind uwtable -define internal i32 @a() #0 { +define internal i32 @a() { ; CHECK-LABEL: define internal i32 @a( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: ) { ; CHECK-NEXT: [[TMP1:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @b() ; CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB0:[0-9]+]], i32 0, ptr @.omp_outlined.) @@ -38,9 +38,9 @@ define internal i32 @a() #0 { } ; Function Attrs: norecurse nounwind uwtable -define internal void @.omp_outlined.(ptr noalias %0, ptr noalias %1) #1 { +define internal void @.omp_outlined.(ptr noalias %0, ptr noalias %1) { ; CHECK-LABEL: define internal void @.omp_outlined.( -; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr noalias [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr noalias [[TMP1:%.*]]) { ; CHECK-NEXT: [[TMP3:%.*]] = alloca ptr, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = alloca ptr, align 8 ; CHECK-NEXT: store ptr [[TMP0]], ptr [[TMP3]], align 8, !tbaa [[ANYPTR_TBAA2:![0-9]+]] @@ -55,11 +55,7 @@ define internal void @.omp_outlined.(ptr noalias %0, ptr noalias %1) #1 { } ; Function Attrs: nounwind -declare !callback !6 void @__kmpc_fork_call(ptr, i32, ptr, ...) #2 - -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind } +declare !callback !6 void @__kmpc_fork_call(ptr, i32, ptr, ...) 
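+; Explanatory note (an editorial addition, not part of the original test): the
+; !callback metadata on @__kmpc_fork_call models the fact that the runtime call
+; transparently invokes the outlined function passed as its third argument,
+; which lets interprocedural passes such as OpenMPOpt reason about
+; @.omp_outlined. even though it is only reached through the OpenMP runtime.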
!llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/llvm/test/Transforms/OpenMP/icv_remarks.ll b/llvm/test/Transforms/OpenMP/icv_remarks.ll index f76d4876a55de..3caa56f3cd415 100644 --- a/llvm/test/Transforms/OpenMP/icv_remarks.ll +++ b/llvm/test/Transforms/OpenMP/icv_remarks.ll @@ -14,55 +14,48 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK-DAG: remark: icv_remarks.c:12:0: OpenMP ICV nthreads Value: IMPLEMENTATION_DEFINED ; CHECK-DAG: remark: icv_remarks.c:12:0: OpenMP ICV active_levels Value: 0 ; CHECK-DAG: remark: icv_remarks.c:12:0: OpenMP ICV cancel Value: 0 -define dso_local void @foo(i32 %a) local_unnamed_addr #0 !dbg !17 { +define dso_local void @foo(i32 %a) local_unnamed_addr !dbg !17 { entry: %.kmpc_loc.addr = alloca %struct.ident_t, align 8 call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 8 dereferenceable(24) %.kmpc_loc.addr, ptr nonnull align 8 dereferenceable(24) @0, i64 16, i1 false) call void @llvm.dbg.value(metadata i32 %a, metadata !19, metadata !DIExpression()), !dbg !21 - tail call void @omp_set_num_threads(i32 %a) #1, !dbg !22 - %call = tail call i32 @omp_get_max_threads() #1, !dbg !23 + tail call void @omp_set_num_threads(i32 %a), !dbg !22 + %call = tail call i32 @omp_get_max_threads(), !dbg !23 call void @llvm.dbg.value(metadata i32 %call, metadata !20, metadata !DIExpression()), !dbg !21 - tail call void @use(i32 %call) #1, !dbg !24 + tail call void @use(i32 %call), !dbg !24 %0 = getelementptr inbounds %struct.ident_t, ptr %.kmpc_loc.addr, i64 0, i32 4, !dbg !25 store ptr @1, ptr %0, align 8, !dbg !25, !tbaa !26 - call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr nonnull %.kmpc_loc.addr, i32 0, ptr @.omp_outlined.) #1, !dbg !25 + call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr nonnull %.kmpc_loc.addr, i32 0, ptr @.omp_outlined.), !dbg !25 ret void, !dbg !32 } -declare !dbg !4 dso_local void @omp_set_num_threads(i32) local_unnamed_addr #1 +declare !dbg !4 dso_local void @omp_set_num_threads(i32) local_unnamed_addr -declare !dbg !9 dso_local i32 @omp_get_max_threads() local_unnamed_addr #1 +declare !dbg !9 dso_local i32 @omp_get_max_threads() local_unnamed_addr -declare !dbg !12 dso_local void @use(i32) local_unnamed_addr #2 +declare !dbg !12 dso_local void @use(i32) local_unnamed_addr ; CHECK-DAG: remark: icv_remarks.c:18:0: OpenMP ICV nthreads Value: IMPLEMENTATION_DEFINED ; CHECK-DAG: remark: icv_remarks.c:18:0: OpenMP ICV active_levels Value: 0 ; CHECK-DAG: remark: icv_remarks.c:18:0: OpenMP ICV cancel Value: 0 -define internal void @.omp_outlined.(ptr noalias nocapture readnone %.global_tid., ptr noalias nocapture readnone %.bound_tid.) #3 !dbg !33 { +define internal void @.omp_outlined.(ptr noalias nocapture readnone %.global_tid., ptr noalias nocapture readnone %.bound_tid.) 
!dbg !33 { entry: call void @llvm.dbg.value(metadata ptr %.global_tid., metadata !41, metadata !DIExpression()), !dbg !43 call void @llvm.dbg.value(metadata ptr %.bound_tid., metadata !42, metadata !DIExpression()), !dbg !43 - call void @llvm.dbg.value(metadata ptr undef, metadata !44, metadata !DIExpression()) #1, !dbg !50 - call void @llvm.dbg.value(metadata ptr undef, metadata !47, metadata !DIExpression()) #1, !dbg !50 - tail call void @omp_set_num_threads(i32 10) #1, !dbg !52 - %call.i = tail call i32 @omp_get_max_threads() #1, !dbg !53 - call void @llvm.dbg.value(metadata i32 %call.i, metadata !48, metadata !DIExpression()) #1, !dbg !54 - tail call void @use(i32 %call.i) #1, !dbg !55 + call void @llvm.dbg.value(metadata ptr undef, metadata !44, metadata !DIExpression()), !dbg !50 + call void @llvm.dbg.value(metadata ptr undef, metadata !47, metadata !DIExpression()), !dbg !50 + tail call void @omp_set_num_threads(i32 10), !dbg !52 + %call.i = tail call i32 @omp_get_max_threads(), !dbg !53 + call void @llvm.dbg.value(metadata i32 %call.i, metadata !48, metadata !DIExpression()), !dbg !54 + tail call void @use(i32 %call.i), !dbg !55 ret void, !dbg !56 } -declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #4 +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) -declare !callback !57 dso_local void @__kmpc_fork_call(ptr, i32, ptr, ...) local_unnamed_addr #1 +declare !callback !57 dso_local void @__kmpc_fork_call(ptr, i32, ptr, ...) local_unnamed_addr -declare void @llvm.dbg.value(metadata, metadata, metadata) #5 - -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind } -attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { argmemonly nounwind willreturn } -attributes #5 = { nounwind readnone speculatable willreturn } +declare void @llvm.dbg.value(metadata, metadata, metadata) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!13, !14, !15, !59} diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch-correct.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch-correct.ll index 0a1efdade760f..eb1cc5ec7fa6c 100644 --- 
a/llvm/test/Transforms/PGOProfile/misexpect-branch-correct.ll +++ b/llvm/test/Transforms/PGOProfile/misexpect-branch-correct.ll @@ -9,7 +9,6 @@ ; CHECK-NOT: remark: {{.*}} ; CHECK: !{!"branch_weights", i32 0, i32 200000} - ; ModuleID = 'misexpect-branch-correct.c' source_filename = "misexpect-branch-correct.c" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -19,14 +18,14 @@ target triple = "x86_64-unknown-linux-gnu" @outer_loop = constant i32 2000, align 4 ; Function Attrs: nounwind -define i32 @bar() #0 { +define i32 @bar() { entry: %rando = alloca i32, align 4 %x = alloca i32, align 4 - call void @llvm.lifetime.start.p0(ptr %rando) #4 + call void @llvm.lifetime.start.p0(ptr %rando) %call = call i32 (...) @buzz() store i32 %call, ptr %rando, align 4, !tbaa !3 - call void @llvm.lifetime.start.p0(ptr %x) #4 + call void @llvm.lifetime.start.p0(ptr %x) store i32 0, ptr %x, align 4, !tbaa !3 %0 = load i32, ptr %rando, align 4, !tbaa !3 %rem = srem i32 %0, 200000 @@ -52,31 +51,25 @@ if.else: ; preds = %entry if.end: ; preds = %if.else, %if.then %2 = load i32, ptr %x, align 4, !tbaa !3 - call void @llvm.lifetime.end.p0(ptr %x) #4 - call void @llvm.lifetime.end.p0(ptr %rando) #4 + call void @llvm.lifetime.end.p0(ptr %x) + call void @llvm.lifetime.end.p0(ptr %rando) ret i32 %2 } ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.start.p0(ptr nocapture) #1 +declare void @llvm.lifetime.start.p0(ptr nocapture) -declare i32 @buzz(...) #2 +declare i32 @buzz(...) ; Function Attrs: nounwind readnone willreturn -declare i64 @llvm.expect.i64(i64, i64) #3 +declare i64 @llvm.expect.i64(i64, i64) -declare i32 @baz(i32) #2 +declare i32 @baz(i32) -declare i32 @foo(i32) #2 +declare i32 @foo(i32) ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.end.p0(ptr nocapture) #1 - -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind willreturn } -attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind readnone willreturn } -attributes #4 = { nounwind } +declare void @llvm.lifetime.end.p0(ptr nocapture) !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch-overflow.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch-overflow.ll index 68b233edda524..339082580793b 100644 --- a/llvm/test/Transforms/PGOProfile/misexpect-branch-overflow.ll +++ b/llvm/test/Transforms/PGOProfile/misexpect-branch-overflow.ll @@ -16,14 +16,14 @@ target triple = "x86_64-unknown-linux-gnu" @outer_loop = constant i32 2000, align 4 ; Function Attrs: nounwind -define i32 @bar() #0 !dbg !6 { +define i32 @bar() !dbg !6 { entry: %rando = alloca i32, align 4 %x = alloca i32, align 4 - call void @llvm.lifetime.start.p0(ptr %rando) #4, !dbg 
!9 + call void @llvm.lifetime.start.p0(ptr %rando), !dbg !9 %call = call i32 (...) @buzz(), !dbg !9 store i32 %call, ptr %rando, align 4, !dbg !9, !tbaa !10 - call void @llvm.lifetime.start.p0(ptr %x) #4, !dbg !14 + call void @llvm.lifetime.start.p0(ptr %x), !dbg !14 store i32 0, ptr %x, align 4, !dbg !14, !tbaa !10 %0 = load i32, ptr %rando, align 4, !dbg !15, !tbaa !10 %rem = srem i32 %0, 200000, !dbg !15 @@ -49,31 +49,25 @@ if.else: ; preds = %entry if.end: ; preds = %if.else, %if.then %2 = load i32, ptr %x, align 4, !dbg !19, !tbaa !10 - call void @llvm.lifetime.end.p0(ptr %x) #4, !dbg !20 - call void @llvm.lifetime.end.p0(ptr %rando) #4, !dbg !20 + call void @llvm.lifetime.end.p0(ptr %x), !dbg !20 + call void @llvm.lifetime.end.p0(ptr %rando), !dbg !20 ret i32 %2, !dbg !19 } ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.start.p0(ptr nocapture) #1 +declare void @llvm.lifetime.start.p0(ptr nocapture) -declare i32 @buzz(...) #2 +declare i32 @buzz(...) ; Function Attrs: nounwind readnone willreturn -declare i64 @llvm.expect.i64(i64, i64) #3 +declare i64 @llvm.expect.i64(i64, i64) -declare i32 @baz(i32) #2 +declare i32 @baz(i32) -declare i32 @foo(i32) #2 +declare i32 @foo(i32) ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.end.p0(ptr nocapture) #1 - -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind willreturn } -attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind readnone willreturn } -attributes #4 = { nounwind } +declare void @llvm.lifetime.end.p0(ptr nocapture) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4} diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch-stripped.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch-stripped.ll index 2f188f5efd6fc..d34708c9d60d2 100644 --- a/llvm/test/Transforms/PGOProfile/misexpect-branch-stripped.ll +++ b/llvm/test/Transforms/PGOProfile/misexpect-branch-stripped.ll @@ -19,7 +19,6 @@ ; DISABLED-NOT: warning: :0:0: 19.98% ; DISABLED-NOT: remark: :0:0: Potential performance regression from use of the llvm.expect intrinsic: Annotation was correct on 19.98% (399668 / 2000000) of profiled executions. - ; ModuleID = 'misexpect-branch.c' source_filename = "misexpect-branch.c" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -29,14 +28,14 @@ target triple = "x86_64-unknown-linux-gnu" @outer_loop = constant i32 2000, align 4 ; Function Attrs: nounwind -define i32 @bar() #0 { +define i32 @bar() { entry: %rando = alloca i32, align 4 %x = alloca i32, align 4 - call void @llvm.lifetime.start.p0(ptr %rando) #4 + call void @llvm.lifetime.start.p0(ptr %rando) %call = call i32 (...) 
@buzz() store i32 %call, ptr %rando, align 4, !tbaa !3 - call void @llvm.lifetime.start.p0(ptr %x) #4 + call void @llvm.lifetime.start.p0(ptr %x) store i32 0, ptr %x, align 4, !tbaa !3 %0 = load i32, ptr %rando, align 4, !tbaa !3 %rem = srem i32 %0, 200000 @@ -62,31 +61,25 @@ if.else: ; preds = %entry if.end: ; preds = %if.else, %if.then %2 = load i32, ptr %x, align 4, !tbaa !3 - call void @llvm.lifetime.end.p0(ptr %x) #4 - call void @llvm.lifetime.end.p0(ptr %rando) #4 + call void @llvm.lifetime.end.p0(ptr %x) + call void @llvm.lifetime.end.p0(ptr %rando) ret i32 %2 } ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.start.p0(ptr nocapture) #1 +declare void @llvm.lifetime.start.p0(ptr nocapture) -declare i32 @buzz(...) #2 +declare i32 @buzz(...) ; Function Attrs: nounwind readnone willreturn -declare i64 @llvm.expect.i64(i64, i64) #3 +declare i64 @llvm.expect.i64(i64, i64) -declare i32 @baz(i32) #2 +declare i32 @baz(i32) -declare i32 @foo(i32) #2 +declare i32 @foo(i32) ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.end.p0(ptr nocapture) #1 - -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind willreturn } -attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind readnone willreturn } -attributes #4 = { nounwind } +declare void @llvm.lifetime.end.p0(ptr nocapture) !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch-unpredictable.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch-unpredictable.ll index 4add781d52071..a43e63226a1b0 100644 --- a/llvm/test/Transforms/PGOProfile/misexpect-branch-unpredictable.ll +++ b/llvm/test/Transforms/PGOProfile/misexpect-branch-unpredictable.ll @@ -7,7 +7,6 @@ ; CHECK-NOT: warning: {{.*}} ; CHECK-NOT: remark: {{.*}} - ; ModuleID = 'misexpect-branch-unpredictable.c' source_filename = "clang/test/Profile/misexpect-branch-unpredictable.c" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -17,14 +16,14 @@ target triple = "x86_64-unknown-linux-gnu" @outer_loop = constant i32 2000, align 4 ; Function Attrs: nounwind -define i32 @bar() #0 { +define i32 @bar() { entry: %rando = alloca i32, align 4 %x = alloca i32, align 4 - call void @llvm.lifetime.start.p0(ptr %rando) #3 + call void @llvm.lifetime.start.p0(ptr %rando) %call = call i32 (...) 
@buzz() store i32 %call, ptr %rando, align 4, !tbaa !2 - call void @llvm.lifetime.start.p0(ptr %x) #3 + call void @llvm.lifetime.start.p0(ptr %x) store i32 0, ptr %x, align 4, !tbaa !2 %0 = load i32, ptr %rando, align 4, !tbaa !2 %rem = srem i32 %0, 200000 @@ -49,27 +48,22 @@ if.else: ; preds = %entry if.end: ; preds = %if.else, %if.then %2 = load i32, ptr %x, align 4, !tbaa !2 - call void @llvm.lifetime.end.p0(ptr %x) #3 - call void @llvm.lifetime.end.p0(ptr %rando) #3 + call void @llvm.lifetime.end.p0(ptr %x) + call void @llvm.lifetime.end.p0(ptr %rando) ret i32 %2 } ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.start.p0(ptr nocapture) #1 +declare void @llvm.lifetime.start.p0(ptr nocapture) -declare i32 @buzz(...) #2 +declare i32 @buzz(...) -declare i32 @baz(i32) #2 +declare i32 @baz(i32) -declare i32 @foo(i32) #2 +declare i32 @foo(i32) ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.end.p0(ptr nocapture) #1 - -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind willreturn } -attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind } +declare void @llvm.lifetime.end.p0(ptr nocapture) !llvm.module.flags = !{!0} !llvm.ident = !{!1} diff --git a/llvm/test/Transforms/PGOProfile/misexpect-branch.ll b/llvm/test/Transforms/PGOProfile/misexpect-branch.ll index 5a7731bd2442d..7e01f7aca5c32 100644 --- a/llvm/test/Transforms/PGOProfile/misexpect-branch.ll +++ b/llvm/test/Transforms/PGOProfile/misexpect-branch.ll @@ -33,14 +33,14 @@ target triple = "x86_64-unknown-linux-gnu" @outer_loop = constant i32 2000, align 4 ; Function Attrs: nounwind -define i32 @bar() #0 !dbg !6 { +define i32 @bar() !dbg !6 { entry: %rando = alloca i32, align 4 %x = alloca i32, align 4 - call void @llvm.lifetime.start.p0(ptr %rando) #4, !dbg !9 + call void @llvm.lifetime.start.p0(ptr %rando), !dbg !9 %call = call i32 (...) 
@buzz(), !dbg !9 store i32 %call, ptr %rando, align 4, !dbg !9, !tbaa !10 - call void @llvm.lifetime.start.p0(ptr %x) #4, !dbg !14 + call void @llvm.lifetime.start.p0(ptr %x), !dbg !14 store i32 0, ptr %x, align 4, !dbg !14, !tbaa !10 %0 = load i32, ptr %rando, align 4, !dbg !15, !tbaa !10 %rem = srem i32 %0, 200000, !dbg !15 @@ -66,31 +66,25 @@ if.else: ; preds = %entry if.end: ; preds = %if.else, %if.then %2 = load i32, ptr %x, align 4, !dbg !19, !tbaa !10 - call void @llvm.lifetime.end.p0(ptr %x) #4, !dbg !20 - call void @llvm.lifetime.end.p0(ptr %rando) #4, !dbg !20 + call void @llvm.lifetime.end.p0(ptr %x), !dbg !20 + call void @llvm.lifetime.end.p0(ptr %rando), !dbg !20 ret i32 %2, !dbg !19 } ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.start.p0(ptr nocapture) #1 +declare void @llvm.lifetime.start.p0(ptr nocapture) -declare i32 @buzz(...) #2 +declare i32 @buzz(...) ; Function Attrs: nounwind readnone willreturn -declare i64 @llvm.expect.i64(i64, i64) #3 +declare i64 @llvm.expect.i64(i64, i64) -declare i32 @baz(i32) #2 +declare i32 @baz(i32) -declare i32 @foo(i32) #2 +declare i32 @foo(i32) ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.end.p0(ptr nocapture) #1 - -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind willreturn } -attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind readnone willreturn } -attributes #4 = { nounwind } +declare void @llvm.lifetime.end.p0(ptr nocapture) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4} diff --git a/llvm/test/Transforms/PGOProfile/misexpect-switch-default.ll b/llvm/test/Transforms/PGOProfile/misexpect-switch-default.ll index 859ba7223a17b..38c27b9f3266c 100644 --- a/llvm/test/Transforms/PGOProfile/misexpect-switch-default.ll +++ b/llvm/test/Transforms/PGOProfile/misexpect-switch-default.ll @@ -25,7 +25,6 @@ ; CORRECT-NOT: warning: {{.*}} ; CORRECT-NOT: remark: {{.*}} - ; ModuleID = 'misexpect-switch.c' source_filename = "misexpect-switch.c" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -37,10 +36,10 @@ target triple = "x86_64-unknown-linux-gnu" @arry = dso_local global [25 x i32] zeroinitializer, align 16 ; Function Attrs: nounwind uwtable -define dso_local void @init_arry() #0 { +define dso_local void @init_arry() { entry: %i = alloca i32, align 4 - call void @llvm.lifetime.start.p0(ptr %i) #6 + call void @llvm.lifetime.start.p0(ptr %i) store i32 0, ptr %i, align 4, !tbaa !4 br label %for.cond @@ -50,7 +49,7 @@ for.cond: ; preds = %for.inc, %entry br i1 %cmp, label %for.body, label %for.end for.body: ; preds = %for.cond - %call = call i32 @rand() #6 + %call = call i32 @rand() %rem = srem i32 %call, 10 %1 = load i32, ptr %i, align 4, !tbaa !4 %idxprom = sext i32 %1 
to i64 @@ -65,24 +64,24 @@ for.inc: ; preds = %for.body br label %for.cond for.end: ; preds = %for.cond - call void @llvm.lifetime.end.p0(ptr %i) #6 + call void @llvm.lifetime.end.p0(ptr %i) ret void } ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.start.p0(ptr nocapture) #1 +declare void @llvm.lifetime.start.p0(ptr nocapture) ; Function Attrs: nounwind readnone speculatable willreturn -declare void @llvm.dbg.declare(metadata, metadata, metadata) #2 +declare void @llvm.dbg.declare(metadata, metadata, metadata) ; Function Attrs: nounwind -declare dso_local i32 @rand() #3 +declare dso_local i32 @rand() ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.end.p0(ptr nocapture) #1 +declare void @llvm.lifetime.end.p0(ptr nocapture) ; Function Attrs: nounwind uwtable -define dso_local i32 @main() #0 { +define dso_local i32 @main() { entry: %retval = alloca i32, align 4 %val = alloca i32, align 4 @@ -90,9 +89,9 @@ entry: %condition = alloca i32, align 4 store i32 0, ptr %retval, align 4 call void @init_arry() - call void @llvm.lifetime.start.p0(ptr %val) #6 + call void @llvm.lifetime.start.p0(ptr %val) store i32 0, ptr %val, align 4, !tbaa !4 - call void @llvm.lifetime.start.p0(ptr %j) #6 + call void @llvm.lifetime.start.p0(ptr %j) store i32 0, ptr %j, align 4, !tbaa !4 br label %for.cond @@ -102,8 +101,8 @@ for.cond: ; preds = %for.inc, %entry br i1 %cmp, label %for.body, label %for.end for.body: ; preds = %for.cond - call void @llvm.lifetime.start.p0(ptr %condition) #6 - %call = call i32 @rand() #6 + call void @llvm.lifetime.start.p0(ptr %condition) + %call = call i32 @rand() %rem = srem i32 %call, 5 store i32 %rem, ptr %condition, align 4, !tbaa !4 %1 = load i32, ptr %condition, align 4, !tbaa !4 @@ -138,7 +137,7 @@ sw.default: ; preds = %for.body unreachable sw.epilog: ; preds = %sw.bb3, %sw.bb2, %sw.bb - call void @llvm.lifetime.end.p0(ptr %condition) #6 + call void @llvm.lifetime.end.p0(ptr %condition) br label %for.inc for.inc: ; preds = %sw.epilog @@ -148,25 +147,17 @@ for.inc: ; preds = %sw.epilog br label %for.cond for.end: ; preds = %for.cond - call void @llvm.lifetime.end.p0(ptr %j) #6 - call void @llvm.lifetime.end.p0(ptr %val) #6 + call void @llvm.lifetime.end.p0(ptr %j) + call void @llvm.lifetime.end.p0(ptr %val) ret i32 0 } ; Function Attrs: nounwind readnone willreturn -declare i64 @llvm.expect.i64(i64, i64) #4 - -declare dso_local i32 @sum(ptr, i32) #5 +declare i64 @llvm.expect.i64(i64, i64) -declare dso_local i32 @random_sample(ptr, i32) #5 +declare dso_local i32 @sum(ptr, i32) -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind willreturn } -attributes #2 = { nounwind readnone speculatable willreturn } -attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" 
"target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { nounwind readnone willreturn } -attributes #5 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #6 = { nounwind } +declare dso_local i32 @random_sample(ptr, i32) !llvm.module.flags = !{!0, !1, !2} !llvm.ident = !{!3} diff --git a/llvm/test/Transforms/PGOProfile/misexpect-switch.ll b/llvm/test/Transforms/PGOProfile/misexpect-switch.ll index 242d5b8cc112e..966555908a51e 100644 --- a/llvm/test/Transforms/PGOProfile/misexpect-switch.ll +++ b/llvm/test/Transforms/PGOProfile/misexpect-switch.ll @@ -28,7 +28,6 @@ ; CORRECT-NOT: warning: {{.*}} ; CORRECT-NOT: remark: {{.*}} - ; ModuleID = 'misexpect-switch.c' source_filename = "misexpect-switch.c" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -40,10 +39,10 @@ target triple = "x86_64-unknown-linux-gnu" @arry = dso_local global [25 x i32] zeroinitializer, align 16, !dbg !12 ; Function Attrs: nounwind uwtable -define dso_local void @init_arry() #0 !dbg !21 { +define dso_local void @init_arry() !dbg !21 { entry: %i = alloca i32, align 4 - call void @llvm.lifetime.start.p0(ptr %i) #6, !dbg !26 + call void @llvm.lifetime.start.p0(ptr %i), !dbg !26 call void @llvm.dbg.declare(metadata ptr %i, metadata !25, metadata !DIExpression()), !dbg !27 store i32 0, ptr %i, align 4, !dbg !28, !tbaa !30 br label %for.cond, !dbg !34 @@ -54,7 +53,7 @@ for.cond: ; preds = %for.inc, %entry br i1 %cmp, label %for.body, label %for.end, !dbg !38 for.body: ; preds = %for.cond - %call = call i32 @rand() #6, !dbg !39 + %call = call i32 @rand(), !dbg !39 %rem = srem i32 %call, 10, !dbg !41 %1 = load i32, ptr %i, align 4, !dbg !42, !tbaa !30 %idxprom = sext i32 %1 to i64, !dbg !43 @@ -69,24 +68,24 @@ for.inc: ; preds = %for.body br label %for.cond, !dbg !47, !llvm.loop !48 for.end: ; preds = %for.cond - call void @llvm.lifetime.end.p0(ptr %i) #6, !dbg !50 + call void @llvm.lifetime.end.p0(ptr %i), !dbg !50 ret void, !dbg !50 } ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.start.p0(ptr nocapture) #1 +declare void @llvm.lifetime.start.p0(ptr nocapture) ; Function Attrs: nounwind readnone speculatable willreturn -declare void @llvm.dbg.declare(metadata, metadata, metadata) #2 +declare void @llvm.dbg.declare(metadata, metadata, metadata) ; Function Attrs: nounwind -declare dso_local i32 @rand() #3 +declare dso_local i32 @rand() ; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.lifetime.end.p0(ptr nocapture) #1 +declare void @llvm.lifetime.end.p0(ptr nocapture) ; Function Attrs: nounwind uwtable -define dso_local i32 @main() #0 !dbg !51 { +define dso_local i32 @main() !dbg !51 { entry: %retval = alloca i32, align 4 %val = alloca i32, align 4 @@ -94,10 +93,10 @@ entry: %condition = alloca i32, align 4 store i32 0, ptr %retval, align 4 call void @init_arry(), !dbg !62 - call void @llvm.lifetime.start.p0(ptr %val) #6, !dbg !63 + call void @llvm.lifetime.start.p0(ptr %val), !dbg !63 call void @llvm.dbg.declare(metadata ptr %val, metadata !55, metadata !DIExpression()), !dbg !64 store i32 0, ptr %val, align 4, !dbg !64, !tbaa !30 - call 
void @llvm.lifetime.start.p0(ptr %j) #6, !dbg !65 + call void @llvm.lifetime.start.p0(ptr %j), !dbg !65 call void @llvm.dbg.declare(metadata ptr %j, metadata !56, metadata !DIExpression()), !dbg !66 store i32 0, ptr %j, align 4, !dbg !67, !tbaa !30 br label %for.cond, !dbg !68 @@ -108,9 +107,9 @@ for.cond: ; preds = %for.inc, %entry br i1 %cmp, label %for.body, label %for.end, !dbg !71 for.body: ; preds = %for.cond - call void @llvm.lifetime.start.p0(ptr %condition) #6, !dbg !72 + call void @llvm.lifetime.start.p0(ptr %condition), !dbg !72 call void @llvm.dbg.declare(metadata ptr %condition, metadata !57, metadata !DIExpression()), !dbg !73 - %call = call i32 @rand() #6, !dbg !74 + %call = call i32 @rand(), !dbg !74 %rem = srem i32 %call, 5, !dbg !75 store i32 %rem, ptr %condition, align 4, !dbg !73, !tbaa !30 %1 = load i32, ptr %condition, align 4, !dbg !76, !tbaa !30 @@ -145,7 +144,7 @@ sw.default: ; preds = %for.body unreachable, !dbg !87 sw.epilog: ; preds = %sw.bb3, %sw.bb2, %sw.bb - call void @llvm.lifetime.end.p0(ptr %condition) #6, !dbg !88 + call void @llvm.lifetime.end.p0(ptr %condition), !dbg !88 br label %for.inc, !dbg !89 for.inc: ; preds = %sw.epilog @@ -155,25 +154,17 @@ for.inc: ; preds = %sw.epilog br label %for.cond, !dbg !91, !llvm.loop !92 for.end: ; preds = %for.cond - call void @llvm.lifetime.end.p0(ptr %j) #6, !dbg !94 - call void @llvm.lifetime.end.p0(ptr %val) #6, !dbg !94 + call void @llvm.lifetime.end.p0(ptr %j), !dbg !94 + call void @llvm.lifetime.end.p0(ptr %val), !dbg !94 ret i32 0, !dbg !95 } ; Function Attrs: nounwind readnone willreturn -declare i64 @llvm.expect.i64(i64, i64) #4 - -declare dso_local i32 @sum(ptr, i32) #5 +declare i64 @llvm.expect.i64(i64, i64) -declare dso_local i32 @random_sample(ptr, i32) #5 +declare dso_local i32 @sum(ptr, i32) -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind willreturn } -attributes #2 = { nounwind readnone speculatable willreturn } -attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { nounwind readnone willreturn } -attributes #5 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #6 = { nounwind } +declare dso_local i32 @random_sample(ptr, i32) !llvm.dbg.cu = !{!2} !llvm.module.flags = !{!17, !18, !19} diff --git a/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestUtils.h 
b/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestUtils.h index dc077f900d195..f03c82f76c324 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestUtils.h +++ b/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestUtils.h @@ -132,7 +132,7 @@ class MockJITLinkContext : public llvm::jitlink::JITLinkContext { : JITLinkContext(&JD), MJMM(std::move(MJMM)), HandleFailed(std::move(HandleFailed)) {} - ~MockJITLinkContext() { + ~MockJITLinkContext() override { if (auto Err = MJMM->deallocate(std::move(FinalizedAllocs))) notifyFailed(std::move(Err)); } diff --git a/llvm/unittests/ExecutionEngine/Orc/EPCGenericMemoryAccessTest.cpp b/llvm/unittests/ExecutionEngine/Orc/EPCGenericMemoryAccessTest.cpp index 2bcff3356b86d..a72dc5f7af7a6 100644 --- a/llvm/unittests/ExecutionEngine/Orc/EPCGenericMemoryAccessTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/EPCGenericMemoryAccessTest.cpp @@ -140,7 +140,7 @@ class EPCGenericMemoryAccessTest : public testing::Test { MemAccess = std::make_unique(*EPC, FAs); } - ~EPCGenericMemoryAccessTest() { cantFail(EPC->disconnect()); } + ~EPCGenericMemoryAccessTest() override { cantFail(EPC->disconnect()); } protected: std::shared_ptr EPC; diff --git a/llvm/unittests/ExecutionEngine/Orc/JITLinkRedirectionManagerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/JITLinkRedirectionManagerTest.cpp index a57241b8a3da6..018a6786a92dc 100644 --- a/llvm/unittests/ExecutionEngine/Orc/JITLinkRedirectionManagerTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/JITLinkRedirectionManagerTest.cpp @@ -18,7 +18,7 @@ static int finalTarget() { return 53; } class JITLinkRedirectionManagerTest : public testing::Test { public: - ~JITLinkRedirectionManagerTest() { + ~JITLinkRedirectionManagerTest() override { if (ES) if (auto Err = ES->endSession()) ES->reportError(std::move(Err)); diff --git a/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp index 5ff3e26f35296..5deebec967be1 100644 --- a/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp @@ -34,7 +34,7 @@ ArrayRef BlockContent(BlockContentBytes); class ObjectLinkingLayerTest : public testing::Test { public: - ~ObjectLinkingLayerTest() { + ~ObjectLinkingLayerTest() override { if (auto Err = ES.endSession()) ES.reportError(std::move(Err)); } @@ -161,7 +161,7 @@ TEST_F(ObjectLinkingLayerTest, HandleErrorDuringPostAllocationPass) { // abandon the in-flight allocation and report an error. 
class TestPlugin : public ObjectLinkingLayer::Plugin { public: - ~TestPlugin() { EXPECT_TRUE(ErrorReported); } + ~TestPlugin() override { EXPECT_TRUE(ErrorReported); } void modifyPassConfig(MaterializationResponsibility &MR, jitlink::LinkGraph &G, @@ -215,7 +215,7 @@ TEST_F(ObjectLinkingLayerTest, AddAndRemovePlugins) { TestPlugin(size_t &ActivationCount, bool &PluginDestroyed) : ActivationCount(ActivationCount), PluginDestroyed(PluginDestroyed) {} - ~TestPlugin() { PluginDestroyed = true; } + ~TestPlugin() override { PluginDestroyed = true; } void modifyPassConfig(MaterializationResponsibility &MR, jitlink::LinkGraph &G, @@ -336,7 +336,7 @@ TEST(ObjectLinkingLayerSearchGeneratorTest, AbsoluteSymbolsObjectLayer) { class CheckDefs : public ObjectLinkingLayer::Plugin { public: - ~CheckDefs() { EXPECT_TRUE(SawSymbolDef); } + ~CheckDefs() override { EXPECT_TRUE(SawSymbolDef); } void modifyPassConfig(MaterializationResponsibility &MR, jitlink::LinkGraph &G, diff --git a/llvm/unittests/ExecutionEngine/Orc/OrcTestCommon.h b/llvm/unittests/ExecutionEngine/Orc/OrcTestCommon.h index 469de2a3665a0..e22c35f1332f7 100644 --- a/llvm/unittests/ExecutionEngine/Orc/OrcTestCommon.h +++ b/llvm/unittests/ExecutionEngine/Orc/OrcTestCommon.h @@ -47,7 +47,7 @@ namespace orc { // (5) V -- A JITDylib associated with ES. class CoreAPIsBasedStandardTest : public testing::Test { public: - ~CoreAPIsBasedStandardTest() { + ~CoreAPIsBasedStandardTest() override { if (auto Err = ES.endSession()) ES.reportError(std::move(Err)); } diff --git a/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp index 686d85dc6c15b..ac591c1e94a18 100644 --- a/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp @@ -26,7 +26,7 @@ using namespace llvm::jitlink; class ReOptimizeLayerTest : public testing::Test { public: - ~ReOptimizeLayerTest() { + ~ReOptimizeLayerTest() override { if (ES) if (auto Err = ES->endSession()) ES->reportError(std::move(Err)); diff --git a/llvm/unittests/ExecutionEngine/Orc/ResourceTrackerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/ResourceTrackerTest.cpp index 0e5d151714fcc..3591829bdbcec 100644 --- a/llvm/unittests/ExecutionEngine/Orc/ResourceTrackerTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/ResourceTrackerTest.cpp @@ -50,7 +50,7 @@ class SimpleResourceManager : public ResourceManager { SimpleResourceManager(SimpleResourceManager &&) = delete; SimpleResourceManager &operator=(SimpleResourceManager &&) = delete; - ~SimpleResourceManager() { ES.deregisterResourceManager(*this); } + ~SimpleResourceManager() override { ES.deregisterResourceManager(*this); } /// Set the HandleRemove function object. 
void setHandleRemove(HandleRemoveFunction HandleRemove) {
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
index 697a04e94441a..137554f49460d 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
@@ -108,7 +108,6 @@ def Tosa_AvgPool2dOp : Tosa_InferShapedTypeOp<"avg_pool2d"> {
     LogicalResult verifyOutputZeroPoint(int64_t zp);
   }];

-  let hasCanonicalizer = 1;
   let hasVerifier = 1;

   let assemblyFormat =
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp b/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
index a6159ee93e87e..f0ddb506064fe 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
@@ -14,13 +14,6 @@
 #include "mlir/Dialect/Bufferization/Transforms/OneShotModuleBufferize.h"
 #include "mlir/Dialect/Bufferization/Transforms/Transforms.h"

-namespace mlir {
-namespace bufferization {
-#define GEN_PASS_DEF_TENSORCOPYINSERTION
-#include "mlir/Dialect/Bufferization/Transforms/Passes.h.inc"
-} // namespace bufferization
-} // namespace mlir
-
 using namespace mlir;
 using namespace mlir::bufferization;
diff --git a/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp
index 11400de35e430..a15bf891dd596 100644
--- a/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/ValueBoundsOpInterfaceImpl.cpp
@@ -59,6 +59,17 @@ struct DimOpInterface
   }
 };

+struct ExpandShapeOpInterface
+    : public ValueBoundsOpInterface::ExternalModel<ExpandShapeOpInterface,
+                                                   ExpandShapeOp> {
+  void populateBoundsForShapedValueDim(Operation *op, Value value, int64_t dim,
+                                       ValueBoundsConstraintSet &cstr) const {
+    auto expandOp = cast<ExpandShapeOp>(op);
+    assert(value == expandOp.getResult() && "invalid value");
+    cstr.bound(value)[dim] == expandOp.getOutputShape()[dim];
+  }
+};
+
 struct GetGlobalOpInterface
     : public ValueBoundsOpInterface::ExternalModel<GetGlobalOpInterface,
                                                    GetGlobalOp> {
@@ -123,6 +134,8 @@ void mlir::memref::registerValueBoundsOpInterfaceExternalModels(
                                memref::AllocOpInterface>(*ctx);
     memref::CastOp::attachInterface<CastOpInterface>(*ctx);
     memref::DimOp::attachInterface<DimOpInterface>(*ctx);
+    memref::ExpandShapeOp::attachInterface<ExpandShapeOpInterface>(
+        *ctx);
     memref::GetGlobalOp::attachInterface<GetGlobalOpInterface>(*ctx);
     memref::RankOp::attachInterface<RankOpInterface>(*ctx);
     memref::SubViewOp::attachInterface<SubViewOpInterface>(*ctx);
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index a3e9373fd46c0..ae75f7f670432 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -27,6 +27,7 @@
 #include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/DebugLog.h"
@@ -291,9 +292,102 @@ struct MultiBlockExecuteInliner : public OpRewritePattern<ExecuteRegionOp> {
   }
 };

+// Pattern to eliminate ExecuteRegionOp results which forward external
+// values from the region. In case there are multiple yield operations,
+// all of them must have the same operands in order for the pattern to be
+// applicable.
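+//
+// Illustration (an editorial sketch, not from the patch; the values and
+// types below are made up):
+//
+//   %r0, %r1 = scf.execute_region -> (i32, i32) {
+//     %inner = arith.constant 7 : i32
+//     scf.yield %inner, %outer : i32, i32   // %outer defined above the op
+//   }
+//
+// is rewritten to return only the region-internal value:
+//
+//   %r0 = scf.execute_region -> i32 {
+//     %inner = arith.constant 7 : i32
+//     scf.yield %inner : i32
+//   }
+//
+// with all uses of the old second result replaced directly by %outer.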
+struct ExecuteRegionForwardingEliminator
+    : public OpRewritePattern<ExecuteRegionOp> {
+  using OpRewritePattern<ExecuteRegionOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(ExecuteRegionOp op,
+                                PatternRewriter &rewriter) const override {
+    if (op.getNumResults() == 0)
+      return failure();
+
+    SmallVector<Operation *> yieldOps;
+    for (Block &block : op.getRegion()) {
+      if (auto yield = dyn_cast<scf::YieldOp>(block.getTerminator()))
+        yieldOps.push_back(yield.getOperation());
+    }
+
+    if (yieldOps.empty())
+      return failure();
+
+    // Check if all yield operations have the same operands.
+    auto yieldOpsOperands = yieldOps[0]->getOperands();
+    for (auto *yieldOp : yieldOps) {
+      if (yieldOp->getOperands() != yieldOpsOperands)
+        return failure();
+    }
+
+    SmallVector<Value> externalValues;
+    SmallVector<Value> internalValues;
+    SmallVector<Value> opResultsToReplaceWithExternalValues;
+    SmallVector<Value> opResultsToKeep;
+    for (auto [index, yieldedValue] : llvm::enumerate(yieldOpsOperands)) {
+      if (isValueFromInsideRegion(yieldedValue, op)) {
+        internalValues.push_back(yieldedValue);
+        opResultsToKeep.push_back(op.getResult(index));
+      } else {
+        externalValues.push_back(yieldedValue);
+        opResultsToReplaceWithExternalValues.push_back(op.getResult(index));
+      }
+    }
+    // No yielded external values - nothing to do.
+    if (externalValues.empty())
+      return failure();
+
+    // There are yielded external values - create a new execute_region
+    // returning just the internal values.
+    SmallVector<Type> resultTypes;
+    for (Value value : internalValues)
+      resultTypes.push_back(value.getType());
+    auto newOp =
+        ExecuteRegionOp::create(rewriter, op.getLoc(), TypeRange(resultTypes));
+    newOp->setAttrs(op->getAttrs());
+
+    // Move old op's region to the new operation.
+    rewriter.inlineRegionBefore(op.getRegion(), newOp.getRegion(),
+                                newOp.getRegion().end());
+
+    // Replace all yield operations with a new yield operation with updated
+    // results. scf.execute_region must have at least one yield operation.
+    for (auto *yieldOp : yieldOps) {
+      rewriter.setInsertionPoint(yieldOp);
+      rewriter.replaceOpWithNewOp<scf::YieldOp>(yieldOp,
+                                                ValueRange(internalValues));
+    }
+
+    // Replace the old operation with the external values directly.
+    rewriter.replaceAllUsesWith(opResultsToReplaceWithExternalValues,
+                                externalValues);
+    // Replace the old operation's remaining results with the new operation's
+    // results.
+    rewriter.replaceAllUsesWith(opResultsToKeep, newOp.getResults());
+    rewriter.eraseOp(op);
+    return success();
+  }
+
+private:
+  bool isValueFromInsideRegion(Value value,
+                               ExecuteRegionOp executeRegionOp) const {
+    // Check if the value is defined within the execute_region
+    if (Operation *defOp = value.getDefiningOp())
+      return &executeRegionOp.getRegion() == defOp->getParentRegion();
+
+    // If it's a block argument, check if it's from within the region
+    if (BlockArgument blockArg = dyn_cast<BlockArgument>(value))
+      return &executeRegionOp.getRegion() == blockArg.getParentRegion();
+
+    return false; // Value is from outside the region
+  }
+};
+
 void ExecuteRegionOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                   MLIRContext *context) {
-  results.add<SingleBlockExecuteInliner, MultiBlockExecuteInliner>(context);
+  results.add<SingleBlockExecuteInliner, MultiBlockExecuteInliner, ExecuteRegionForwardingEliminator>(context);
 }

 void ExecuteRegionOp::getSuccessorRegions(
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
index e8a6a739619b5..de9096da9b87b 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
@@ -75,28 +75,6 @@ namespace {
 template <typename OpTy>
 struct PoolPadFoldAdaptor;

-template <>
-struct PoolPadFoldAdaptor<tosa::AvgPool2dOp> {
-  using OpTy = tosa::AvgPool2dOp;
-  static bool checkKernelCompliance(OpTy op, const ArrayRef<int64_t> newPad) {
-    const llvm::ArrayRef<int64_t> kernel = op.getKernel();
-    if (newPad[2] >= kernel[1] || newPad[3] >= kernel[1] ||
-        newPad[0] >= kernel[0] || newPad[1] >= kernel[0])
-      return false;
-    return true;
-  }
-  static bool checkPadConstCompliance(OpTy op, Value padConst) {
-    return checkMatchingPadConstAndZp(padConst, op.getInputZp());
-  }
-  static void replaceOpWithNewPad(PatternRewriter &rewriter, OpTy op,
-                                  Value padInput, ArrayRef<int64_t> newPad) {
-    rewriter.replaceOpWithNewOp<tosa::AvgPool2dOp>(
-        op, op.getType(), padInput, op.getInputZp(), op.getOutputZp(),
-        op.getKernel(), op.getStride(), rewriter.getDenseI64ArrayAttr(newPad),
-        op.getAccType());
-  }
-};
-
 template <>
 struct PoolPadFoldAdaptor<tosa::MaxPool2dOp> {
   using OpTy = tosa::MaxPool2dOp;
@@ -245,13 +223,6 @@ struct FoldPadToTensorOp : public OpRewritePattern<tosa::PadOp> {
 };

 } // namespace

-void AvgPool2dOp::getCanonicalizationPatterns(RewritePatternSet &results,
-                                              MLIRContext *context) {
-  results.add>>(
-      context);
-}
-
 void Conv2DOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                            MLIRContext *context) {
   results.add<
diff --git a/mlir/lib/ExecutionEngine/SparseTensor/CMakeLists.txt b/mlir/lib/ExecutionEngine/SparseTensor/CMakeLists.txt
index 6ef1529343453..c712c64b6de55 100644
--- a/mlir/lib/ExecutionEngine/SparseTensor/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/SparseTensor/CMakeLists.txt
@@ -21,6 +21,6 @@ set_property(TARGET MLIRSparseTensorRuntime PROPERTY CXX_STANDARD 17)
 check_cxx_compiler_flag(-Wweak-vtables COMPILER_SUPPORTS_WARNING_WEAK_VTABLES)
 if(COMPILER_SUPPORTS_WARNING_WEAK_VTABLES)
-  target_compile_options(MLIRSparseTensorRuntime PUBLIC
+  target_compile_options(MLIRSparseTensorRuntime PRIVATE
     "-Wweak-vtables")
 endif()
diff --git a/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir b/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir
index 8bd7ae8df9049..ac1f22b68b1e1 100644
--- a/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir
+++ b/mlir/test/Dialect/MemRef/value-bounds-op-interface-impl.mlir
@@ -63,6 +63,20 @@ func.func @memref_dim_all_positive(%m: memref<?xf32>, %x: index) {

 // -----

+// CHECK-LABEL: func @memref_expand(
+// CHECK-SAME: %[[m:[a-zA-Z0-9]+]]: memref<?xf32>
+// CHECK-SAME: %[[sz:[a-zA-Z0-9]+]]: index
+// CHECK: %[[c4:.*]] = arith.constant 4 : index
+
+// -----
+
 // CHECK-LABEL: func @memref_get_global(
 // CHECK: %[[c4:.*]] = arith.constant 4 : index
 // CHECK: return %[[c4]]
diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir
index 2bec63672e783..084c3fc065de3 100644
--- a/mlir/test/Dialect/SCF/canonicalize.mlir
+++ b/mlir/test/Dialect/SCF/canonicalize.mlir
@@ -1604,6 +1604,148 @@ func.func @func_execute_region_inline_multi_yield() {
 
 // -----
 
+// Test case with a single scf.yield op inside the execute_region whose
+// operand is defined outside the execute_region op.
+// The execute_region is rewritten so that it no longer returns anything.
+
+// CHECK: scf.execute_region no_inline {
+// CHECK: func.call @foo() : () -> ()
+// CHECK: scf.yield
+// CHECK: }
+
+module {
+func.func private @foo() -> ()
+func.func private @execute_region_yielding_external_value() -> memref<1x60xui8> {
+  %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x60xui8>
+  %1 = scf.execute_region -> memref<1x60xui8> no_inline {
+    func.call @foo() : () -> ()
+    scf.yield %alloc : memref<1x60xui8>
+  }
+  return %1 : memref<1x60xui8>
+}
+}
+
+// -----
+
+// Test case with an scf.yield op that has multiple operands, one of which is
+// defined outside the execute_region op.
+// Only that operand is removed from the op's results.
+
+// CHECK: %[[VAL_1:.*]] = scf.execute_region -> memref<1x120xui8> no_inline {
+// CHECK: %[[VAL_2:.*]] = memref.alloc() {alignment = 64 : i64} : memref<1x120xui8>
+// CHECK: func.call @foo() : () -> ()
+// CHECK: scf.yield %[[VAL_2]] : memref<1x120xui8>
+// CHECK: }
+module {
+func.func private @foo() -> ()
+func.func private @execute_region_yielding_external_and_local_values() -> (memref<1x60xui8>, memref<1x120xui8>) {
+  %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x60xui8>
+  %1, %2 = scf.execute_region -> (memref<1x60xui8>, memref<1x120xui8>) no_inline {
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<1x120xui8>
+    func.call @foo() : () -> ()
+    scf.yield %alloc, %alloc_1 : memref<1x60xui8>, memref<1x120xui8>
+  }
+  return %1, %2 : memref<1x60xui8>, memref<1x120xui8>
+}
+}
+
+// -----
+
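+// The multi-block cases below use cf.cond_br, so the region has several
+// blocks, each terminated by its own scf.yield; the pattern only fires when
+// every yield carries the same operand list.
+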
+// Test case with multiple scf.yield ops inside the execute_region that have
+// the same operands, all defined outside the execute_region op.
+// The execute_region is rewritten so that it no longer returns anything.
+// The scf.yield ops must remain, because the blocks of an scf.execute_region
+// cannot be left without terminators.
+
+// CHECK: scf.execute_region no_inline {
+// CHECK: %[[VAL_3:.*]] = "test.cmp"() : () -> i1
+// CHECK: cf.cond_br %[[VAL_3]], ^bb1, ^bb2
+// CHECK: ^bb1:
+// CHECK: scf.yield
+// CHECK: ^bb2:
+// CHECK: scf.yield
+// CHECK: }
+
+module {
+  func.func private @foo() -> ()
+  func.func private @execute_region_multiple_yields_same_operands() -> (memref<1x60xui8>, memref<1x120xui8>) {
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x60xui8>
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<1x120xui8>
+    %1, %2 = scf.execute_region -> (memref<1x60xui8>, memref<1x120xui8>) no_inline {
+      %c = "test.cmp"() : () -> i1
+      cf.cond_br %c, ^bb2, ^bb3
+    ^bb2:
+      func.call @foo() : () -> ()
+      scf.yield %alloc, %alloc_1 : memref<1x60xui8>, memref<1x120xui8>
+    ^bb3:
+      func.call @foo() : () -> ()
+      scf.yield %alloc, %alloc_1 : memref<1x60xui8>, memref<1x120xui8>
+    }
+    return %1, %2 : memref<1x60xui8>, memref<1x120xui8>
+  }
+}
+
+// -----
+
+// Test case with multiple scf.yield ops where at least one operand differs
+// between the yields; in that case no change is made.
+
+// CHECK: %[[VAL_3:.*]]:2 = scf.execute_region -> (memref<1x60xui8>, memref<1x120xui8>) no_inline {
+// CHECK: ^bb1:
+// CHECK: scf.yield %{{.*}}, %{{.*}} : memref<1x60xui8>, memref<1x120xui8>
+// CHECK: ^bb2:
+// CHECK: scf.yield %{{.*}}, %{{.*}} : memref<1x60xui8>, memref<1x120xui8>
+// CHECK: }
+
+module {
+  func.func private @foo() -> ()
+  func.func private @execute_region_multiple_yields_different_operands() -> (memref<1x60xui8>, memref<1x120xui8>) {
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x60xui8>
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<1x120xui8>
+    %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<1x120xui8>
+    %1, %2 = scf.execute_region -> (memref<1x60xui8>, memref<1x120xui8>) no_inline {
+      %c = "test.cmp"() : () -> i1
+      cf.cond_br %c, ^bb2, ^bb3
+    ^bb2:
+      func.call @foo() : () -> ()
+      scf.yield %alloc, %alloc_1 : memref<1x60xui8>, memref<1x120xui8>
+    ^bb3:
+      func.call @foo() : () -> ()
+      scf.yield %alloc, %alloc_2 : memref<1x60xui8>, memref<1x120xui8>
+    }
+    return %1, %2 : memref<1x60xui8>, memref<1x120xui8>
+  }
+}
+
+// -----
+
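+// The next case is the single-result variant of the previous one: the lone
+// yielded operand differs between the two yields, so the op is likewise
+// expected to be left untouched.
+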
+// Test case with multiple scf.yield ops, each with a different operand.
+// In this case the scf.execute_region isn't changed.
+
+// CHECK: %[[VAL_2:.*]] = scf.execute_region -> memref<1x60xui8> no_inline {
+// CHECK: ^bb1:
+// CHECK: scf.yield %{{.*}} : memref<1x60xui8>
+// CHECK: ^bb2:
+// CHECK: scf.yield %{{.*}} : memref<1x60xui8>
+// CHECK: }
+
+module {
+func.func private @foo() -> ()
+func.func private @execute_region_multiple_yields_different_operands_single_result() -> (memref<1x60xui8>) {
+  %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x60xui8>
+  %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<1x60xui8>
+  %1 = scf.execute_region -> (memref<1x60xui8>) no_inline {
+    %c = "test.cmp"() : () -> i1
+    cf.cond_br %c, ^bb2, ^bb3
+  ^bb2:
+    func.call @foo() : () -> ()
+    scf.yield %alloc : memref<1x60xui8>
+  ^bb3:
+    func.call @foo() : () -> ()
+    scf.yield %alloc_1 : memref<1x60xui8>
+  }
+  return %1 : memref<1x60xui8>
+}
+}
+
+// -----
+
 // CHECK-LABEL: func @canonicalize_parallel_insert_slice_indices(
 // CHECK-SAME: %[[arg0:.*]]: tensor<1x5xf32>, %[[arg1:.*]]: tensor<?x?xf32>
 func.func @canonicalize_parallel_insert_slice_indices(
diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir
index 7574afa215e78..5a40f3fa8572c 100644
--- a/mlir/test/Dialect/Tosa/canonicalize.mlir
+++ b/mlir/test/Dialect/Tosa/canonicalize.mlir
@@ -18,11 +18,11 @@ func.func @test_argmax_fold_i64_index(%arg0: tensor<1xi8>) -> tensor<i64> {
 
 // -----
 
-// CHECK-LABEL: @pad_wh_avg_pool2d_fold
-func.func @pad_wh_avg_pool2d_fold(%input: tensor<1x10x8x3xf32>) -> tensor<1x6x5x3xf32> {
-  // CHECK-NOT: tosa.pad
+// CHECK-LABEL: @pad_wh_avg_pool2d_nofold
+func.func @pad_wh_avg_pool2d_nofold(%input: tensor<1x10x8x3xf32>) -> tensor<1x6x5x3xf32> {
+  // CHECK: tosa.pad
   // CHECK: tosa.avg_pool2d
-  // CHECK-SAME: pad = array<i64: 1, 0, 1, 0>
+  // CHECK-SAME: pad = array<i64: 0, 0, 0, 0>
   %pad_shape = tosa.const_shape { values = dense<[0, 0, 1, 0, 1, 0, 0, 0]> : tensor<8xindex>} : () -> !tosa.shape<8>
   %pad_const = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : ()-> tensor<1xf32>
   %input_zp = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : ()-> tensor<1xf32>