diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 6a69d02fa81753..b80bc30cfa0a21 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -11150,7 +11150,7 @@ static bool getAArch64PBV(QualType QT, ASTContext &C) { /// as defined by `LS(P)` in 3.2.1 of the AAVFABI. /// TODO: Add support for references, section 3.2.1, item 1. static unsigned getAArch64LS(QualType QT, ParamKindTy Kind, ASTContext &C) { - if (getAArch64MTV(QT, Kind) && QT.getCanonicalType()->isPointerType()) { + if (!getAArch64MTV(QT, Kind) && QT.getCanonicalType()->isPointerType()) { QualType PTy = QT.getCanonicalType()->getPointeeType(); if (getAArch64PBV(PTy, C)) return C.getTypeSize(PTy); diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index bad796bf92dcfb..3c91a04d54642f 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -1051,8 +1051,7 @@ llvm::MDNode *CodeGenFunction::createProfileWeightsForLoop(const Stmt *Cond, if (!PGO.haveRegionCounts()) return nullptr; Optional CondCount = PGO.getStmtCount(Cond); - assert(CondCount.hasValue() && "missing expected loop condition count"); - if (*CondCount == 0) + if (!CondCount || *CondCount == 0) return nullptr; return createProfileWeights(LoopCount, std::max(*CondCount, LoopCount) - LoopCount); diff --git a/clang/test/Lexer/case-insensitive-include-ms.c b/clang/test/Lexer/case-insensitive-include-ms.c index cf14d2530d0161..f7af1fef8b4e6a 100644 --- a/clang/test/Lexer/case-insensitive-include-ms.c +++ b/clang/test/Lexer/case-insensitive-include-ms.c @@ -6,15 +6,17 @@ // RUN: %clang_cc1 -fsyntax-only -fms-compatibility %s -include %s -I %t/Output -verify // RUN: %clang_cc1 -fsyntax-only -fms-compatibility -fdiagnostics-parseable-fixits %s -include %s -I %t/Output 2>&1 | FileCheck %s -// FIXME: Add a test with repeated backslashes once clang can handle that -// in ms-compat mode on non-Windows hosts. 
#include "..\Output\.\case-insensitive-include.h" #include "..\Output\.\Case-Insensitive-Include.h" // expected-warning {{non-portable path}} // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:50}:"\"..\\Output\\.\\case-insensitive-include.h\"" +#include "..\\Output\.\\Case-Insensitive-Include.h" // expected-warning {{non-portable path}} +// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:52}:"\"..\\\\Output\\.\\\\case-insensitive-include.h\"" #include "..\output\.\case-insensitive-include.h" // expected-warning {{non-portable path}} // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:50}:"\"..\\Output\\.\\case-insensitive-include.h\"" #include "apath\..\.\case-insensitive-include.h" #include "apath\..\.\Case-Insensitive-Include.h" // expected-warning {{non-portable path}} // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:49}:"\"apath\\..\\.\\case-insensitive-include.h\"" +#include "apath\\..\\.\\Case-Insensitive-Include.h" // expected-warning {{non-portable path}} +// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:52}:"\"apath\\\\..\\\\.\\\\case-insensitive-include.h\"" #include "APath\..\.\case-insensitive-include.h" // For the sake of efficiency, this case is not diagnosed. :-( diff --git a/clang/test/OpenMP/aarch64_vfabi_NarrowestDataSize.c b/clang/test/OpenMP/aarch64_vfabi_NarrowestDataSize.c new file mode 100644 index 00000000000000..d65c4edaeea709 --- /dev/null +++ b/clang/test/OpenMP/aarch64_vfabi_NarrowestDataSize.c @@ -0,0 +1,82 @@ +// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -fopenmp -x c -emit-llvm %s -o - -femit-all-decls | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -fopenmp-simd -x c -emit-llvm %s -o - -femit-all-decls | FileCheck %s + +// REQUIRES: aarch64-registered-target +// Note: -fopemp and -fopenmp-simd behavior are expected to be the same. 
+ +// This test checks the values of Narrowest Data Size (NDS), as defined in +// https://github.com/ARM-software/abi-aa/tree/master/vfabia64 +// +// NDS is used to compute the token in the name of AdvSIMD +// vector functions when no `simdlen` is specified, with the rule: +// +// if NDS(f) = 1, then VLEN = 16, 8; +// if NDS(f) = 2, then VLEN = 8, 4; +// if NDS(f) = 4, then VLEN = 4, 2; +// if NDS(f) = 8 or NDS(f) = 16, then VLEN = 2. + +// NDS(NDS_is_sizeof_char) = 1 +#pragma omp declare simd notinbranch +char NDS_is_sizeof_char(short in); +// CHECK-DAG: _ZGVnN16v_NDS_is_sizeof_char +// CHECK-DAG: _ZGVnN8v_NDS_is_sizeof_char +// CHECK-NOT: _ZGV{{.*}}_NDS_is_sizeof_char + +// NDS(NDS_is_sizeof_short) = 2 +#pragma omp declare simd notinbranch +int NDS_is_sizeof_short(short in); +// CHECK-DAG: _ZGVnN8v_NDS_is_sizeof_short +// CHECK-DAG: _ZGVnN4v_NDS_is_sizeof_short +// CHECK-NOT: _ZGV{{.*}}_NDS_is_sizeof_short + +// NDS(NDS_is_sizeof_float_with_linear) = 4, and not 2, because the pointers are +// marked as `linear` and therefore the size of the pointee realizes +// the NDS. +#pragma omp declare simd linear(sin) notinbranch +void NDS_is_sizeof_float_with_linear(double in, float *sin); +// Neon accepts only power of 2 values as . 
+// CHECK-DAG: _ZGVnN4vl4_NDS_is_sizeof_float_with_linear +// CHECK-DAG: _ZGVnN2vl4_NDS_is_sizeof_float_with_linear +// CHECK-NOT: _ZGV{{.*}}_NDS_is_sizeof_float_with_linear + +// NDS(NDS_is_size_of_float) = 4 +#pragma omp declare simd notinbranch +double NDS_is_size_of_float(float in); +// CHECK-DAG: _ZGVnN4v_NDS_is_size_of_float +// CHECK-DAG: _ZGVnN2v_NDS_is_size_of_float +// CHECK-NOT: _ZGV{{.*}}_NDS_is_size_of_float + +// NDS(NDS_is_sizeof_double) = 8 +#pragma omp declare simd linear(sin) notinbranch +void NDS_is_sizeof_double(double in, double *sin); +// CHECK-DAG: _ZGVnN2vl8_NDS_is_sizeof_double +// CHECK-NOT: _ZGV{{.*}}_NDS_is_sizeof_double + +// NDS(double_complex) = 16 +#pragma omp declare simd notinbranch +double _Complex double_complex(double _Complex); +// CHECK-DAG: _ZGVnN2v_double_complex +// CHECK-NOT: _ZGV{{.*}}_double_complex + +// NDS(double_complex_linear_char) = 1, becasue `x` is marked linear. +#pragma omp declare simd linear(x) notinbranch +double _Complex double_complex_linear_char(double _Complex y, char *x); +// CHECK-DAG: _ZGVnN8vl_double_complex_linear_char +// CHECK-DAG: _ZGVnN16vl_double_complex_linear_char +// CHECK-NOT: _ZGV{{.*}}_double_complex_linear_char + +static float *F; +static double *D; +static short S; +static int I; +static char C; +static double _Complex DC; +void do_something() { + C = NDS_is_sizeof_char(S); + I = NDS_is_sizeof_short(S); + NDS_is_sizeof_float_with_linear(*D, F); + *D = NDS_is_size_of_float(*F); + NDS_is_sizeof_double(*D, D); + DC = double_complex(DC); + DC = double_complex_linear_char(DC, &C); +} diff --git a/clang/test/OpenMP/aarch64_vfabi_WidestDataSize.c b/clang/test/OpenMP/aarch64_vfabi_WidestDataSize.c new file mode 100644 index 00000000000000..841a64053e5e3b --- /dev/null +++ b/clang/test/OpenMP/aarch64_vfabi_WidestDataSize.c @@ -0,0 +1,78 @@ +// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +sve -fopenmp -x c -emit-llvm %s -o - -femit-all-decls | FileCheck %s +// RUN: %clang_cc1 
-triple aarch64-linux-gnu -target-feature +sve -fopenmp-simd -x c -emit-llvm %s -o - -femit-all-decls | FileCheck %s + +// REQUIRES: aarch64-registered-target +// Note: -fopemp and -fopenmp-simd behavior are expected to be the same. + +// This test checks the values of Widest Data Size (WDS), as defined +// in https://github.com/ARM-software/abi-aa/tree/master/vfabia64 +// +// WDS is used to check the accepted values of `simdlen()` when +// targeting fixed-length SVE vector function names. The values of +// `` that are accepted are such that for X = WDS * * 8, +// 128-bit <= X <= 2048-bit and X is a multiple of 128-bit. + +#pragma omp declare simd simdlen(8) +#pragma omp declare simd simdlen(16) +#pragma omp declare simd simdlen(256) +#pragma omp declare simd simdlen(272) +char WDS_is_sizeof_char(char in); +// WDS = 1, simdlen(8) and simdlen(272) are not generated. +// CHECK-DAG: _ZGVsM16v_WDS_is_sizeof_char +// CHECK-DAG: _ZGVsM256v_WDS_is_sizeof_char +// CHECK-NOT: _ZGV{{.*}}_WDS_is_sizeof_char + +#pragma omp declare simd simdlen(4) +#pragma omp declare simd simdlen(8) +#pragma omp declare simd simdlen(128) +#pragma omp declare simd simdlen(136) +char WDS_is_sizeof_short(short in); +// WDS = 2, simdlen(4) and simdlen(136) are not generated. +// CHECK-DAG: _ZGVsM8v_WDS_is_sizeof_short +// CHECK-DAG: _ZGVsM128v_WDS_is_sizeof_short +// CHECK-NOT: _ZGV{{.*}}_WDS_is_sizeof_short + +#pragma omp declare simd linear(sin) notinbranch simdlen(2) +#pragma omp declare simd linear(sin) notinbranch simdlen(4) +#pragma omp declare simd linear(sin) notinbranch simdlen(64) +#pragma omp declare simd linear(sin) notinbranch simdlen(68) +void WDS_is_sizeof_float_pointee(float in, float *sin); +// WDS = 4, simdlen(2) and simdlen(68) are not generated. 
+// CHECK-DAG: _ZGVsM4vl4_WDS_is_sizeof_float_pointee +// CHECK-DAG: _ZGVsM64vl4_WDS_is_sizeof_float_pointee +// CHECK-NOT: _ZGV{{.*}}_WDS_is_sizeof_float_pointee + +#pragma omp declare simd linear(sin) notinbranch simdlen(2) +#pragma omp declare simd linear(sin) notinbranch simdlen(4) +#pragma omp declare simd linear(sin) notinbranch simdlen(32) +#pragma omp declare simd linear(sin) notinbranch simdlen(34) +void WDS_is_sizeof_double_pointee(float in, double *sin); +// WDS = 8 because of the linear clause, simdlen(34) is not generated. +// CHECK-DAG: _ZGVsM2vl8_WDS_is_sizeof_double_pointee +// CHECK-DAG: _ZGVsM4vl8_WDS_is_sizeof_double_pointee +// CHECK-DAG: _ZGVsM32vl8_WDS_is_sizeof_double_pointee +// CHECK-NOT: _ZGV{{.*}}_WDS_is_sizeof_double_pointee + +#pragma omp declare simd simdlen(2) +#pragma omp declare simd simdlen(4) +#pragma omp declare simd simdlen(32) +#pragma omp declare simd simdlen(34) +double WDS_is_sizeof_double(double in); +// WDS = 8, simdlen(34) is not generated. +// CHECK-DAG: _ZGVsM2v_WDS_is_sizeof_double +// CHECK-DAG: _ZGVsM4v_WDS_is_sizeof_double +// CHECK-DAG: _ZGVsM32v_WDS_is_sizeof_double +// CHECK-NOT: _ZGV{{.*}}_WDS_is_sizeof_double + +static char C; +static short S; +static float F; +static double D; + +void do_something() { + C = WDS_is_sizeof_char(C); + C = WDS_is_sizeof_short(S); + WDS_is_sizeof_float_pointee(F, &F); + WDS_is_sizeof_double_pointee(F, &D); + D = WDS_is_sizeof_double(D); +} diff --git a/flang/include/flang/Evaluate/check-expression.h b/flang/include/flang/Evaluate/check-expression.h index a26f83b01bbbf4..b14a47838e3aaf 100644 --- a/flang/include/flang/Evaluate/check-expression.h +++ b/flang/include/flang/Evaluate/check-expression.h @@ -12,6 +12,7 @@ #define FORTRAN_EVALUATE_CHECK_EXPRESSION_H_ #include "expression.h" +#include "intrinsics.h" #include "type.h" #include @@ -41,24 +42,38 @@ bool IsInitialDataTarget( // Check whether an expression is a specification expression // (10.1.11(2), C1010). 
Constant expressions are always valid // specification expressions. + +// There are two contexts where specification expressions appear -- array +// bounds and type param expressions. We need to differentiate them because +// additional checks are required for array bounds expressions in declarations +// of derived type components (see C750). +ENUM_CLASS(SpecificationExprContext, TYPE_PARAM, BOUND) + template -void CheckSpecificationExpr( - const A &, parser::ContextualMessages &, const semantics::Scope &); +void CheckSpecificationExpr(const A &, parser::ContextualMessages &, + const semantics::Scope &, const IntrinsicProcTable &, + SpecificationExprContext); extern template void CheckSpecificationExpr(const Expr &x, - parser::ContextualMessages &, const semantics::Scope &); + parser::ContextualMessages &, const semantics::Scope &, + const IntrinsicProcTable &, SpecificationExprContext); extern template void CheckSpecificationExpr(const Expr &x, - parser::ContextualMessages &, const semantics::Scope &); + parser::ContextualMessages &, const semantics::Scope &, + const IntrinsicProcTable &, SpecificationExprContext); extern template void CheckSpecificationExpr(const Expr &x, - parser::ContextualMessages &, const semantics::Scope &); + parser::ContextualMessages &, const semantics::Scope &, + const IntrinsicProcTable &, SpecificationExprContext); extern template void CheckSpecificationExpr( const std::optional> &x, parser::ContextualMessages &, - const semantics::Scope &); + const semantics::Scope &, const IntrinsicProcTable &, + SpecificationExprContext); extern template void CheckSpecificationExpr( const std::optional> &x, parser::ContextualMessages &, - const semantics::Scope &); + const semantics::Scope &, const IntrinsicProcTable &, + SpecificationExprContext); extern template void CheckSpecificationExpr( const std::optional> &x, - parser::ContextualMessages &, const semantics::Scope &); + parser::ContextualMessages &, const semantics::Scope &, + const 
IntrinsicProcTable &, SpecificationExprContext); // Simple contiguity (9.5.4) template diff --git a/flang/include/flang/Evaluate/intrinsics.h b/flang/include/flang/Evaluate/intrinsics.h index fc79638189193e..88d6a7af13eb75 100644 --- a/flang/include/flang/Evaluate/intrinsics.h +++ b/flang/include/flang/Evaluate/intrinsics.h @@ -55,6 +55,11 @@ struct SpecificIntrinsicFunctionInterface : public characteristics::Procedure { // All argument and result types are intrinsic types with default kinds. }; +// Generic intrinsic classes from table 16.1 +ENUM_CLASS(IntrinsicClass, atomicSubroutine, collectiveSubroutine, + elementalFunction, elementalSubroutine, inquiryFunction, pureSubroutine, + impureSubroutine, transformationalFunction, noClass) + class IntrinsicProcTable { private: class Implementation; @@ -68,6 +73,9 @@ class IntrinsicProcTable { // statement. bool IsIntrinsic(const std::string &) const; + // Inquiry intrinsics are defined in section 16.7, table 16.1 + IntrinsicClass GetIntrinsicClass(const std::string &) const; + // Probe the intrinsics for a match against a specific call. 
// On success, the actual arguments are transferred to the result // in dummy argument order; on failure, the actual arguments remain diff --git a/flang/lib/Evaluate/check-expression.cpp b/flang/lib/Evaluate/check-expression.cpp index 3f71cb6a1aeaf8..43686815ab3513 100644 --- a/flang/lib/Evaluate/check-expression.cpp +++ b/flang/lib/Evaluate/check-expression.cpp @@ -7,10 +7,13 @@ //===----------------------------------------------------------------------===// #include "flang/Evaluate/check-expression.h" +#include "flang/Evaluate/intrinsics.h" #include "flang/Evaluate/traverse.h" #include "flang/Evaluate/type.h" #include "flang/Semantics/symbol.h" #include "flang/Semantics/tools.h" +#include +#include namespace Fortran::evaluate { @@ -171,6 +174,7 @@ class IsInitialDataTargetHelper return (*this)(x.left()); } bool operator()(const Relational &) const { return false; } + private: parser::ContextualMessages *messages_; }; @@ -187,8 +191,10 @@ class CheckSpecificationExprHelper public: using Result = std::optional; using Base = AnyTraverse; - explicit CheckSpecificationExprHelper(const semantics::Scope &s) - : Base{*this}, scope_{s} {} + explicit CheckSpecificationExprHelper(const semantics::Scope &s, + const IntrinsicProcTable &table, SpecificationExprContext specExprContext) + : Base{*this}, scope_{s}, table_{table}, specExprContext_{ + specExprContext} {} using Base::operator(); Result operator()(const ProcedureDesignator &) const { @@ -199,6 +205,10 @@ class CheckSpecificationExprHelper Result operator()(const semantics::Symbol &symbol) const { if (semantics::IsNamedConstant(symbol)) { return std::nullopt; + } else if (scope_.IsDerivedType() && IsVariableName(symbol) && + specExprContext_ == SpecificationExprContext::BOUND) { // C750 + return "reference to variable '"s + symbol.name().ToString() + + "' not allowed for derived type components"; } else if (symbol.IsDummy()) { if (symbol.attrs().test(semantics::Attr::OPTIONAL)) { return "reference to OPTIONAL dummy 
argument '"s + @@ -243,16 +253,51 @@ class CheckSpecificationExprHelper return std::nullopt; } + template + Result operator()(const TypeParamInquiry &inq) const { + if (scope_.IsDerivedType() && !IsConstantExpr(inq) && + inq.parameter().owner() != scope_ && + specExprContext_ == SpecificationExprContext::BOUND) { // C750 + return "non-constant reference to a type parameter inquiry " + "not allowed for derived type components"; + } + return std::nullopt; + } + template Result operator()(const FunctionRef &x) const { if (const auto *symbol{x.proc().GetSymbol()}) { if (!semantics::IsPureProcedure(*symbol)) { return "reference to impure function '"s + symbol->name().ToString() + "'"; } + if (semantics::IsStmtFunction(*symbol)) { + return "reference to statement function '"s + + symbol->name().ToString() + "'"; + } + if (scope_.IsDerivedType() && + specExprContext_ == SpecificationExprContext::BOUND) { // C750 + return "reference to function '"s + symbol->name().ToString() + + "' not allowed for derived type components"; + } // TODO: other checks for standard module procedures } else { const SpecificIntrinsic &intrin{DEREF(x.proc().GetSpecificIntrinsic())}; - if (intrin.name == "present") { + if (scope_.IsDerivedType() && + specExprContext_ == SpecificationExprContext::BOUND) { // C750 + if ((table_.IsIntrinsic(intrin.name) && + badIntrinsicsForComponents_.find(intrin.name) != + badIntrinsicsForComponents_.end()) || + IsProhibitedFunction(intrin.name)) { + return "reference to intrinsic '"s + intrin.name + + "' not allowed for derived type components"; + } + if (table_.GetIntrinsicClass(intrin.name) == + IntrinsicClass::inquiryFunction && + !IsConstantExpr(x)) { + return "non-constant reference to inquiry intrinsic '"s + + intrin.name + "' not allowed for derived type components"; + } + } else if (intrin.name == "present") { return std::nullopt; // no need to check argument(s) } if (IsConstantExpr(x)) { @@ -265,29 +310,42 @@ class CheckSpecificationExprHelper private: 
const semantics::Scope &scope_; + const IntrinsicProcTable &table_; + const SpecificationExprContext specExprContext_; + const std::set badIntrinsicsForComponents_{ + "allocated", "associated", "extends_type_of", "present", "same_type_as"}; + static bool IsProhibitedFunction(std::string name) { return false; } }; template void CheckSpecificationExpr(const A &x, parser::ContextualMessages &messages, - const semantics::Scope &scope) { - if (auto why{CheckSpecificationExprHelper{scope}(x)}) { + const semantics::Scope &scope, const IntrinsicProcTable &table, + SpecificationExprContext specExprContext) { + if (auto why{ + CheckSpecificationExprHelper{scope, table, specExprContext}(x)}) { messages.Say("Invalid specification expression: %s"_err_en_US, *why); } } template void CheckSpecificationExpr(const Expr &, - parser::ContextualMessages &, const semantics::Scope &); + parser::ContextualMessages &, const semantics::Scope &, + const IntrinsicProcTable &, SpecificationExprContext); template void CheckSpecificationExpr(const Expr &, - parser::ContextualMessages &, const semantics::Scope &); + parser::ContextualMessages &, const semantics::Scope &, + const IntrinsicProcTable &, SpecificationExprContext); template void CheckSpecificationExpr(const Expr &, - parser::ContextualMessages &, const semantics::Scope &); + parser::ContextualMessages &, const semantics::Scope &, + const IntrinsicProcTable &, SpecificationExprContext); template void CheckSpecificationExpr(const std::optional> &, - parser::ContextualMessages &, const semantics::Scope &); + parser::ContextualMessages &, const semantics::Scope &, + const IntrinsicProcTable &, SpecificationExprContext); template void CheckSpecificationExpr(const std::optional> &, - parser::ContextualMessages &, const semantics::Scope &); + parser::ContextualMessages &, const semantics::Scope &, + const IntrinsicProcTable &, SpecificationExprContext); template void CheckSpecificationExpr( const std::optional> &, parser::ContextualMessages 
&, - const semantics::Scope &); + const semantics::Scope &, const IntrinsicProcTable &, + SpecificationExprContext); // IsSimplyContiguous() -- 9.5.4 class IsSimplyContiguousHelper diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index cbf082bd8ac5ab..605b100f42f3a7 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -229,6 +229,7 @@ struct IntrinsicInterface { IntrinsicDummyArgument dummy[maxArguments]; TypePattern result; Rank rank{Rank::elemental}; + IntrinsicClass intrinsicClass{IntrinsicClass::elementalFunction}; std::optional Match(const CallCharacteristics &, const common::IntrinsicTypeDefaultKinds &, ActualArguments &, FoldingContext &context) const; @@ -265,19 +266,21 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"aimag", {{"x", SameComplex}}, SameReal}, {"aint", {{"a", SameReal}, MatchingDefaultKIND}, KINDReal}, {"all", {{"mask", SameLogical, Rank::array}, OptionalDIM}, SameLogical, - Rank::dimReduced}, - {"allocated", {{"array", AnyData, Rank::array}}, DefaultLogical}, - {"allocated", {{"scalar", AnyData, Rank::scalar}}, DefaultLogical}, + Rank::dimReduced, IntrinsicClass::transformationalFunction}, + {"allocated", {{"array", AnyData, Rank::array}}, DefaultLogical, + Rank::elemental, IntrinsicClass::inquiryFunction}, + {"allocated", {{"scalar", AnyData, Rank::scalar}}, DefaultLogical, + Rank::elemental, IntrinsicClass::inquiryFunction}, {"anint", {{"a", SameReal}, MatchingDefaultKIND}, KINDReal}, {"any", {{"mask", SameLogical, Rank::array}, OptionalDIM}, SameLogical, - Rank::dimReduced}, + Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"asin", {{"x", SameFloating}}, SameFloating}, {"asind", {{"x", SameFloating}}, SameFloating}, {"asinh", {{"x", SameFloating}}, SameFloating}, {"associated", {{"pointer", Addressable, Rank::known}, {"target", Addressable, Rank::known, Optionality::optional}}, - DefaultLogical}, + DefaultLogical, Rank::elemental, 
IntrinsicClass::inquiryFunction}, {"atan", {{"x", SameFloating}}, SameFloating}, {"atand", {{"x", SameFloating}}, SameFloating}, {"atan", {{"y", OperandReal}, {"x", OperandReal}}, OperandReal}, @@ -291,14 +294,14 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"bessel_jn", {{"n1", AnyInt, Rank::scalar}, {"n2", AnyInt, Rank::scalar}, {"x", SameReal, Rank::scalar}}, - SameReal, Rank::vector}, + SameReal, Rank::vector, IntrinsicClass::transformationalFunction}, {"bessel_y0", {{"x", SameReal}}, SameReal}, {"bessel_y1", {{"x", SameReal}}, SameReal}, {"bessel_yn", {{"n", AnyInt}, {"x", SameReal}}, SameReal}, {"bessel_yn", {{"n1", AnyInt, Rank::scalar}, {"n2", AnyInt, Rank::scalar}, {"x", SameReal, Rank::scalar}}, - SameReal, Rank::vector}, + SameReal, Rank::vector, IntrinsicClass::transformationalFunction}, {"bge", {{"i", AnyInt, Rank::elementalOrBOZ}, {"j", AnyInt, Rank::elementalOrBOZ}}, @@ -308,7 +311,7 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"j", AnyInt, Rank::elementalOrBOZ}}, DefaultLogical}, {"bit_size", {{"i", SameInt, Rank::anyOrAssumedRank}}, SameInt, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"ble", {{"i", AnyInt, Rank::elementalOrBOZ}, {"j", AnyInt, Rank::elementalOrBOZ}}, @@ -327,34 +330,36 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"y", AnyIntOrReal, Rank::elementalOrBOZ, Optionality::optional}, DefaultingKIND}, KINDComplex}, - {"command_argument_count", {}, DefaultInt, Rank::scalar}, + {"command_argument_count", {}, DefaultInt, Rank::scalar, + IntrinsicClass::transformationalFunction}, {"conjg", {{"z", SameComplex}}, SameComplex}, {"cos", {{"x", SameFloating}}, SameFloating}, {"cosd", {{"x", SameFloating}}, SameFloating}, {"cosh", {{"x", SameFloating}}, SameFloating}, {"count", {{"mask", AnyLogical, Rank::array}, OptionalDIM, DefaultingKIND}, - KINDInt, Rank::dimReduced}, + KINDInt, Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"cshift", {{"array", 
SameType, Rank::array}, {"shift", AnyInt, Rank::dimRemoved}, OptionalDIM}, - SameType, Rank::conformable}, + SameType, Rank::conformable, IntrinsicClass::transformationalFunction}, {"dble", {{"a", AnyNumeric, Rank::elementalOrBOZ}}, DoublePrecision}, {"digits", {{"x", AnyIntOrReal, Rank::anyOrAssumedRank}}, DefaultInt, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"dim", {{"x", OperandIntOrReal}, {"y", OperandIntOrReal}}, OperandIntOrReal}, {"dot_product", {{"vector_a", AnyLogical, Rank::vector}, {"vector_b", AnyLogical, Rank::vector}}, - ResultLogical, Rank::scalar}, + ResultLogical, Rank::scalar, IntrinsicClass::transformationalFunction}, {"dot_product", {{"vector_a", AnyComplex, Rank::vector}, {"vector_b", AnyNumeric, Rank::vector}}, - ResultNumeric, Rank::scalar}, // conjugates vector_a + ResultNumeric, Rank::scalar, // conjugates vector_a + IntrinsicClass::transformationalFunction}, {"dot_product", {{"vector_a", AnyIntOrReal, Rank::vector}, {"vector_b", AnyNumeric, Rank::vector}}, - ResultNumeric, Rank::scalar}, + ResultNumeric, Rank::scalar, IntrinsicClass::transformationalFunction}, {"dprod", {{"x", DefaultReal}, {"y", DefaultReal}}, DoublePrecision}, {"dshiftl", {{"i", SameInt}, {"j", SameInt, Rank::elementalOrBOZ}, @@ -372,68 +377,72 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"boundary", SameIntrinsic, Rank::dimRemoved, Optionality::optional}, OptionalDIM}, - SameIntrinsic, Rank::conformable}, + SameIntrinsic, Rank::conformable, + IntrinsicClass::transformationalFunction}, {"eoshift", {{"array", SameDerivedType, Rank::array}, {"shift", AnyInt, Rank::dimRemoved}, {"boundary", SameDerivedType, Rank::dimRemoved}, OptionalDIM}, - SameDerivedType, Rank::conformable}, + SameDerivedType, Rank::conformable, + IntrinsicClass::transformationalFunction}, {"epsilon", {{"x", SameReal, Rank::anyOrAssumedRank}}, SameReal, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"erf", {{"x", SameReal}}, SameReal}, 
{"erfc", {{"x", SameReal}}, SameReal}, {"erfc_scaled", {{"x", SameReal}}, SameReal}, {"exp", {{"x", SameFloating}}, SameFloating}, + {"exp", {{"x", SameFloating}}, SameFloating}, {"exponent", {{"x", AnyReal}}, DefaultInt}, + {"exp", {{"x", SameFloating}}, SameFloating}, {"extends_type_of", {{"a", ExtensibleDerived, Rank::anyOrAssumedRank}, {"mold", ExtensibleDerived, Rank::anyOrAssumedRank}}, - DefaultLogical, Rank::scalar}, + DefaultLogical, Rank::scalar, IntrinsicClass::inquiryFunction}, {"findloc", {{"array", AnyNumeric, Rank::array}, {"value", AnyNumeric, Rank::scalar}, RequiredDIM, OptionalMASK, SizeDefaultKIND, {"back", AnyLogical, Rank::scalar, Optionality::optional}}, - KINDInt, Rank::dimRemoved}, + KINDInt, Rank::dimRemoved, IntrinsicClass::transformationalFunction}, {"findloc", {{"array", AnyNumeric, Rank::array}, {"value", AnyNumeric, Rank::scalar}, OptionalMASK, SizeDefaultKIND, {"back", AnyLogical, Rank::scalar, Optionality::optional}}, - KINDInt, Rank::vector}, + KINDInt, Rank::vector, IntrinsicClass::transformationalFunction}, {"findloc", {{"array", SameChar, Rank::array}, {"value", SameChar, Rank::scalar}, RequiredDIM, OptionalMASK, SizeDefaultKIND, {"back", AnyLogical, Rank::scalar, Optionality::optional}}, - KINDInt, Rank::dimRemoved}, + KINDInt, Rank::dimRemoved, IntrinsicClass::transformationalFunction}, {"findloc", {{"array", SameChar, Rank::array}, {"value", SameChar, Rank::scalar}, OptionalMASK, SizeDefaultKIND, {"back", AnyLogical, Rank::scalar, Optionality::optional}}, - KINDInt, Rank::vector}, + KINDInt, Rank::vector, IntrinsicClass::transformationalFunction}, {"findloc", {{"array", AnyLogical, Rank::array}, {"value", AnyLogical, Rank::scalar}, RequiredDIM, OptionalMASK, SizeDefaultKIND, {"back", AnyLogical, Rank::scalar, Optionality::optional}}, - KINDInt, Rank::dimRemoved}, + KINDInt, Rank::dimRemoved, IntrinsicClass::transformationalFunction}, {"findloc", {{"array", AnyLogical, Rank::array}, {"value", AnyLogical, Rank::scalar}, 
OptionalMASK, SizeDefaultKIND, {"back", AnyLogical, Rank::scalar, Optionality::optional}}, - KINDInt, Rank::vector}, + KINDInt, Rank::vector, IntrinsicClass::transformationalFunction}, {"floor", {{"a", AnyReal}, DefaultingKIND}, KINDInt}, {"fraction", {{"x", SameReal}}, SameReal}, {"gamma", {{"x", SameReal}}, SameReal}, {"huge", {{"x", SameIntOrReal, Rank::anyOrAssumedRank}}, SameIntOrReal, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"hypot", {{"x", OperandReal}, {"y", OperandReal}}, OperandReal}, {"iachar", {{"c", AnyChar}, DefaultingKIND}, KINDInt}, {"iall", {{"array", SameInt, Rank::array}, OptionalDIM, OptionalMASK}, - SameInt, Rank::dimReduced}, + SameInt, Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"iany", {{"array", SameInt, Rank::array}, OptionalDIM, OptionalMASK}, - SameInt, Rank::dimReduced}, + SameInt, Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"iparity", {{"array", SameInt, Rank::array}, OptionalDIM, OptionalMASK}, - SameInt, Rank::dimReduced}, + SameInt, Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"iand", {{"i", SameInt}, {"j", SameInt, Rank::elementalOrBOZ}}, SameInt}, {"iand", {{"i", BOZ}, {"j", SameInt}}, SameInt}, {"ibclr", {{"i", SameInt}, {"pos", AnyInt}}, SameInt}, @@ -461,19 +470,20 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"size", AnyInt, Rank::elemental, Optionality::optional}}, SameInt}, {"is_contiguous", {{"array", Addressable, Rank::anyOrAssumedRank}}, - DefaultLogical}, + DefaultLogical, Rank::elemental, IntrinsicClass::inquiryFunction}, {"is_iostat_end", {{"i", AnyInt}}, DefaultLogical}, {"is_iostat_eor", {{"i", AnyInt}}, DefaultLogical}, - {"kind", {{"x", AnyIntrinsic}}, DefaultInt}, + {"kind", {{"x", AnyIntrinsic}}, DefaultInt, Rank::elemental, + IntrinsicClass::inquiryFunction}, {"lbound", {{"array", AnyData, Rank::anyOrAssumedRank}, RequiredDIM, SizeDefaultKIND}, - KINDInt, Rank::scalar}, + KINDInt, Rank::scalar, 
IntrinsicClass::inquiryFunction}, {"lbound", {{"array", AnyData, Rank::anyOrAssumedRank}, SizeDefaultKIND}, - KINDInt, Rank::vector}, + KINDInt, Rank::vector, IntrinsicClass::inquiryFunction}, {"leadz", {{"i", AnyInt}}, DefaultInt}, {"len", {{"string", AnyChar, Rank::anyOrAssumedRank}, DefaultingKIND}, - KINDInt, Rank::scalar}, + KINDInt, Rank::scalar, IntrinsicClass::inquiryFunction}, {"len_trim", {{"string", AnyChar}, DefaultingKIND}, KINDInt}, {"lge", {{"string_a", SameChar}, {"string_b", SameChar}}, DefaultLogical}, {"lgt", {{"string_a", SameChar}, {"string_b", SameChar}}, DefaultLogical}, @@ -488,27 +498,27 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"matmul", {{"array_a", AnyLogical, Rank::vector}, {"array_b", AnyLogical, Rank::matrix}}, - ResultLogical, Rank::vector}, + ResultLogical, Rank::vector, IntrinsicClass::transformationalFunction}, {"matmul", {{"array_a", AnyLogical, Rank::matrix}, {"array_b", AnyLogical, Rank::vector}}, - ResultLogical, Rank::vector}, + ResultLogical, Rank::vector, IntrinsicClass::transformationalFunction}, {"matmul", {{"array_a", AnyLogical, Rank::matrix}, {"array_b", AnyLogical, Rank::matrix}}, - ResultLogical, Rank::matrix}, + ResultLogical, Rank::matrix, IntrinsicClass::transformationalFunction}, {"matmul", {{"array_a", AnyNumeric, Rank::vector}, {"array_b", AnyNumeric, Rank::matrix}}, - ResultNumeric, Rank::vector}, + ResultNumeric, Rank::vector, IntrinsicClass::transformationalFunction}, {"matmul", {{"array_a", AnyNumeric, Rank::matrix}, {"array_b", AnyNumeric, Rank::vector}}, - ResultNumeric, Rank::vector}, + ResultNumeric, Rank::vector, IntrinsicClass::transformationalFunction}, {"matmul", {{"array_a", AnyNumeric, Rank::matrix}, {"array_b", AnyNumeric, Rank::matrix}}, - ResultNumeric, Rank::matrix}, + ResultNumeric, Rank::matrix, IntrinsicClass::transformationalFunction}, {"maskl", {{"i", AnyInt}, DefaultingKIND}, KINDInt}, {"maskr", {{"i", AnyInt}, DefaultingKIND}, KINDInt}, {"max", @@ -520,15 +530,16 
@@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"a3", SameChar, Rank::elemental, Optionality::repeats}}, SameChar}, {"maxexponent", {{"x", AnyReal, Rank::anyOrAssumedRank}}, DefaultInt, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"maxloc", {{"array", AnyRelatable, Rank::array}, OptionalDIM, OptionalMASK, SizeDefaultKIND, {"back", AnyLogical, Rank::scalar, Optionality::optional}}, - KINDInt, Rank::dimReduced}, + KINDInt, Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"maxval", {{"array", SameRelatable, Rank::array}, OptionalDIM, OptionalMASK}, - SameRelatable, Rank::dimReduced}, + SameRelatable, Rank::dimReduced, + IntrinsicClass::transformationalFunction}, {"merge", {{"tsource", SameType}, {"fsource", SameType}, {"mask", AnyLogical}}, SameType}, @@ -548,25 +559,26 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"a3", SameChar, Rank::elemental, Optionality::repeats}}, SameChar}, {"minexponent", {{"x", AnyReal, Rank::anyOrAssumedRank}}, DefaultInt, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"minloc", {{"array", AnyRelatable, Rank::array}, OptionalDIM, OptionalMASK, SizeDefaultKIND, {"back", AnyLogical, Rank::scalar, Optionality::optional}}, - KINDInt, Rank::dimReduced}, + KINDInt, Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"minval", {{"array", SameRelatable, Rank::array}, OptionalDIM, OptionalMASK}, - SameRelatable, Rank::dimReduced}, + SameRelatable, Rank::dimReduced, + IntrinsicClass::transformationalFunction}, {"mod", {{"a", OperandIntOrReal}, {"p", OperandIntOrReal}}, OperandIntOrReal}, {"modulo", {{"a", OperandIntOrReal}, {"p", OperandIntOrReal}}, OperandIntOrReal}, {"nearest", {{"x", SameReal}, {"s", AnyReal}}, SameReal}, {"new_line", {{"x", SameChar, Rank::anyOrAssumedRank}}, SameChar, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"nint", {{"a", AnyReal}, DefaultingKIND}, KINDInt}, {"norm2", {{"x", SameReal, Rank::array}, 
OptionalDIM}, SameReal, - Rank::dimReduced}, + Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"not", {{"i", SameInt}}, SameInt}, // NULL() is a special case handled in Probe() below {"out_of_range", @@ -581,24 +593,25 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {{"array", SameType, Rank::array}, {"mask", AnyLogical, Rank::conformable}, {"vector", SameType, Rank::vector, Optionality::optional}}, - SameType, Rank::vector}, + SameType, Rank::vector, IntrinsicClass::transformationalFunction}, {"parity", {{"mask", SameLogical, Rank::array}, OptionalDIM}, SameLogical, - Rank::dimReduced}, + Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"popcnt", {{"i", AnyInt}}, DefaultInt}, {"poppar", {{"i", AnyInt}}, DefaultInt}, {"product", {{"array", SameNumeric, Rank::array}, OptionalDIM, OptionalMASK}, - SameNumeric, Rank::dimReduced}, + SameNumeric, Rank::dimReduced, + IntrinsicClass::transformationalFunction}, {"precision", {{"x", AnyFloating, Rank::anyOrAssumedRank}}, DefaultInt, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"present", {{"a", Addressable, Rank::anyOrAssumedRank}}, DefaultLogical, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"radix", {{"x", AnyIntOrReal, Rank::anyOrAssumedRank}}, DefaultInt, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"range", {{"x", AnyNumeric, Rank::anyOrAssumedRank}}, DefaultInt, - Rank::scalar}, - {"rank", {{"a", AnyData, Rank::anyOrAssumedRank}}, DefaultInt, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, + {"rank", {{"a", AnyData, Rank::anyOrAssumedRank}}, DefaultInt, Rank::scalar, + IntrinsicClass::inquiryFunction}, {"real", {{"a", SameComplex, Rank::elemental}}, SameReal}, // 16.9.160(4)(ii) {"real", {{"a", AnyNumeric, Rank::elementalOrBOZ}, DefaultingKIND}, @@ -608,19 +621,19 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"operation", SameType, Rank::reduceOperation}, OptionalDIM, 
OptionalMASK, {"identity", SameType, Rank::scalar}, {"ordered", AnyLogical, Rank::scalar, Optionality::optional}}, - SameType, Rank::dimReduced}, + SameType, Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"repeat", {{"string", SameChar, Rank::scalar}, {"ncopies", AnyInt}}, - SameChar, Rank::scalar}, + SameChar, Rank::scalar, IntrinsicClass::transformationalFunction}, {"reshape", {{"source", SameType, Rank::array}, {"shape", AnyInt, Rank::shape}, {"pad", SameType, Rank::array, Optionality::optional}, {"order", AnyInt, Rank::vector, Optionality::optional}}, - SameType, Rank::shaped}, + SameType, Rank::shaped, IntrinsicClass::transformationalFunction}, {"rrspacing", {{"x", SameReal}}, SameReal}, {"same_type_as", {{"a", ExtensibleDerived, Rank::anyOrAssumedRank}, {"b", ExtensibleDerived, Rank::anyOrAssumedRank}}, - DefaultLogical, Rank::scalar}, + DefaultLogical, Rank::scalar, IntrinsicClass::inquiryFunction}, {"scale", {{"x", SameReal}, {"i", AnyInt}}, SameReal}, {"scan", {{"string", SameChar}, {"set", SameChar}, @@ -628,27 +641,27 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ DefaultingKIND}, KINDInt}, {"selected_char_kind", {{"name", DefaultChar, Rank::scalar}}, DefaultInt, - Rank::scalar}, + Rank::scalar, IntrinsicClass::transformationalFunction}, {"selected_int_kind", {{"r", AnyInt, Rank::scalar}}, DefaultInt, - Rank::scalar}, + Rank::scalar, IntrinsicClass::transformationalFunction}, {"selected_real_kind", {{"p", AnyInt, Rank::scalar}, {"r", AnyInt, Rank::scalar, Optionality::optional}, {"radix", AnyInt, Rank::scalar, Optionality::optional}}, - DefaultInt, Rank::scalar}, + DefaultInt, Rank::scalar, IntrinsicClass::transformationalFunction}, {"selected_real_kind", {{"p", AnyInt, Rank::scalar, Optionality::optional}, {"r", AnyInt, Rank::scalar}, {"radix", AnyInt, Rank::scalar, Optionality::optional}}, - DefaultInt, Rank::scalar}, + DefaultInt, Rank::scalar, IntrinsicClass::transformationalFunction}, {"selected_real_kind", {{"p", 
AnyInt, Rank::scalar, Optionality::optional}, {"r", AnyInt, Rank::scalar, Optionality::optional}, {"radix", AnyInt, Rank::scalar}}, - DefaultInt, Rank::scalar}, + DefaultInt, Rank::scalar, IntrinsicClass::transformationalFunction}, {"set_exponent", {{"x", SameReal}, {"i", AnyInt}}, SameReal}, {"shape", {{"source", AnyData, Rank::anyOrAssumedRank}, SizeDefaultKIND}, - KINDInt, Rank::vector}, + KINDInt, Rank::vector, IntrinsicClass::inquiryFunction}, {"shifta", {{"i", SameInt}, {"shift", AnyInt}}, SameInt}, {"shiftl", {{"i", SameInt}, {"shift", AnyInt}}, SameInt}, {"shiftr", {{"i", SameInt}, {"shift", AnyInt}}, SameInt}, @@ -659,45 +672,49 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"size", {{"array", AnyData, Rank::anyOrAssumedRank}, OptionalDIM, SizeDefaultKIND}, - KINDInt, Rank::scalar}, + KINDInt, Rank::scalar, IntrinsicClass::inquiryFunction}, {"spacing", {{"x", SameReal}}, SameReal}, {"spread", {{"source", SameType, Rank::known}, RequiredDIM, {"ncopies", AnyInt, Rank::scalar}}, - SameType, Rank::rankPlus1}, + SameType, Rank::rankPlus1, IntrinsicClass::transformationalFunction}, {"sqrt", {{"x", SameFloating}}, SameFloating}, {"storage_size", {{"a", AnyData, Rank::anyOrAssumedRank}, SizeDefaultKIND}, - KINDInt, Rank::scalar}, + KINDInt, Rank::scalar, IntrinsicClass::inquiryFunction}, {"sum", {{"array", SameNumeric, Rank::array}, OptionalDIM, OptionalMASK}, - SameNumeric, Rank::dimReduced}, + SameNumeric, Rank::dimReduced, + IntrinsicClass::transformationalFunction}, {"tan", {{"x", SameFloating}}, SameFloating}, {"tand", {{"x", SameFloating}}, SameFloating}, {"tanh", {{"x", SameFloating}}, SameFloating}, - {"tiny", {{"x", SameReal, Rank::anyOrAssumedRank}}, SameReal, Rank::scalar}, + {"tiny", {{"x", SameReal, Rank::anyOrAssumedRank}}, SameReal, Rank::scalar, + IntrinsicClass::inquiryFunction}, {"trailz", {{"i", AnyInt}}, DefaultInt}, {"transfer", {{"source", AnyData, Rank::known}, {"mold", SameType, Rank::scalar}}, - SameType, Rank::scalar}, + 
SameType, Rank::scalar, IntrinsicClass::transformationalFunction}, {"transfer", {{"source", AnyData, Rank::known}, {"mold", SameType, Rank::array}}, - SameType, Rank::vector}, + SameType, Rank::vector, IntrinsicClass::transformationalFunction}, {"transfer", {{"source", AnyData, Rank::anyOrAssumedRank}, {"mold", SameType, Rank::anyOrAssumedRank}, {"size", AnyInt, Rank::scalar}}, - SameType, Rank::vector}, - {"transpose", {{"matrix", SameType, Rank::matrix}}, SameType, Rank::matrix}, - {"trim", {{"string", SameChar, Rank::scalar}}, SameChar, Rank::scalar}, + SameType, Rank::vector, IntrinsicClass::transformationalFunction}, + {"transpose", {{"matrix", SameType, Rank::matrix}}, SameType, Rank::matrix, + IntrinsicClass::transformationalFunction}, + {"trim", {{"string", SameChar, Rank::scalar}}, SameChar, Rank::scalar, + IntrinsicClass::transformationalFunction}, {"ubound", {{"array", AnyData, Rank::anyOrAssumedRank}, RequiredDIM, SizeDefaultKIND}, - KINDInt, Rank::scalar}, + KINDInt, Rank::scalar, IntrinsicClass::inquiryFunction}, {"ubound", {{"array", AnyData, Rank::anyOrAssumedRank}, SizeDefaultKIND}, - KINDInt, Rank::vector}, + KINDInt, Rank::vector, IntrinsicClass::inquiryFunction}, {"unpack", {{"vector", SameType, Rank::vector}, {"mask", AnyLogical, Rank::array}, {"field", SameType, Rank::conformable}}, - SameType, Rank::conformable}, + SameType, Rank::conformable, IntrinsicClass::transformationalFunction}, {"verify", {{"string", SameChar}, {"set", SameChar}, {"back", AnyLogical, Rank::elemental, Optionality::optional}, @@ -900,33 +917,34 @@ static const SpecificIntrinsicInterface specificIntrinsicFunction[]{ }; static const IntrinsicInterface intrinsicSubroutine[]{ - {"cpu_time", {{"time", AnyReal, Rank::scalar}}, {}}, + {"cpu_time", {{"time", AnyReal, Rank::scalar}}, {}, Rank::elemental, + IntrinsicClass::impureSubroutine}, {"date_and_time", {{"date", DefaultChar, Rank::scalar, Optionality::optional}, {"time", DefaultChar, Rank::scalar, Optionality::optional}, 
{"zone", DefaultChar, Rank::scalar, Optionality::optional}, {"values", AnyInt, Rank::vector, Optionality::optional}}, - {}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, {"execute_command_line", {{"command", DefaultChar, Rank::scalar}, {"wait", AnyLogical, Rank::scalar, Optionality::optional}, {"exitstat", AnyInt, Rank::scalar, Optionality::optional}, {"cmdstat", AnyInt, Rank::scalar, Optionality::optional}, {"cmdmsg", DefaultChar, Rank::scalar, Optionality::optional}}, - {}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, {"get_command", {{"command", DefaultChar, Rank::scalar, Optionality::optional}, {"length", AnyInt, Rank::scalar, Optionality::optional}, {"status", AnyInt, Rank::scalar, Optionality::optional}, {"errmsg", DefaultChar, Rank::scalar, Optionality::optional}}, - {}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, {"get_command_argument", {{"number", AnyInt, Rank::scalar}, {"value", DefaultChar, Rank::scalar, Optionality::optional}, {"length", AnyInt, Rank::scalar, Optionality::optional}, {"status", AnyInt, Rank::scalar, Optionality::optional}, {"errmsg", DefaultChar, Rank::scalar, Optionality::optional}}, - {}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, {"get_environment_variable", {{"name", DefaultChar, Rank::scalar}, {"value", DefaultChar, Rank::scalar, Optionality::optional}, @@ -934,31 +952,34 @@ static const IntrinsicInterface intrinsicSubroutine[]{ {"status", AnyInt, Rank::scalar, Optionality::optional}, {"trim_name", AnyLogical, Rank::scalar, Optionality::optional}, {"errmsg", DefaultChar, Rank::scalar, Optionality::optional}}, - {}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, {"move_alloc", {{"from", SameType, Rank::known}, {"to", SameType, Rank::known}, {"stat", AnyInt, Rank::scalar, Optionality::optional}, {"errmsg", DefaultChar, Rank::scalar, Optionality::optional}}, - {}}, + {}, Rank::elemental, IntrinsicClass::pureSubroutine}, {"mvbits", {{"from", SameInt}, {"frompos", 
AnyInt}, {"len", AnyInt}, {"to", SameInt}, {"topos", AnyInt}}, - {}}, // elemental + {}, Rank::elemental, IntrinsicClass::elementalSubroutine}, // elemental {"random_init", {{"repeatable", AnyLogical, Rank::scalar}, {"image_distinct", AnyLogical, Rank::scalar}}, - {}}, - {"random_number", {{"harvest", AnyReal, Rank::known}}, {}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, + {"random_number", {{"harvest", AnyReal, Rank::known}}, {}, Rank::elemental, + IntrinsicClass::impureSubroutine}, {"random_seed", {{"size", DefaultInt, Rank::scalar, Optionality::optional}, {"put", DefaultInt, Rank::vector, Optionality::optional}, {"get", DefaultInt, Rank::vector, Optionality::optional}}, - {}}, // TODO: at most one argument can be present + {}, Rank::elemental, + IntrinsicClass::impureSubroutine}, // TODO: at most one argument can be + // present {"system_clock", {{"count", AnyInt, Rank::scalar, Optionality::optional}, {"count_rate", AnyIntOrReal, Rank::scalar, Optionality::optional}, {"count_max", AnyInt, Rank::scalar, Optionality::optional}}, - {}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, }; // TODO: Intrinsic subroutine EVENT_QUERY @@ -1532,6 +1553,8 @@ class IntrinsicProcTable::Implementation { bool IsIntrinsic(const std::string &) const; + IntrinsicClass GetIntrinsicClass(const std::string &) const; + std::optional Probe(const CallCharacteristics &, ActualArguments &, FoldingContext &, const IntrinsicProcTable &) const; @@ -1571,6 +1594,23 @@ bool IntrinsicProcTable::Implementation::IsIntrinsic( return name == "null" || name == "__builtin_c_f_pointer"; } +IntrinsicClass IntrinsicProcTable::Implementation::GetIntrinsicClass( + const std::string &name) const { + auto specificIntrinsic{specificFuncs_.find(name)}; + if (specificIntrinsic != specificFuncs_.end()) { + return specificIntrinsic->second->intrinsicClass; + } + auto genericIntrinsic{genericFuncs_.find(name)}; + if (genericIntrinsic != genericFuncs_.end()) { + return 
genericIntrinsic->second->intrinsicClass; + } + auto subrIntrinsic{subroutines_.find(name)}; + if (subrIntrinsic != subroutines_.end()) { + return subrIntrinsic->second->intrinsicClass; + } + return IntrinsicClass::noClass; +} + bool CheckAndRearrangeArguments(ActualArguments &arguments, parser::ContextualMessages &messages, const char *const dummyKeywords[], std::size_t trailingOptionals) { @@ -2014,6 +2054,11 @@ bool IntrinsicProcTable::IsIntrinsic(const std::string &name) const { return DEREF(impl_).IsIntrinsic(name); } +IntrinsicClass IntrinsicProcTable::GetIntrinsicClass( + const std::string &name) const { + return DEREF(impl_).GetIntrinsicClass(name); +} + std::optional IntrinsicProcTable::Probe( const CallCharacteristics &call, ActualArguments &arguments, FoldingContext &context) const { diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index da02b4fbe47f3e..edbd01d4eca07c 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -33,7 +33,10 @@ class CheckHelper { void Check() { Check(context_.globalScope()); } void Check(const ParamValue &, bool canBeAssumed); - void Check(const Bound &bound) { CheckSpecExpr(bound.GetExplicit()); } + void Check(const Bound &bound) { + CheckSpecExpr( + bound.GetExplicit(), evaluate::SpecificationExprContext::BOUND); + } void Check(const ShapeSpec &spec) { Check(spec.lbound()); Check(spec.ubound()); @@ -44,7 +47,9 @@ class CheckHelper { void Check(const Scope &); private: - template void CheckSpecExpr(const A &x) { + template + void CheckSpecExpr( + const A &x, const evaluate::SpecificationExprContext specExprContext) { if (symbolBeingChecked_ && IsSaved(*symbolBeingChecked_)) { if (!evaluate::IsConstantExpr(x)) { messages_.Say( @@ -52,18 +57,23 @@ class CheckHelper { symbolBeingChecked_->name()); } } else { - evaluate::CheckSpecificationExpr(x, messages_, DEREF(scope_)); + evaluate::CheckSpecificationExpr( + x, messages_, 
DEREF(scope_), context_.intrinsics(), specExprContext); } } - template void CheckSpecExpr(const std::optional &x) { + template + void CheckSpecExpr(const std::optional &x, + const evaluate::SpecificationExprContext specExprContext) { if (x) { - CheckSpecExpr(*x); + CheckSpecExpr(*x, specExprContext); } } - template void CheckSpecExpr(A &x) { + template + void CheckSpecExpr( + A &x, const evaluate::SpecificationExprContext specExprContext) { x = Fold(foldingContext_, std::move(x)); const A &constx{x}; - CheckSpecExpr(constx); + CheckSpecExpr(constx, specExprContext); } void CheckValue(const Symbol &, const DerivedTypeSpec *); void CheckVolatile( @@ -131,7 +141,8 @@ void CheckHelper::Check(const ParamValue &value, bool canBeAssumed) { " external function result"_err_en_US); } } else { - CheckSpecExpr(value.GetExplicit()); + CheckSpecExpr( + value.GetExplicit(), evaluate::SpecificationExprContext::TYPE_PARAM); } } @@ -384,15 +395,25 @@ void CheckHelper::CheckObjectEntity( CheckAssumedTypeEntity(symbol, details); symbolBeingChecked_ = nullptr; if (!details.coshape().empty()) { + bool isDeferredShape{details.coshape().IsDeferredShape()}; if (IsAllocatable(symbol)) { - if (!details.coshape().IsDeferredShape()) { // C827 - messages_.Say( - "ALLOCATABLE coarray must have a deferred coshape"_err_en_US); + if (!isDeferredShape) { // C827 + messages_.Say("'%s' is an ALLOCATABLE coarray and must have a deferred" + " coshape"_err_en_US, + symbol.name()); } + } else if (symbol.owner().IsDerivedType()) { // C746 + std::string deferredMsg{ + isDeferredShape ? 
"" : " and have a deferred coshape"}; + messages_.Say("Component '%s' is a coarray and must have the ALLOCATABLE" + " attribute%s"_err_en_US, + symbol.name(), deferredMsg); } else { if (!details.coshape().IsAssumedSize()) { // C828 messages_.Say( - "Non-ALLOCATABLE coarray must have an explicit coshape"_err_en_US); + "Component '%s' is a non-ALLOCATABLE coarray and must have" + " an explicit coshape"_err_en_US, + symbol.name()); } } } @@ -409,7 +430,8 @@ void CheckHelper::CheckObjectEntity( "An INTENT(OUT) dummy argument may not be, or contain, EVENT_TYPE or LOCK_TYPE"_err_en_US); } } - if (InPure() && !IsPointer(symbol) && !IsIntentIn(symbol) && + if (InPure() && !IsStmtFunction(DEREF(innermostSymbol_)) && + !IsPointer(symbol) && !IsIntentIn(symbol) && !symbol.attrs().test(Attr::VALUE)) { if (InFunction()) { // C1583 messages_.Say( diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 3431bc05392ef3..9306f702aabbd1 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -2092,13 +2092,14 @@ std::optional ExpressionAnalyzer::CheckCall( } semantics::CheckArguments(*chars, arguments, GetFoldingContext(), context_.FindScope(callSite), treatExternalAsImplicit); - if (!chars->attrs.test(characteristics::Procedure::Attr::Pure)) { + const Symbol *procSymbol{proc.GetSymbol()}; + if (procSymbol && !IsPureProcedure(*procSymbol)) { if (const semantics::Scope * pure{semantics::FindPureProcedureContaining( context_.FindScope(callSite))}) { Say(callSite, "Procedure '%s' referenced in pure subprogram '%s' must be pure too"_err_en_US, - DEREF(proc.GetSymbol()).name(), DEREF(pure->symbol()).name()); + procSymbol->name(), DEREF(pure->symbol()).name()); } } } diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 6d04c7f229ed15..e51c33988d0d7e 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -3679,7 +3679,7 @@ bool 
DeclarationVisitor::Pre(const parser::DerivedTypeDef &x) { if (symbol->has() && !paramNames.count(name)) { SayDerivedType(name, "'%s' is not a type parameter of this derived type"_err_en_US, - currScope()); // C742 + currScope()); // C741 } } Walk(std::get>>(x.t)); @@ -3820,14 +3820,50 @@ void DeclarationVisitor::Post(const parser::ComponentDecl &x) { !attrs.HasAny({Attr::PUBLIC, Attr::PRIVATE})) { attrs.set(Attr::PRIVATE); } - if (!attrs.HasAny({Attr::POINTER, Attr::ALLOCATABLE})) { - if (const auto *declType{GetDeclTypeSpec()}) { - if (const auto *derived{declType->AsDerived()}) { + if (const auto *declType{GetDeclTypeSpec()}) { + if (const auto *derived{declType->AsDerived()}) { + if (!attrs.HasAny({Attr::POINTER, Attr::ALLOCATABLE})) { if (derivedTypeInfo_.type == &derived->typeSymbol()) { // C744 Say("Recursive use of the derived type requires " "POINTER or ALLOCATABLE"_err_en_US); } } + if (!coarraySpec().empty()) { // C747 + if (IsTeamType(derived)) { + Say("A coarray component may not be of type TEAM_TYPE from " + "ISO_FORTRAN_ENV"_err_en_US); + } else { + if (IsIsoCType(derived)) { + Say("A coarray component may not be of type C_PTR or C_FUNPTR from " + "ISO_C_BINDING"_err_en_US); + } + } + } + if (auto it{FindCoarrayUltimateComponent(*derived)}) { // C748 + std::string ultimateName{it.BuildResultDesignatorName()}; + // Strip off the leading "%" + if (ultimateName.length() > 1) { + ultimateName.erase(0, 1); + if (attrs.HasAny({Attr::POINTER, Attr::ALLOCATABLE})) { + evaluate::AttachDeclaration( + Say(name.source, + "A component with a POINTER or ALLOCATABLE attribute may " + "not " + "be of a type with a coarray ultimate component (named " + "'%s')"_err_en_US, + ultimateName), + derived->typeSymbol()); + } + if (!arraySpec().empty() || !coarraySpec().empty()) { + evaluate::AttachDeclaration( + Say(name.source, + "An array or coarray component may not be of a type with a " + "coarray ultimate component (named '%s')"_err_en_US, + ultimateName), + 
derived->typeSymbol()); + } + } + } } } if (OkToAddComponent(name)) { @@ -4741,7 +4777,7 @@ Symbol *DeclarationVisitor::MakeTypeSymbol( const SourceName &name, Details &&details) { Scope &derivedType{currScope()}; CHECK(derivedType.IsDerivedType()); - if (auto *symbol{FindInScope(derivedType, name)}) { + if (auto *symbol{FindInScope(derivedType, name)}) { // C742 Say2(name, "Type parameter, component, or procedure binding '%s'" " already defined in this type"_err_en_US, diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index 249dcb27b65afc..3b68beaa557fc7 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -270,6 +270,24 @@ bool IsPureProcedure(const Symbol &symbol) { } else if (!IsProcedure(symbol)) { return false; } + if (IsStmtFunction(symbol)) { + // Section 15.7(1) states that a statement function is PURE if it does not + // reference an IMPURE procedure or a VOLATILE variable + const MaybeExpr &expr{symbol.get().stmtFunction()}; + if (expr) { + for (const Symbol &refSymbol : evaluate::CollectSymbols(*expr)) { + if (IsFunction(refSymbol) && !IsPureProcedure(refSymbol)) { + return false; + } + if (const Symbol * root{GetAssociationRoot(refSymbol)}) { + if (root->attrs().test(Attr::VOLATILE)) { + return false; + } + } + } + } + return true; // statement function was not found to be impure + } return symbol.attrs().test(Attr::PURE) || (symbol.attrs().test(Attr::ELEMENTAL) && !symbol.attrs().test(Attr::IMPURE)); @@ -1356,4 +1374,5 @@ void LabelEnforce::SayWithConstruct(SemanticsContext &context, context.Say(stmtLocation, message) .Attach(constructLocation, GetEnclosingConstructMsg()); } + } // namespace Fortran::semantics diff --git a/flang/test/Semantics/allocate11.f90 b/flang/test/Semantics/allocate11.f90 index 594bd1ded385f2..01b9944019ae39 100644 --- a/flang/test/Semantics/allocate11.f90 +++ b/flang/test/Semantics/allocate11.f90 @@ -38,6 +38,7 @@ subroutine C937(var) type B type(A) y + !ERROR: A component 
with a POINTER or ALLOCATABLE attribute may not be of a type with a coarray ultimate component (named 'y%x') type(B), pointer :: forward real :: u end type @@ -47,6 +48,7 @@ subroutine C937(var) end type type D + !ERROR: A component with a POINTER or ALLOCATABLE attribute may not be of a type with a coarray ultimate component (named 'x') type(A), pointer :: potential end type diff --git a/flang/test/Semantics/call12.f90 b/flang/test/Semantics/call12.f90 index e25a2608c44117..65da46b067d6cd 100644 --- a/flang/test/Semantics/call12.f90 +++ b/flang/test/Semantics/call12.f90 @@ -15,7 +15,7 @@ module m real, pointer :: p end type type :: hasCoarray - real :: co[*] + real, allocatable :: co[:] end type contains pure function test(ptr, in, hpd) diff --git a/flang/test/Semantics/call14.f90 b/flang/test/Semantics/call14.f90 index b874e6b009125a..ee5086511de3b8 100644 --- a/flang/test/Semantics/call14.f90 +++ b/flang/test/Semantics/call14.f90 @@ -3,7 +3,7 @@ module m type :: hasCoarray - real :: coarray[*] + real, allocatable :: coarray[:] end type contains !ERROR: VALUE attribute may apply only to a dummy data object diff --git a/flang/test/Semantics/misc-declarations.f90 b/flang/test/Semantics/misc-declarations.f90 index 7680eed793bce1..f627836b3732c8 100644 --- a/flang/test/Semantics/misc-declarations.f90 +++ b/flang/test/Semantics/misc-declarations.f90 @@ -4,12 +4,12 @@ ! - 8.5.19 constraints on the VOLATILE attribute module m - !ERROR: ALLOCATABLE coarray must have a deferred coshape + !ERROR: 'mustbedeferred' is an ALLOCATABLE coarray and must have a deferred coshape real, allocatable :: mustBeDeferred[*] ! C827 - !ERROR: Non-ALLOCATABLE coarray must have an explicit coshape + !ERROR: Component 'mustbeexplicit' is a non-ALLOCATABLE coarray and must have an explicit coshape real :: mustBeExplicit[:] ! 
C828 type :: hasCoarray - real :: coarray[*] + real, allocatable :: coarray[:] end type real :: coarray[*] type(hasCoarray) :: coarrayComponent diff --git a/flang/test/Semantics/modfile24.f90 b/flang/test/Semantics/modfile24.f90 index ec446f9e8d3c39..45f6c0545627fe 100644 --- a/flang/test/Semantics/modfile24.f90 +++ b/flang/test/Semantics/modfile24.f90 @@ -36,8 +36,8 @@ module m2 ! coarray-spec in components and with non-constants bounds module m3 type t - real :: c[1:10,1:*] - complex, codimension[5,*] :: d + real, allocatable :: c[:,:] + complex, allocatable, codimension[:,:] :: d end type real, allocatable :: e[:,:,:] contains @@ -50,8 +50,8 @@ subroutine s(a, b, n) !Expect: m3.mod !module m3 ! type::t -! real(4)::c[1_8:10_8,1_8:*] -! complex(4)::d[1_8:5_8,1_8:*] +! real(4),allocatable::c[:,:] +! complex(4),allocatable::d[:,:] ! end type ! real(4),allocatable::e[:,:,:] !contains diff --git a/flang/test/Semantics/resolve33.f90 b/flang/test/Semantics/resolve33.f90 index 3fa6bec15f2c19..7df5ba935ab0c3 100644 --- a/flang/test/Semantics/resolve33.f90 +++ b/flang/test/Semantics/resolve33.f90 @@ -2,6 +2,12 @@ ! Derived type parameters ! C731 The same type-param-name shall not appear more than once in a given ! derived-type-stmt. +! C741 A type-param-name in a type-param-def-stmt in a derived-type-def shall +! be one of the type-paramnames in the derived-type-stmt of that +! derived-type-def. +! C742 Each type-param-name in the derived-type-stmt in a derived-type-def +! shall appear exactly once as a type-param-name in a type-param-def-stmt +! in that derived-type-def. module m !ERROR: Duplicate type parameter name: 'a' diff --git a/flang/test/Semantics/resolve44.f90 b/flang/test/Semantics/resolve44.f90 index 2d8b7017875372..41ab06ffb6c6ae 100644 --- a/flang/test/Semantics/resolve44.f90 +++ b/flang/test/Semantics/resolve44.f90 @@ -1,5 +1,8 @@ ! RUN: %B/test/Semantics/test_errors.sh %s %flang %t ! Error tests for recursive use of derived types. +! 
C744 If neither the POINTER nor the ALLOCATABLE attribute is specified, the +! declaration-type-spec in the component-def-stmt shall specify an intrinsic +! type or a previously defined derived type. program main type :: recursive1 diff --git a/flang/test/Semantics/resolve88.f90 b/flang/test/Semantics/resolve88.f90 new file mode 100644 index 00000000000000..50135297241c47 --- /dev/null +++ b/flang/test/Semantics/resolve88.f90 @@ -0,0 +1,75 @@ +! RUN: %B/test/Semantics/test_errors.sh %s %flang %t +! C746, C747, and C748 +module m + use ISO_FORTRAN_ENV + use ISO_C_BINDING + + ! C746 If a coarray-spec appears, it shall be a deferred-coshape-spec-list and + ! the component shall have the ALLOCATABLE attribute. + + type testCoArrayType + real, allocatable, codimension[:] :: allocatableField + !ERROR: Component 'deferredfield' is a coarray and must have the ALLOCATABLE attribute + real, codimension[:] :: deferredField + !ERROR: 'pointerfield' may not have the POINTER attribute because it is a coarray + !ERROR: Component 'pointerfield' is a coarray and must have the ALLOCATABLE attribute + real, pointer, codimension[:] :: pointerField + !ERROR: Component 'realfield' is a coarray and must have the ALLOCATABLE attribute and have a deferred coshape + real, codimension[*] :: realField + !ERROR: 'realfield2' is an ALLOCATABLE coarray and must have a deferred coshape + real, allocatable, codimension[*] :: realField2 + end type testCoArrayType + + ! C747 If a coarray-spec appears, the component shall not be of type C_PTR or + ! C_FUNPTR from the intrinsic module ISO_C_BINDING (18.2), or of type + ! TEAM_TYPE from the intrinsic module ISO_FORTRAN_ENV (16.10.2). 
+ + type goodCoarrayType + real, allocatable, codimension[:] :: field + end type goodCoarrayType + + type goodTeam_typeCoarrayType + type(team_type), allocatable :: field + end type goodTeam_typeCoarrayType + + type goodC_ptrCoarrayType + type(c_ptr), allocatable :: field + end type goodC_ptrCoarrayType + + type goodC_funptrCoarrayType + type(c_funptr), allocatable :: field + end type goodC_funptrCoarrayType + + type team_typeCoarrayType + !ERROR: A coarray component may not be of type TEAM_TYPE from ISO_FORTRAN_ENV + type(team_type), allocatable, codimension[:] :: field + end type team_typeCoarrayType + + type c_ptrCoarrayType + !ERROR: A coarray component may not be of type C_PTR or C_FUNPTR from ISO_C_BINDING + type(c_ptr), allocatable, codimension[:] :: field + end type c_ptrCoarrayType + + type c_funptrCoarrayType + !ERROR: A coarray component may not be of type C_PTR or C_FUNPTR from ISO_C_BINDING + type(c_funptr), allocatable, codimension[:] :: field + end type c_funptrCoarrayType + +! C748 A data component whose type has a coarray ultimate component shall be a +! nonpointer nonallocatable scalar and shall not be a coarray. 
+ + type coarrayType + real, allocatable, codimension[:] :: goodCoarrayField + end type coarrayType + + type testType + type(coarrayType) :: goodField + !ERROR: A component with a POINTER or ALLOCATABLE attribute may not be of a type with a coarray ultimate component (named 'goodcoarrayfield') + type(coarrayType), pointer :: pointerField + !ERROR: A component with a POINTER or ALLOCATABLE attribute may not be of a type with a coarray ultimate component (named 'goodcoarrayfield') + type(coarrayType), allocatable :: allocatableField + !ERROR: An array or coarray component may not be of a type with a coarray ultimate component (named 'goodcoarrayfield') + type(coarrayType), dimension(3) :: arrayField + end type testType + +end module m diff --git a/flang/test/Semantics/resolve89.f90 b/flang/test/Semantics/resolve89.f90 new file mode 100644 index 00000000000000..883970f30edf8e --- /dev/null +++ b/flang/test/Semantics/resolve89.f90 @@ -0,0 +1,110 @@ +! RUN: %B/test/Semantics/test_errors.sh %s %flang %t +! C750 Each bound in the explicit-shape-spec shall be a specification +! expression in which there are no references to specification functions or +! the intrinsic functions ALLOCATED, ASSOCIATED, EXTENDS_- TYPE_OF, PRESENT, +! or SAME_TYPE_AS, every specification inquiry reference is a constant +! expression, and the value does not depend on the value of a variable. +impure function impureFunc() + integer :: impureFunc + + impureFunc = 3 +end function impureFunc + +pure function pureFunc() + integer :: pureFunc + + pureFunc = 3 +end function pureFunc + +module m + real, allocatable :: mVar +end module m + +subroutine s(iArg, allocArg, pointerArg, arrayArg, ioArg, optionalArg) + use m + implicit logical(l) + integer, intent(in) :: iArg + real, allocatable, intent(in) :: allocArg + real, pointer, intent(in) :: pointerArg + integer, dimension(:), intent(in) :: arrayArg + integer, intent(inout) :: ioArg + real, optional, intent(in) :: optionalArg + + ! 
These declarations are OK since they're not in a derived type + real :: realVar + real, volatile :: volatileVar + real, dimension(merge(1, 2, allocated(allocArg))) :: realVar1 + real, dimension(merge(1, 2, associated(pointerArg))) :: realVar2 + real, dimension(merge(1, 2, is_contiguous(arrayArg))) :: realVar3 + real, dimension(ioArg) :: realVar4 + real, dimension(merge(1, 2, present(optionalArg))) :: realVar5 + + ! statement functions referenced below + iVolatileStmtFunc() = 3 * volatileVar + iImpureStmtFunc() = 3 * impureFunc() + iPureStmtFunc() = 3 * pureFunc() + + ! This is OK + real, dimension(merge(1, 2, allocated(mVar))) :: rVar + + + integer :: var = 3 + !ERROR: Invalid specification expression: reference to impure function 'ivolatilestmtfunc' + real, dimension(iVolatileStmtFunc()) :: arrayVarWithVolatile + !ERROR: Invalid specification expression: reference to impure function 'iimpurestmtfunc' + real, dimension(iImpureStmtFunc()) :: arrayVarWithImpureFunction + !ERROR: Invalid specification expression: reference to statement function 'ipurestmtfunc' + real, dimension(iPureStmtFunc()) :: arrayVarWithPureFunction + real, dimension(iabs(iArg)) :: arrayVarWithIntrinsic + + type arrayType + !ERROR: Invalid specification expression: reference to variable 'var' not allowed for derived type components + real, dimension(var) :: varField + !ERROR: Invalid specification expression: reference to impure function 'ivolatilestmtfunc' + real, dimension(iVolatileStmtFunc()) :: arrayFieldWithVolatile + !ERROR: Invalid specification expression: reference to impure function 'iimpurestmtfunc' + real, dimension(iImpureStmtFunc()) :: arrayFieldWithImpureFunction + !ERROR: Invalid specification expression: reference to statement function 'ipurestmtfunc' + real, dimension(iPureStmtFunc()) :: arrayFieldWithPureFunction + !ERROR: Invalid specification expression: reference to variable 'iarg' not allowed for derived type components + real, dimension(iabs(iArg)) :: 
arrayFieldWithIntrinsic + !ERROR: Invalid specification expression: reference to intrinsic 'allocated' not allowed for derived type components + real, dimension(merge(1, 2, allocated(allocArg))) :: realField1 + !ERROR: Invalid specification expression: reference to intrinsic 'associated' not allowed for derived type components + real, dimension(merge(1, 2, associated(pointerArg))) :: realField2 + !ERROR: Invalid specification expression: non-constant reference to inquiry intrinsic 'is_contiguous' not allowed for derived type components + real, dimension(merge(1, 2, is_contiguous(arrayArg))) :: realField3 + !ERROR: Invalid specification expression: reference to variable 'ioarg' not allowed for derived type components + real, dimension(ioArg) :: realField4 + !ERROR: Invalid specification expression: reference to intrinsic 'present' not allowed for derived type components + real, dimension(merge(1, 2, present(optionalArg))) :: realField5 + end type arrayType + +end subroutine s + +subroutine s1() + ! C750, check for a constant specification inquiry that's a type parameter + ! inquiry which are defined in 9.4.5 + type derived(kindParam, lenParam) + integer, kind :: kindParam = 3 + integer, len :: lenParam = 3 + end type + + contains + subroutine inner (derivedArg) + type(derived), intent(in), dimension(3) :: derivedArg + integer :: localInt + + type(derived), parameter :: localderived = derived() + + type localDerivedType + ! OK because the specification inquiry is a constant + integer, dimension(localDerived%kindParam) :: goodField + !ERROR: Invalid specification expression: non-constant reference to a type parameter inquiry not allowed for derived type components + integer, dimension(derivedArg%lenParam) :: badField + end type localDerivedType + + ! 
OK because we're not defining a component + integer, dimension(derivedArg%kindParam) :: localVar + end subroutine inner +end subroutine s1 diff --git a/flang/test/lit.cfg.py b/flang/test/lit.cfg.py index 439f9710ef6600..c1aa851097b7ca 100644 --- a/flang/test/lit.cfg.py +++ b/flang/test/lit.cfg.py @@ -57,18 +57,14 @@ config.substitutions.append(('%B', config.flang_obj_root)) # For each occurrence of a flang tool name, replace it with the full path to -# the build directory holding that tool. We explicitly specify the directories -# to search to ensure that we get the tools just built and not some random -# tools that might happen to be in the user's PATH. -tool_dirs = [config.llvm_tools_dir, config.flang_tools_dir] -flang_includes = "-I" + config.flang_intrinsic_modules_dir - -tools = [ToolSubst('%flang', command=FindTool('flang'), unresolved='fatal'), - ToolSubst('%f18', command=FindTool('f18'), unresolved='fatal'), - ToolSubst('%f18_with_includes', command=FindTool('f18'), - extra_args=[flang_includes], unresolved='fatal')] - -llvm_config.add_tool_substitutions(tools, tool_dirs) +# the build directory holding that tool. 
+tools = [ + ToolSubst('%flang', command=FindTool('flang'), unresolved='fatal'), + ToolSubst('%f18', command=FindTool('f18'), unresolved='fatal'), + ToolSubst('%f18_with_includes', command=FindTool('f18'), + extra_args=["-I" + config.flang_intrinsic_modules_dir], unresolved='fatal') +] +llvm_config.add_tool_substitutions(tools, [config.flang_llvm_tools_dir]) # Enable libpgmath testing result = lit_config.params.get("LIBPGMATH") diff --git a/flang/test/lit.site.cfg.py.in b/flang/test/lit.site.cfg.py.in index 92bd926ab5cac5..e8e2945a2cbf0d 100644 --- a/flang/test/lit.site.cfg.py.in +++ b/flang/test/lit.site.cfg.py.in @@ -7,7 +7,7 @@ config.flang_obj_root = "@FLANG_BINARY_DIR@" config.flang_src_dir = "@FLANG_SOURCE_DIR@" config.flang_tools_dir = "@FLANG_TOOLS_DIR@" config.flang_intrinsic_modules_dir = "@FLANG_INTRINSIC_MODULES_DIR@" -config.flang_llvm_tools_dir = "@LLVM_RUNTIME_OUTPUT_INTDIR@" +config.flang_llvm_tools_dir = "@CMAKE_BINARY_DIR@/bin" config.python_executable = "@PYTHON_EXECUTABLE@" # Support substitution of the tools_dir with user parameters. 
This is diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt index c3c43c04edd766..86434b253befd5 100644 --- a/flang/tools/f18/CMakeLists.txt +++ b/flang/tools/f18/CMakeLists.txt @@ -1,7 +1,10 @@ +set(LLVM_LINK_COMPONENTS + Support + ) add_flang_tool(f18 dump.cpp f18.cpp -) + ) target_link_libraries(f18 PRIVATE @@ -10,7 +13,6 @@ target_link_libraries(f18 FortranEvaluate FortranSemantics FortranLower - LLVMSupport ) set(MODULES diff --git a/libc/test/src/math/cosf_test.cpp b/libc/test/src/math/cosf_test.cpp index 94c66cda1b0fea..54bba168cadf4e 100644 --- a/libc/test/src/math/cosf_test.cpp +++ b/libc/test/src/math/cosf_test.cpp @@ -76,7 +76,7 @@ TEST(CosfTest, InFloatRange) { float x = as_float(v); if (isnan(x) || isinf(x)) continue; - EXPECT_TRUE(mpfr::equalsCos(x, __llvm_libc::cosf(x), tolerance)); + ASSERT_MPFR_MATCH(mpfr::OP_Cos, x, __llvm_libc::cosf(x), tolerance); } } @@ -84,12 +84,12 @@ TEST(CosfTest, InFloatRange) { TEST(CosfTest, SmallValues) { float x = as_float(0x17800000); float result = __llvm_libc::cosf(x); - EXPECT_TRUE(mpfr::equalsCos(x, result, tolerance)); + EXPECT_MPFR_MATCH(mpfr::OP_Cos, x, result, tolerance); EXPECT_EQ(FloatBits::One, as_uint32_bits(result)); - x = as_float(0x00400000); + x = as_float(0x00400000); result = __llvm_libc::cosf(x); - EXPECT_TRUE(mpfr::equalsCos(x, result, tolerance)); + EXPECT_MPFR_MATCH(mpfr::OP_Cos, x, result, tolerance); EXPECT_EQ(FloatBits::One, as_uint32_bits(result)); } @@ -98,6 +98,6 @@ TEST(CosfTest, SDCOMP_26094) { for (uint32_t v : sdcomp26094Values) { float x = as_float(v); - EXPECT_TRUE(mpfr::equalsCos(x, __llvm_libc::cosf(x), tolerance)); + ASSERT_MPFR_MATCH(mpfr::OP_Cos, x, __llvm_libc::cosf(x), tolerance); } } diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp index 36e6b4a129a7cf..93b827a2ad374d 100644 --- a/libc/test/src/math/sincosf_test.cpp +++ b/libc/test/src/math/sincosf_test.cpp @@ -87,8 +87,8 @@ 
TEST(SinCosfTest, InFloatRange) { float sin, cos; __llvm_libc::sincosf(x, &sin, &cos); - EXPECT_TRUE(mpfr::equalsCos(x, cos, tolerance)); - EXPECT_TRUE(mpfr::equalsSin(x, sin, tolerance)); + ASSERT_MPFR_MATCH(mpfr::OP_Cos, x, cos, tolerance); + ASSERT_MPFR_MATCH(mpfr::OP_Sin, x, sin, tolerance); } } @@ -98,16 +98,16 @@ TEST(SinCosfTest, SmallValues) { float x = as_float(bits); float result_cos, result_sin; __llvm_libc::sincosf(x, &result_sin, &result_cos); - EXPECT_TRUE(mpfr::equalsCos(x, result_cos, tolerance)); - EXPECT_TRUE(mpfr::equalsSin(x, result_sin, tolerance)); + EXPECT_MPFR_MATCH(mpfr::OP_Cos, x, result_cos, tolerance); + EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, result_sin, tolerance); EXPECT_EQ(FloatBits::One, as_uint32_bits(result_cos)); EXPECT_EQ(bits, as_uint32_bits(result_sin)); bits = 0x00400000; x = as_float(bits); __llvm_libc::sincosf(x, &result_sin, &result_cos); - EXPECT_TRUE(mpfr::equalsCos(x, result_cos, tolerance)); - EXPECT_TRUE(mpfr::equalsSin(x, result_sin, tolerance)); + EXPECT_MPFR_MATCH(mpfr::OP_Cos, x, result_cos, tolerance); + EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, result_sin, tolerance); EXPECT_EQ(FloatBits::One, as_uint32_bits(result_cos)); EXPECT_EQ(bits, as_uint32_bits(result_sin)); } @@ -119,7 +119,7 @@ TEST(SinCosfTest, SDCOMP_26094) { float x = as_float(v); float sin, cos; __llvm_libc::sincosf(x, &sin, &cos); - EXPECT_TRUE(mpfr::equalsCos(x, cos, tolerance)); - EXPECT_TRUE(mpfr::equalsSin(x, sin, tolerance)); + EXPECT_MPFR_MATCH(mpfr::OP_Cos, x, cos, tolerance); + EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, sin, tolerance); } } diff --git a/libc/test/src/math/sinf_test.cpp b/libc/test/src/math/sinf_test.cpp index e4c6e818b57a39..c0ce0755964c99 100644 --- a/libc/test/src/math/sinf_test.cpp +++ b/libc/test/src/math/sinf_test.cpp @@ -76,13 +76,13 @@ TEST(SinfTest, InFloatRange) { float x = as_float(v); if (isnan(x) || isinf(x)) continue; - EXPECT_TRUE(mpfr::equalsSin(x, __llvm_libc::sinf(x), tolerance)); + ASSERT_MPFR_MATCH(mpfr::OP_Sin, x, 
__llvm_libc::sinf(x), tolerance); } } TEST(SinfTest, SpecificBitPatterns) { float x = as_float(0xc70d39a1); - EXPECT_TRUE(mpfr::equalsSin(x, __llvm_libc::sinf(x), tolerance)); + EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, __llvm_libc::sinf(x), tolerance); } // For small values, sin(x) is x. @@ -90,13 +90,13 @@ TEST(SinfTest, SmallValues) { uint32_t bits = 0x17800000; float x = as_float(bits); float result = __llvm_libc::sinf(x); - EXPECT_TRUE(mpfr::equalsSin(x, result, tolerance)); + EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, result, tolerance); EXPECT_EQ(bits, as_uint32_bits(result)); bits = 0x00400000; x = as_float(bits); result = __llvm_libc::sinf(x); - EXPECT_TRUE(mpfr::equalsSin(x, result, tolerance)); + EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, result, tolerance); EXPECT_EQ(bits, as_uint32_bits(result)); } @@ -105,6 +105,6 @@ TEST(SinfTest, SmallValues) { TEST(SinfTest, SDCOMP_26094) { for (uint32_t v : sdcomp26094Values) { float x = as_float(v); - EXPECT_TRUE(mpfr::equalsSin(x, __llvm_libc::sinf(x), tolerance)); + EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, __llvm_libc::sinf(x), tolerance); } } diff --git a/libc/utils/CPP/TypeTraits.h b/libc/utils/CPP/TypeTraits.h index 81e8e68f09d69c..dfc16b00ab745a 100644 --- a/libc/utils/CPP/TypeTraits.h +++ b/libc/utils/CPP/TypeTraits.h @@ -46,6 +46,22 @@ template struct IsPointerType : public TrueValue {}; template struct IsSame : public FalseValue {}; template struct IsSame : public TrueValue {}; +template struct TypeIdentity { typedef T Type; }; + +template struct RemoveCV : public TypeIdentity {}; +template struct RemoveCV : public TypeIdentity {}; +template struct RemoveCV : public TypeIdentity {}; +template +struct RemoveCV : public TypeIdentity {}; + +template using RemoveCVType = typename RemoveCV::Type; + +template struct IsFloatingPointType { + static constexpr bool Value = IsSame>::Value || + IsSame>::Value || + IsSame>::Value; +}; + } // namespace cpp } // namespace __llvm_libc diff --git a/libc/utils/MPFRWrapper/CMakeLists.txt 
b/libc/utils/MPFRWrapper/CMakeLists.txt index 8de737485681eb..218d5af9fc2818 100644 --- a/libc/utils/MPFRWrapper/CMakeLists.txt +++ b/libc/utils/MPFRWrapper/CMakeLists.txt @@ -12,7 +12,8 @@ if(LIBC_TESTS_CAN_USE_MPFR) MPFRUtils.cpp MPFRUtils.h ) - target_link_libraries(libcMPFRWrapper -lmpfr -lgmp) + add_dependencies(libcMPFRWrapper libc.utils.CPP.standalone_cpp LibcUnitTest LLVMSupport) + target_link_libraries(libcMPFRWrapper -lmpfr -lgmp LibcUnitTest LLVMSupport) else() message(WARNING "Math tests using MPFR will be skipped.") endif() diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index 7bd849934fc779..75ee2adaff5aef 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -8,8 +8,10 @@ #include "MPFRUtils.h" -#include +#include "llvm/ADT/StringRef.h" + #include +#include namespace __llvm_libc { namespace testing { @@ -25,11 +27,38 @@ class MPFRNumber { public: MPFRNumber() { mpfr_init2(value, mpfrPrecision); } - explicit MPFRNumber(float x) { + // We use explicit EnableIf specializations to disallow implicit + // conversions. Implicit conversions can potentially lead to loss of + // precision. 
+ template ::Value, int> = 0> + explicit MPFRNumber(XType x) { mpfr_init2(value, mpfrPrecision); mpfr_set_flt(value, x, MPFR_RNDN); } + template ::Value, int> = 0> + explicit MPFRNumber(XType x) { + mpfr_init2(value, mpfrPrecision); + mpfr_set_d(value, x, MPFR_RNDN); + } + + template ::Value, int> = 0> + MPFRNumber(Operation op, XType rawValue) { + mpfr_init2(value, mpfrPrecision); + MPFRNumber mpfrInput(rawValue); + switch (op) { + case OP_Cos: + mpfr_cos(value, mpfrInput.value, MPFR_RNDN); + break; + case OP_Sin: + mpfr_sin(value, mpfrInput.value, MPFR_RNDN); + break; + } + } + MPFRNumber(const MPFRNumber &other) { mpfr_set(value, other.value, MPFR_RNDN); } @@ -59,38 +88,51 @@ class MPFRNumber { return mpfr_lessequal_p(difference.value, tolerance.value); } + std::string str() const { + // 200 bytes should be more than sufficient to hold a 100-digit number + // plus additional bytes for the decimal point, '-' sign etc. + constexpr size_t printBufSize = 200; + char buffer[printBufSize]; + mpfr_snprintf(buffer, printBufSize, "%100.50Rf", value); + llvm::StringRef ref(buffer); + ref = ref.trim(); + return ref.str(); + } + // These functions are useful for debugging. 
float asFloat() const { return mpfr_get_flt(value, MPFR_RNDN); } double asDouble() const { return mpfr_get_d(value, MPFR_RNDN); } void dump(const char *msg) const { mpfr_printf("%s%.128Rf\n", msg, value); } +}; -public: - static MPFRNumber cos(float x) { - MPFRNumber result; - MPFRNumber mpfrX(x); - mpfr_cos(result.value, mpfrX.value, MPFR_RNDN); - return result; - } +namespace internal { + +template +void MPFRMatcher::explainError(testutils::StreamWrapper &OS) { + MPFRNumber mpfrResult(operation, input); + MPFRNumber mpfrInput(input); + MPFRNumber mpfrMatchValue(matchValue); + OS << "Match value not within tolerance value of MPFR result:\n" + << "Operation input: " << mpfrInput.str() << '\n' + << " Match value: " << mpfrMatchValue.str() << '\n' + << " MPFR result: " << mpfrResult.str() << '\n'; +} - static MPFRNumber sin(float x) { - MPFRNumber result; - MPFRNumber mpfrX(x); - mpfr_sin(result.value, mpfrX.value, MPFR_RNDN); - return result; - } +template void MPFRMatcher::explainError(testutils::StreamWrapper &); +template void MPFRMatcher::explainError(testutils::StreamWrapper &); + +template +bool compare(Operation op, T input, T libcResult, const Tolerance &t) { + MPFRNumber mpfrResult(op, input); + MPFRNumber mpfrInput(input); + MPFRNumber mpfrLibcResult(libcResult); + return mpfrResult.isEqual(mpfrLibcResult, t); }; -bool equalsCos(float input, float libcOutput, const Tolerance &t) { - MPFRNumber mpfrResult = MPFRNumber::cos(input); - MPFRNumber libcResult(libcOutput); - return mpfrResult.isEqual(libcResult, t); -} +template bool compare(Operation, float, float, const Tolerance &); +template bool compare(Operation, double, double, const Tolerance &); -bool equalsSin(float input, float libcOutput, const Tolerance &t) { - MPFRNumber mpfrResult = MPFRNumber::sin(input); - MPFRNumber libcResult(libcOutput); - return mpfrResult.isEqual(libcResult, t); -} +} // namespace internal } // namespace mpfr } // namespace testing diff --git 
a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h index 9f56ccc61fe636..31afd39b289573 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.h +++ b/libc/utils/MPFRWrapper/MPFRUtils.h @@ -9,6 +9,9 @@ #ifndef LLVM_LIBC_UTILS_TESTUTILS_MPFRUTILS_H #define LLVM_LIBC_UTILS_TESTUTILS_MPFRUTILS_H +#include "utils/CPP/TypeTraits.h" +#include "utils/UnitTest/Test.h" + #include namespace __llvm_libc { @@ -36,16 +39,57 @@ struct Tolerance { uint32_t bits; }; -// Return true if |libcOutput| is within the tolerance |t| of the cos(x) -// value as evaluated by MPFR. -bool equalsCos(float x, float libcOutput, const Tolerance &t); +enum Operation { + OP_Cos, + OP_Sin, +}; + +namespace internal { + +template +bool compare(Operation op, T input, T libcOutput, const Tolerance &t); + +template class MPFRMatcher : public testing::Matcher { + static_assert(__llvm_libc::cpp::IsFloatingPointType::Value, + "MPFRMatcher can only be used with floating point values."); + + Operation operation; + T input; + Tolerance tolerance; + T matchValue; + +public: + MPFRMatcher(Operation op, T testInput, Tolerance &t) + : operation(op), input(testInput), tolerance(t) {} -// Return true if |libcOutput| is within the tolerance |t| of the sin(x) -// value as evaluated by MPFR. 
-bool equalsSin(float x, float libcOutput, const Tolerance &t); + bool match(T libcResult) { + matchValue = libcResult; + return internal::compare(operation, input, libcResult, tolerance); + } + + void explainError(testutils::StreamWrapper &OS) override; +}; + +} // namespace internal + +template +internal::MPFRMatcher getMPFRMatcher(Operation op, T input, Tolerance t) { + static_assert( + __llvm_libc::cpp::IsFloatingPointType::Value, + "getMPFRMatcher can only be used to match floating point results."); + return internal::MPFRMatcher(op, input, t); +} } // namespace mpfr } // namespace testing } // namespace __llvm_libc +#define EXPECT_MPFR_MATCH(op, input, matchValue, tolerance) \ + EXPECT_THAT(matchValue, __llvm_libc::testing::mpfr::getMPFRMatcher( \ + op, input, tolerance)) + +#define ASSERT_MPFR_MATCH(op, input, matchValue, tolerance) \ + ASSERT_THAT(matchValue, __llvm_libc::testing::mpfr::getMPFRMatcher( \ + op, input, tolerance)) + #endif // LLVM_LIBC_UTILS_TESTUTILS_MPFRUTILS_H diff --git a/libc/utils/testutils/StreamWrapper.cpp b/libc/utils/testutils/StreamWrapper.cpp index b8a693d767ce79..f6318a99340187 100644 --- a/libc/utils/testutils/StreamWrapper.cpp +++ b/libc/utils/testutils/StreamWrapper.cpp @@ -10,6 +10,7 @@ #include "llvm/Support/raw_ostream.h" #include #include +#include namespace __llvm_libc { namespace testutils { @@ -41,6 +42,7 @@ template StreamWrapper & template StreamWrapper & StreamWrapper::operator<<(unsigned long long t); template StreamWrapper &StreamWrapper::operator<<(bool t); +template StreamWrapper &StreamWrapper::operator<<(std::string t); } // namespace testutils } // namespace __llvm_libc diff --git a/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cerr.pass.cpp b/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cerr.sh.cpp similarity index 85% rename from libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cerr.pass.cpp rename to 
libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cerr.sh.cpp index 5b01f33bf4b0e0..650537dcb20ce8 100644 --- a/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cerr.pass.cpp +++ b/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cerr.sh.cpp @@ -10,6 +10,11 @@ // istream cerr; +// FILE_DEPENDENCIES: %t.exe +// RUN: %{build} +// RUN: %{exec} %t.exe 2> %t.err +// RUN: grep -e 'Hello World!' %t.err + #include #include @@ -17,16 +22,15 @@ int main(int, char**) { -#if 0 + std::cerr << "Hello World!\n"; -#else + #ifdef _LIBCPP_HAS_NO_STDOUT assert(std::cerr.tie() == NULL); #else assert(std::cerr.tie() == &std::cout); #endif assert(std::cerr.flags() & std::ios_base::unitbuf); -#endif // 0 return 0; } diff --git a/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cin.pass.cpp b/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cin.sh.cpp similarity index 78% rename from libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cin.pass.cpp rename to libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cin.sh.cpp index 0b3672a4585cdd..386dbbd4721d3c 100644 --- a/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cin.pass.cpp +++ b/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cin.sh.cpp @@ -12,6 +12,11 @@ // istream cin; +// FILE_DEPENDENCIES: %t.exe +// RUN: %{build} +// RUN: %{exec} echo "123" | %t.exe > %t.out +// RUN: grep -e 'The number is 123!' 
%t.out + #include #include @@ -19,18 +24,14 @@ int main(int, char**) { -#if 0 - std::cout << "Hello World!\n"; int i; - std::cout << "Enter a number: "; std::cin >> i; - std::cout << "The number is : " << i << '\n'; -#else // 0 + std::cout << "The number is " << i << "!"; + #ifdef _LIBCPP_HAS_NO_STDOUT assert(std::cin.tie() == NULL); #else assert(std::cin.tie() == &std::cout); -#endif #endif return 0; diff --git a/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/clog.pass.cpp b/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/clog.sh.cpp similarity index 79% rename from libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/clog.pass.cpp rename to libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/clog.sh.cpp index 68e37294750167..32e23bf61c34fe 100644 --- a/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/clog.pass.cpp +++ b/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/clog.sh.cpp @@ -10,17 +10,18 @@ // istream clog; +// FILE_DEPENDENCIES: %t.exe +// RUN: %{build} +// RUN: %{exec} %t.exe 2> %t.err +// RUN: grep -e 'Hello World!' 
%t.err + #include #include "test_macros.h" int main(int, char**) { -#if 0 std::clog << "Hello World!\n"; -#else - (void)std::clog; -#endif - return 0; + return 0; } diff --git a/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cout.pass.cpp b/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cout.sh.cpp similarity index 75% rename from libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cout.pass.cpp rename to libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cout.sh.cpp index f1d53b773ac128..f4a066b5c50d8c 100644 --- a/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cout.pass.cpp +++ b/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cout.sh.cpp @@ -12,21 +12,18 @@ // istream cout; +// FILE_DEPENDENCIES: %t.exe +// RUN: %{build} +// RUN: %{exec} %t.exe > %t.out +// RUN: grep -e 'Hello World!' %t.out + #include #include "test_macros.h" int main(int, char**) { -#if 0 std::cout << "Hello World!\n"; - int i; - std::cout << "Enter a number: "; - std::cin >> i; - std::cout << "The number is : " << i << '\n'; -#else // 0 - (void)std::cout; -#endif - - return 0; + + return 0; } diff --git a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr.pass.cpp b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr.sh.cpp similarity index 84% rename from libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr.pass.cpp rename to libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr.sh.cpp index 1683c49fbf6d87..30974df3951f61 100644 --- a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr.pass.cpp +++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr.sh.cpp @@ -10,6 +10,11 @@ // istream wcerr; +// FILE_DEPENDENCIES: %t.exe +// RUN: %{build} +// RUN: %{exec} %t.exe 2> %t.err +// RUN: grep -e 'Hello World!' 
%t.err + #include #include @@ -17,16 +22,14 @@ int main(int, char**) { -#if 0 std::wcerr << L"Hello World!\n"; -#else + #ifdef _LIBCPP_HAS_NO_STDOUT assert(std::wcerr.tie() == NULL); #else assert(std::wcerr.tie() == &std::wcout); #endif assert(std::wcerr.flags() & std::ios_base::unitbuf); -#endif // 0 - return 0; + return 0; } diff --git a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin.pass.cpp b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin.sh.cpp similarity index 77% rename from libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin.pass.cpp rename to libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin.sh.cpp index c653b2f60678b6..9d24a37233a8c6 100644 --- a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin.pass.cpp +++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin.sh.cpp @@ -12,6 +12,11 @@ // istream wcin; +// FILE_DEPENDENCIES: %t.exe +// RUN: %{build} +// RUN: %{exec} echo "123" | %t.exe > %t.out +// RUN: grep -e 'The number is 123!' 
%t.out + #include #include @@ -19,19 +24,15 @@ int main(int, char**) { -#if 0 - std::wcout << L"Hello World!\n"; int i; - std::wcout << L"Enter a number: "; std::wcin >> i; - std::wcout << L"The number is : " << i << L'\n'; -#else // 0 + std::wcout << L"The number is " << i << L"!"; + #ifdef _LIBCPP_HAS_NO_STDOUT assert(std::wcin.tie() == NULL); #else assert(std::wcin.tie() == &std::wcout); -#endif #endif - return 0; + return 0; } diff --git a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wclog.pass.cpp b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wclog.sh.cpp similarity index 79% rename from libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wclog.pass.cpp rename to libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wclog.sh.cpp index f396500890d887..d1b126067155cf 100644 --- a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wclog.pass.cpp +++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wclog.sh.cpp @@ -10,17 +10,18 @@ // istream wclog; +// FILE_DEPENDENCIES: %t.exe +// RUN: %{build} +// RUN: %{exec} %t.exe 2> %t.err +// RUN: grep -e 'Hello World!' 
%t.err + #include #include "test_macros.h" int main(int, char**) { -#if 0 std::wclog << L"Hello World!\n"; -#else - (void)std::wclog; -#endif - return 0; + return 0; } diff --git a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout.pass.cpp b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout.sh.cpp similarity index 80% rename from libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout.pass.cpp rename to libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout.sh.cpp index b6bd1ef4ea18e8..0a14f898baa213 100644 --- a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout.pass.cpp +++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout.sh.cpp @@ -12,17 +12,18 @@ // istream wcout; +// FILE_DEPENDENCIES: %t.exe +// RUN: %{build} +// RUN: %{exec} %t.exe > %t.out +// RUN: grep -e 'Hello World!' %t.out + #include #include "test_macros.h" int main(int, char**) { -#if 0 std::wcout << L"Hello World!\n"; -#else - (void)std::wcout; -#endif - return 0; + return 0; } diff --git a/libcxxabi/src/cxa_vector.cpp b/libcxxabi/src/cxa_vector.cpp index f20e978d36ef31..325bbf22d20117 100644 --- a/libcxxabi/src/cxa_vector.cpp +++ b/libcxxabi/src/cxa_vector.cpp @@ -24,9 +24,9 @@ namespace __cxxabiv1 { -#if 0 -#pragma mark --Helper routines and classes -- -#endif +// +// Helper routines and classes +// namespace { inline static size_t __get_element_count ( void *p ) { @@ -111,9 +111,9 @@ namespace { }; } -#if 0 -#pragma mark --Externally visible routines-- -#endif +// +// Externally visible routines +// namespace { _LIBCXXABI_NORETURN diff --git a/lld/ELF/Arch/Hexagon.cpp b/lld/ELF/Arch/Hexagon.cpp index 27b20baceeedee..60cc581f94fe17 100644 --- a/lld/ELF/Arch/Hexagon.cpp +++ b/lld/ELF/Arch/Hexagon.cpp @@ -120,6 +120,8 @@ RelExpr Hexagon::getRelExpr(RelType type, const Symbol &s, case R_HEX_B22_PCREL_X: case R_HEX_B32_PCREL_X: case R_HEX_GD_PLT_B22_PCREL: + case 
R_HEX_GD_PLT_B22_PCREL_X: + case R_HEX_GD_PLT_B32_PCREL_X: return R_PLT_PC; case R_HEX_IE_32_6_X: case R_HEX_IE_16_X: @@ -311,16 +313,18 @@ void Hexagon::relocate(uint8_t *loc, const Relocation &rel, case R_HEX_B15_PCREL_X: or32le(loc, applyMask(0x00df20fe, val & 0x3f)); break; - case R_HEX_GD_PLT_B22_PCREL: case R_HEX_B22_PCREL: + case R_HEX_GD_PLT_B22_PCREL: case R_HEX_PLT_B22_PCREL: checkInt(loc, val, 22, rel); or32le(loc, applyMask(0x1ff3ffe, val >> 2)); break; case R_HEX_B22_PCREL_X: + case R_HEX_GD_PLT_B22_PCREL_X: or32le(loc, applyMask(0x1ff3ffe, val & 0x3f)); break; case R_HEX_B32_PCREL_X: + case R_HEX_GD_PLT_B32_PCREL_X: or32le(loc, applyMask(0x0fff3fff, val >> 6)); break; case R_HEX_GOTREL_HI16: diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index ff068019158726..eb30166fcc4ca1 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -1344,8 +1344,11 @@ static void scanReloc(InputSectionBase &sec, OffsetGetter &getOffset, RelTy *&i, addend &= ~0x8000; // R_HEX_GD_PLT_B22_PCREL (call a@GDPLT) is transformed into // call __tls_get_addr even if the symbol is non-preemptible. 
- if (!(config->emachine == EM_HEXAGON && type == R_HEX_GD_PLT_B22_PCREL)) - expr = fromPlt(expr); + if (!(config->emachine == EM_HEXAGON && + (type == R_HEX_GD_PLT_B22_PCREL || + type == R_HEX_GD_PLT_B22_PCREL_X || + type == R_HEX_GD_PLT_B32_PCREL_X))) + expr = fromPlt(expr); } } diff --git a/lld/test/ELF/hexagon-tls-gd-nonpreemptible.s b/lld/test/ELF/hexagon-tls-gd-nonpreemptible.s index ba0eee999c806a..ff5e6dbaac710b 100644 --- a/lld/test/ELF/hexagon-tls-gd-nonpreemptible.s +++ b/lld/test/ELF/hexagon-tls-gd-nonpreemptible.s @@ -3,6 +3,7 @@ # RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf %s -o %t.o # RUN: ld.lld -shared %t.o -o %t.so # RUN: llvm-readobj -r %t.so | FileCheck --check-prefix=RELOC %s +# RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=REL %s # RUN: llvm-objdump -d --no-show-raw-insn --print-imm-hex %t.so | FileCheck %s ## Prior to D77021 lld would error "relocation R_HEX_GD_PLT_B22_PCREL cannot refer to absolute symbol". @@ -17,17 +18,28 @@ # RELOC-NEXT: R_HEX_JMP_SLOT __tls_get_addr 0x0 # RELOC-NEXT: } +# REL: R_HEX_B32_PCREL_X _GLOBAL_OFFSET_TABLE_ 0x0 +# REL-NEXT: R_HEX_6_PCREL_X _GLOBAL_OFFSET_TABLE_ 0x4 +# REL-NEXT: R_HEX_GD_GOT_32_6_X a 0x0 +# REL-NEXT: R_HEX_GD_GOT_16_X a 0x0 +# REL-NEXT: R_HEX_GD_PLT_B22_PCREL a 0x0 +# REL-NEXT: R_HEX_GD_PLT_B32_PCREL_X a 0x0 +# REL-NEXT: R_HEX_GD_PLT_B22_PCREL_X a 0x4 + # CHECK: { immext(#{{.*}}) # CHECK-NEXT: r2 = add(pc,##{{.*}}) } # CHECK-NEXT: { immext(#{{.*}}) # CHECK-NEXT: r0 = add(r2,##-{{.*}}) } # CHECK-NEXT: { call {{.*}} } +# CHECK-NEXT: { immext({{.*}}) +# CHECK-NEXT: call {{.*}} } # CHECK-NEXT: { r0 = memw(r0+#0x0) } _start: r2 = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL) r0 = add(r2,##a@GDGOT) call a@GDPLT + call ##a@GDPLT r0 = memw(r0+#0) ## a is non-preemptible due to STV_HIDDEN visibility. 
diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index 3f727e83f12beb..2cc3d47406b7f6 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -2816,8 +2816,10 @@ void CommandInterpreter::IOHandlerInputComplete(IOHandler &io_handler, case eReturnStatusFailed: m_result.IncrementNumberOfErrors(); - if (io_handler.GetFlags().Test(eHandleCommandFlagStopOnError)) + if (io_handler.GetFlags().Test(eHandleCommandFlagStopOnError)) { + m_result.SetResult(lldb::eCommandInterpreterResultCommandError); io_handler.SetIsDone(true); + } break; case eReturnStatusQuit: diff --git a/lldb/test/Shell/Commands/command-source.test b/lldb/test/Shell/Commands/command-source.test index d8218850c32c13..fa389f2a12889b 100644 --- a/lldb/test/Shell/Commands/command-source.test +++ b/lldb/test/Shell/Commands/command-source.test @@ -1,8 +1,8 @@ # Check that stop command source on error. -# RUN: %lldb -x -b -o "command source -e 1 %s" 2>&1 | FileCheck %s --check-prefix STOP +# RUN: not %lldb -x -b -o "command source -e 1 %s" 2>&1 | FileCheck %s --check-prefix STOP # RUN: %lldb -x -b -o "command source -e 0 %s" 2>&1 | FileCheck %s --check-prefix CONTINUE -# RUN: %lldb -x -b -o 'settings set interpreter.stop-command-source-on-error true' -o "command source %s" 2>&1 | FileCheck %s --check-prefix STOP +# RUN: not %lldb -x -b -o 'settings set interpreter.stop-command-source-on-error true' -o "command source %s" 2>&1 | FileCheck %s --check-prefix STOP # RUN: %lldb -x -b -o 'settings set interpreter.stop-command-source-on-error false' -o "command source %s" 2>&1 | FileCheck %s --check-prefix CONTINUE bogus diff --git a/lldb/test/Shell/Driver/TestProcessAttach.test b/lldb/test/Shell/Driver/TestProcessAttach.test index 4e24ebb161b6e7..ab75814e21ce09 100644 --- a/lldb/test/Shell/Driver/TestProcessAttach.test +++ b/lldb/test/Shell/Driver/TestProcessAttach.test @@ -1,2 +1,2 @@ -# RUN: %lldb 
-x -b -S %S/Inputs/process_attach_pid.in 2>&1 | FileCheck %s +# RUN: not %lldb -x -b -S %S/Inputs/process_attach_pid.in 2>&1 | FileCheck %s # CHECK: last option requires an argument diff --git a/lldb/test/Shell/Host/TestCustomShell.test b/lldb/test/Shell/Host/TestCustomShell.test index fd97b4c2b06e2e..75114c55449341 100644 --- a/lldb/test/Shell/Host/TestCustomShell.test +++ b/lldb/test/Shell/Host/TestCustomShell.test @@ -8,7 +8,7 @@ # XFAIL: system-openbsd # RUN: %clang_host %S/Inputs/simple.c -g -o %t.out -# RUN: SHELL=bogus %lldb %t.out -b -o 'run' 2>&1 | FileCheck %s --check-prefix ERROR +# RUN: SHELL=bogus not %lldb %t.out -b -o 'run' 2>&1 | FileCheck %s --check-prefix ERROR # RUN: env -i %lldb %t.out -b -o 'run' 2>&1 | FileCheck %s # ERROR: error: shell expansion failed diff --git a/lldb/test/Shell/Quit/TestQuitExitCodeNonInt.test b/lldb/test/Shell/Quit/TestQuitExitCodeNonInt.test index 87c0bd41bb05fb..1747ddd669b609 100644 --- a/lldb/test/Shell/Quit/TestQuitExitCodeNonInt.test +++ b/lldb/test/Shell/Quit/TestQuitExitCodeNonInt.test @@ -1,4 +1,4 @@ # UNSUPPORTED: system-windows -# RUN: %lldb -b -s %s 2>&1 | FileCheck %s +# RUN: not %lldb -b -s %s 2>&1 | FileCheck %s q str // CHECK: Couldn't parse 'str' diff --git a/lldb/test/Shell/Quit/TestQuitExitCodeTooManyArgs.test b/lldb/test/Shell/Quit/TestQuitExitCodeTooManyArgs.test index a67669451e9928..315adf02af4d10 100644 --- a/lldb/test/Shell/Quit/TestQuitExitCodeTooManyArgs.test +++ b/lldb/test/Shell/Quit/TestQuitExitCodeTooManyArgs.test @@ -1,4 +1,4 @@ # UNSUPPORTED: system-windows -# RUN: %lldb -b -s %s 2>&1 | FileCheck %s +# RUN: not %lldb -b -s %s 2>&1 | FileCheck %s q 1 2 // CHECK: Too many arguments for 'quit' diff --git a/lldb/test/Shell/Reproducer/TestDiscard.test b/lldb/test/Shell/Reproducer/TestDiscard.test index 829aabbe2b03bf..aee56f77c06f70 100644 --- a/lldb/test/Shell/Reproducer/TestDiscard.test +++ b/lldb/test/Shell/Reproducer/TestDiscard.test @@ -6,7 +6,7 @@ # RUN: %clang_host %S/Inputs/simple.c -g 
-o %t/reproducer.out # Capture but don't generate the reproducer. -# RUN: %lldb -x -b -s %S/Inputs/Discard.in --capture --capture-path %t.repro %t/reproducer.out +# RUN: not %lldb -x -b -s %S/Inputs/Discard.in --capture --capture-path %t.repro %t/reproducer.out # Make sure the directory doesn't exist. # RUN: mkdir %t.repro diff --git a/lldb/test/Shell/Reproducer/TestDump.test b/lldb/test/Shell/Reproducer/TestDump.test index 8300a97004bbfc..cf2c89c938b7d0 100644 --- a/lldb/test/Shell/Reproducer/TestDump.test +++ b/lldb/test/Shell/Reproducer/TestDump.test @@ -25,9 +25,9 @@ # RUN: %lldb --replay %t.repro | FileCheck %s --check-prefix FILES # RUN: rm %t.repro/gdb-remote.yaml -# RUN: %lldb -b -o 'reproducer dump -p gdb -f %t.repro' 2>&1 | FileCheck %s --check-prefix GDB-ERROR +# RUN: not %lldb -b -o 'reproducer dump -p gdb -f %t.repro' 2>&1 | FileCheck %s --check-prefix GDB-ERROR # GDB-ERROR: error: Unable to create GDB loader. # RUN: rm %t.repro/command-interpreter.yaml -# RUN: %lldb -b -o 'reproducer dump -p commands -f %t.repro' 2>&1 | FileCheck %s --check-prefix COMMANDS-ERROR +# RUN: not %lldb -b -o 'reproducer dump -p commands -f %t.repro' 2>&1 | FileCheck %s --check-prefix COMMANDS-ERROR # COMMANDS-ERROR: error: Unable to create command loader. diff --git a/lldb/test/Shell/Settings/TestSettingsSet.test b/lldb/test/Shell/Settings/TestSettingsSet.test index 0def3faaadbb28..3006a694a16b2d 100644 --- a/lldb/test/Shell/Settings/TestSettingsSet.test +++ b/lldb/test/Shell/Settings/TestSettingsSet.test @@ -1,7 +1,7 @@ # This tests setting setting values. # Check that setting an empty value with -f(orce) clears the value. 
-# RUN: %lldb -b -s %s 2>&1 | FileCheck %s +# RUN: not %lldb -b -s %s 2>&1 | FileCheck %s settings set tab-size 16 settings show tab-size diff --git a/lldb/test/Shell/Settings/TestStopCommandSourceOnError.test b/lldb/test/Shell/Settings/TestStopCommandSourceOnError.test index a53dc2cd6868dd..d734a0940a2d72 100644 --- a/lldb/test/Shell/Settings/TestStopCommandSourceOnError.test +++ b/lldb/test/Shell/Settings/TestStopCommandSourceOnError.test @@ -12,13 +12,13 @@ # RUN: %lldb -b -o 'settings set interpreter.stop-command-source-on-error false' -s %S/Inputs/StopCommandSource.in | FileCheck %s --check-prefix CONTINUE # FIXME: Should continue -# RUN: %lldb -b -s %S/Inputs/DontStopCommandSource.in -o 'bogus' -o 'print 111100000 + 11111' | FileCheck %s --check-prefix STOP +# RUN: not %lldb -b -s %S/Inputs/DontStopCommandSource.in -o 'bogus' -o 'print 111100000 + 11111' | FileCheck %s --check-prefix STOP # FIXME: Should continue -# RUN: %lldb -b -o 'settings set interpreter.stop-command-source-on-error false' -o 'bogus' -o 'print 123400000 + 56789' | FileCheck %s --check-prefix STOP +# RUN: not %lldb -b -o 'settings set interpreter.stop-command-source-on-error false' -o 'bogus' -o 'print 123400000 + 56789' | FileCheck %s --check-prefix STOP # FIXME: Should continue -# RUN: %lldb -b -s %S/Inputs/DontStopCommandSource.in | FileCheck %s --check-prefix STOP +# RUN: not %lldb -b -s %S/Inputs/DontStopCommandSource.in | FileCheck %s --check-prefix STOP # FIXME: Should continue -# RUN: %lldb -b -o 'settings set interpreter.stop-command-source-on-error true' -s %S/Inputs/DontStopCommandSource.in | FileCheck %s --check-prefix STOP +# RUN: not %lldb -b -o 'settings set interpreter.stop-command-source-on-error true' -s %S/Inputs/DontStopCommandSource.in | FileCheck %s --check-prefix STOP diff --git a/lldb/test/Shell/SymbolFile/DWARF/debug-types-missing-signature.test b/lldb/test/Shell/SymbolFile/DWARF/debug-types-missing-signature.test index f9c02061fc8637..8f2ef7135afc4e 100644 --- 
a/lldb/test/Shell/SymbolFile/DWARF/debug-types-missing-signature.test +++ b/lldb/test/Shell/SymbolFile/DWARF/debug-types-missing-signature.test @@ -14,10 +14,10 @@ LOOKUPE: no type was found matching 'E' RUN: %lldb %t -b -o "type lookup EC" | FileCheck --check-prefix=LOOKUPEC %s LOOKUPEC: no type was found matching 'EC' -RUN: %lldb %t -b -o "print (E) 1" 2>&1 | FileCheck --check-prefix=PRINTE %s +RUN: not %lldb %t -b -o "print (E) 1" 2>&1 | FileCheck --check-prefix=PRINTE %s PRINTE: use of undeclared identifier 'E' -RUN: %lldb %t -b -o "print (EC) 1" 2>&1 | FileCheck --check-prefix=PRINTEC %s +RUN: not %lldb %t -b -o "print (EC) 1" 2>&1 | FileCheck --check-prefix=PRINTEC %s PRINTEC: use of undeclared identifier 'EC' RUN: %lldb %t -b -o "target variable a e ec" | FileCheck --check-prefix=VARS %s diff --git a/lldb/test/Shell/Unwind/thread-step-out-ret-addr-check.test b/lldb/test/Shell/Unwind/thread-step-out-ret-addr-check.test index e748b4e5c73c31..682b0e5332b1c5 100644 --- a/lldb/test/Shell/Unwind/thread-step-out-ret-addr-check.test +++ b/lldb/test/Shell/Unwind/thread-step-out-ret-addr-check.test @@ -5,7 +5,7 @@ # UNSUPPORTED: system-windows # RUN: %clang_host %p/Inputs/call-asm.c -x assembler-with-cpp %p/Inputs/thread-step-out-ret-addr-check.s -o %t -# RUN: %lldb %t -s %s -b 2>&1 | FileCheck %s +# RUN: not %lldb %t -s %s -b 2>&1 | FileCheck %s breakpoint set -n nonstandard_stub # CHECK: Breakpoint 1: where = {{.*}}`nonstandard_stub diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp index ff7ed2ca0544c5..b38423b285590e 100644 --- a/lldb/tools/driver/Driver.cpp +++ b/lldb/tools/driver/Driver.cpp @@ -619,6 +619,12 @@ int Driver::MainLoop() { results.GetResult() != lldb::eCommandInterpreterResultInferiorCrash) go_interactive = false; + // When running in batch mode and stopped because of an error, exit with a + // non-zero exit status. 
+ if (m_option_data.m_batch && + results.GetResult() == lldb::eCommandInterpreterResultCommandError) + exit(1); + if (m_option_data.m_batch && results.GetResult() == lldb::eCommandInterpreterResultInferiorCrash && !m_option_data.m_after_crash_commands.empty()) { @@ -636,6 +642,13 @@ int Driver::MainLoop() { if (local_results.GetResult() == lldb::eCommandInterpreterResultQuitRequested) go_interactive = false; + + // When running in batch mode and an error occurred while sourcing + // the crash commands, exit with a non-zero exit status. + if (m_option_data.m_batch && + local_results.GetResult() == + lldb::eCommandInterpreterResultCommandError) + exit(1); } } m_debugger.SetAsync(old_async); diff --git a/lldb/unittests/DataFormatter/StringPrinterTests.cpp b/lldb/unittests/DataFormatter/StringPrinterTests.cpp index 4b01f5c1dbe2c1..180b13772af53f 100644 --- a/lldb/unittests/DataFormatter/StringPrinterTests.cpp +++ b/lldb/unittests/DataFormatter/StringPrinterTests.cpp @@ -74,8 +74,8 @@ TEST(StringPrinterTests, CxxASCII) { EXPECT_EQ(fmt("🥑"), QUOTE("🥑")); // Octal (\nnn), hex (\xnn), extended octal (\unnnn or \Unnnnnnnn). - EXPECT_EQ(fmt("\uD55C"), QUOTE("한")); - EXPECT_EQ(fmt("\U00010348"), QUOTE("𐍈")); + EXPECT_EQ(fmt("\uD55C"), QUOTE("\uD55C")); + EXPECT_EQ(fmt("\U00010348"), QUOTE("\U00010348")); // FIXME: These strings are all rejected, but shouldn't be AFAICT. LLDB finds // that these are not valid utf8 sequences, but that's OK, the raw values @@ -111,8 +111,8 @@ TEST(StringPrinterTests, CxxUTF8) { EXPECT_EQ(fmt("🥑"), QUOTE("🥑")); // Octal (\nnn), hex (\xnn), extended octal (\unnnn or \Unnnnnnnn). - EXPECT_EQ(fmt("\uD55C"), QUOTE("한")); - EXPECT_EQ(fmt("\U00010348"), QUOTE("𐍈")); + EXPECT_EQ(fmt("\uD55C"), QUOTE("\uD55C")); + EXPECT_EQ(fmt("\U00010348"), QUOTE("\U00010348")); // FIXME: These strings are all rejected, but shouldn't be AFAICT. 
LLDB finds // that these are not valid utf8 sequences, but that's OK, the raw values @@ -148,8 +148,8 @@ TEST(StringPrinterTests, SwiftUTF8) { EXPECT_EQ(fmt("🥑"), QUOTE("🥑")); // Octal (\nnn), hex (\xnn), extended octal (\unnnn or \Unnnnnnnn). - EXPECT_EQ(fmt("\uD55C"), QUOTE("한")); - EXPECT_EQ(fmt("\U00010348"), QUOTE("𐍈")); + EXPECT_EQ(fmt("\uD55C"), QUOTE("\uD55C")); + EXPECT_EQ(fmt("\U00010348"), QUOTE("\U00010348")); // FIXME: These strings are all rejected, but shouldn't be AFAICT. LLDB finds // that these are not valid utf8 sequences, but that's OK, the raw values diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 91ee584fddc4be..dc63a1a27b7c8a 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -6562,11 +6562,27 @@ On exit from a function: * FLAT_SCRATCH * EXEC * GFX6-8: M0 - * All SGPR and VGPR registers except the clobbered registers of SGPR4-31 and - VGPR0-31. + * All SGPR registers except the clobbered registers of SGPR4-31. + * VGPR40-47 + VGPR56-63 + VGPR72-79 + VGPR88-95 + VGPR104-111 + VGPR120-127 + VGPR136-143 + VGPR152-159 + VGPR168-175 + VGPR184-191 + VGPR200-207 + VGPR216-223 + VGPR232-239 + VGPR248-255 + *Except the argument registers, the VGPR cloberred and the preserved + registers are intermixed at regular intervals in order to + get a better occupancy.* For the AMDGPU backend, an inter-procedural register allocation (IPRA) - optimization may mark some of clobbered SGPR4-31 and VGPR0-31 registers as + optimization may mark some of clobbered SGPR and VGPR registers as preserved if it can be determined that the called function does not change their value. 
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 7a819f0aa5ad53..f3e57567b6bd69 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -620,7 +620,7 @@ class TargetTransformInfo { /// Estimate the overhead of scalarizing an instruction. Insert and Extract /// are set if the demanded result elements need to be inserted and/or /// extracted from vectors. - unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts, + unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract) const; /// Estimate the overhead of scalarizing an instructions unique @@ -1261,7 +1261,8 @@ class TargetTransformInfo::Concept { virtual bool shouldBuildLookupTables() = 0; virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0; virtual bool useColdCCForColdCall(Function &F) = 0; - virtual unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts, + virtual unsigned getScalarizationOverhead(VectorType *Ty, + const APInt &DemandedElts, bool Insert, bool Extract) = 0; virtual unsigned getOperandsScalarizationOverhead(ArrayRef Args, @@ -1609,7 +1610,7 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.useColdCCForColdCall(F); } - unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts, + unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract) override { return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 6171ff9fbf0d67..529cdbcb20dd0b 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -240,7 +240,7 @@ class TargetTransformInfoImplBase { bool 
useColdCCForColdCall(Function &F) { return false; } - unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts, + unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract) { return 0; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index e885b1158d07db..140e39d26da718 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -552,32 +552,30 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { /// Estimate the overhead of scalarizing an instruction. Insert and Extract /// are set if the demanded result elements need to be inserted and/or /// extracted from vectors. - unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts, + unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract) { - auto *VTy = cast(Ty); - assert(DemandedElts.getBitWidth() == VTy->getNumElements() && + assert(DemandedElts.getBitWidth() == Ty->getNumElements() && "Vector size mismatch"); unsigned Cost = 0; - for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { + for (int i = 0, e = Ty->getNumElements(); i < e; ++i) { if (!DemandedElts[i]) continue; if (Insert) Cost += static_cast(this)->getVectorInstrCost( - Instruction::InsertElement, VTy, i); + Instruction::InsertElement, Ty, i); if (Extract) Cost += static_cast(this)->getVectorInstrCost( - Instruction::ExtractElement, VTy, i); + Instruction::ExtractElement, Ty, i); } return Cost; } /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead. 
- unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) { - auto *VTy = cast(Ty); - APInt DemandedElts = APInt::getAllOnesValue(VTy->getNumElements()); + unsigned getScalarizationOverhead(VectorType *Ty, bool Insert, bool Extract) { + APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements()); return static_cast(this)->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); } @@ -591,11 +589,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { SmallPtrSet UniqueOperands; for (const Value *A : Args) { if (!isa(A) && UniqueOperands.insert(A).second) { - Type *VecTy = nullptr; - if (A->getType()->isVectorTy()) { - VecTy = A->getType(); + auto *VecTy = dyn_cast(A->getType()); + if (VecTy) { // If A is a vector operand, VF should be 1 or correspond to A. - assert((VF == 1 || VF == cast(VecTy)->getNumElements()) && + assert((VF == 1 || VF == VecTy->getNumElements()) && "Vector argument does not match VF"); } else @@ -608,17 +605,16 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return Cost; } - unsigned getScalarizationOverhead(Type *VecTy, ArrayRef Args) { + unsigned getScalarizationOverhead(VectorType *Ty, ArrayRef Args) { unsigned Cost = 0; - auto *VecVTy = cast(VecTy); - Cost += getScalarizationOverhead(VecVTy, true, false); + Cost += getScalarizationOverhead(Ty, true, false); if (!Args.empty()) - Cost += getOperandsScalarizationOverhead(Args, VecVTy->getNumElements()); + Cost += getOperandsScalarizationOverhead(Args, Ty->getNumElements()); else // When no information on arguments is provided, we add the cost // associated with one argument as a heuristic. 
- Cost += getScalarizationOverhead(VecVTy, false, true); + Cost += getScalarizationOverhead(Ty, false, true); return Cost; } @@ -742,13 +738,16 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { break; } + auto *SrcVTy = dyn_cast(Src); + auto *DstVTy = dyn_cast(Dst); + // If the cast is marked as legal (or promote) then assume low cost. if (SrcLT.first == DstLT.first && TLI->isOperationLegalOrPromote(ISD, DstLT.second)) return SrcLT.first; // Handle scalar conversions. - if (!Src->isVectorTy() && !Dst->isVectorTy()) { + if (!SrcVTy && !DstVTy) { // Scalar bitcasts are usually free. if (Opcode == Instruction::BitCast) return 0; @@ -763,9 +762,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { } // Check vector-to-vector casts. - if (Dst->isVectorTy() && Src->isVectorTy()) { - auto *SrcVTy = cast(Src); - auto *DstVTy = cast(Dst); + if (DstVTy && SrcVTy) { // If the cast is between same-sized registers, then the check is simple. if (SrcLT.first == DstLT.first && SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) { @@ -819,19 +816,18 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. - return getScalarizationOverhead(Dst, true, true) + Num * Cost; + return getScalarizationOverhead(DstVTy, true, true) + Num * Cost; } // We already handled vector-to-vector and scalar-to-scalar conversions. // This // is where we handle bitcast between vectors and scalars. We need to assume // that the conversion is scalarized in one way or another. - if (Opcode == Instruction::BitCast) + if (Opcode == Instruction::BitCast) { // Illegal bitcasts are done by storing and loading from a stack slot. - return (Src->isVectorTy() ? getScalarizationOverhead(Src, false, true) - : 0) + - (Dst->isVectorTy() ? getScalarizationOverhead(Dst, true, false) - : 0); + return (SrcVTy ? 
getScalarizationOverhead(SrcVTy, false, true) : 0) + + (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0); + } llvm_unreachable("Unhandled cast"); } @@ -923,7 +919,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { // This is a vector load/store for some illegal type that is scalarized. // We must account for the cost of building or decomposing the vector. - Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store, + Cost += getScalarizationOverhead(cast(Src), + Opcode != Instruction::Store, Opcode == Instruction::Store); } } @@ -1118,7 +1115,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { if (RetVF > 1 || VF > 1) { ScalarizationCost = 0; if (!RetTy->isVoidTy()) - ScalarizationCost += getScalarizationOverhead(RetTy, true, false); + ScalarizationCost += + getScalarizationOverhead(cast(RetTy), true, false); ScalarizationCost += getOperandsScalarizationOverhead(Args, VF); } @@ -1224,21 +1222,19 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { unsigned ScalarizationCost = ScalarizationCostPassed; unsigned ScalarCalls = 1; Type *ScalarRetTy = RetTy; - if (RetTy->isVectorTy()) { + if (auto *RetVTy = dyn_cast(RetTy)) { if (ScalarizationCostPassed == std::numeric_limits::max()) - ScalarizationCost = getScalarizationOverhead(RetTy, true, false); - ScalarCalls = - std::max(ScalarCalls, cast(RetTy)->getNumElements()); + ScalarizationCost = getScalarizationOverhead(RetVTy, true, false); + ScalarCalls = std::max(ScalarCalls, RetVTy->getNumElements()); ScalarRetTy = RetTy->getScalarType(); } SmallVector ScalarTys; for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { Type *Ty = Tys[i]; - if (Ty->isVectorTy()) { + if (auto *VTy = dyn_cast(Ty)) { if (ScalarizationCostPassed == std::numeric_limits::max()) - ScalarizationCost += getScalarizationOverhead(Ty, false, true); - ScalarCalls = - std::max(ScalarCalls, 
cast(Ty)->getNumElements()); + ScalarizationCost += getScalarizationOverhead(VTy, false, true); + ScalarCalls = std::max(ScalarCalls, VTy->getNumElements()); Ty = Ty->getScalarType(); } ScalarTys.push_back(Ty); @@ -1588,12 +1584,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // Else, assume that we need to scalarize this intrinsic. For math builtins // this will emit a costly libcall, adding call overhead and spills. Make it // very expensive. - if (RetTy->isVectorTy()) { + if (auto *RetVTy = dyn_cast(RetTy)) { unsigned ScalarizationCost = ((ScalarizationCostPassed != std::numeric_limits::max()) ? ScalarizationCostPassed - : getScalarizationOverhead(RetTy, true, false)); - unsigned ScalarCalls = cast(RetTy)->getNumElements(); + : getScalarizationOverhead(RetVTy, true, false)); + unsigned ScalarCalls = RetVTy->getNumElements(); SmallVector ScalarTys; for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { Type *Ty = Tys[i]; @@ -1604,14 +1600,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { unsigned ScalarCost = ConcreteTTI->getIntrinsicInstrCost( IID, RetTy->getScalarType(), ScalarTys, FMF, CostKind); for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { - if (Tys[i]->isVectorTy()) { + if (auto *VTy = dyn_cast(Tys[i])) { if (ScalarizationCostPassed == std::numeric_limits::max()) - ScalarizationCost += getScalarizationOverhead(Tys[i], false, true); - ScalarCalls = - std::max(ScalarCalls, cast(Tys[i])->getNumElements()); + ScalarizationCost += getScalarizationOverhead(VTy, false, true); + ScalarCalls = std::max(ScalarCalls, VTy->getNumElements()); } } - return ScalarCalls * ScalarCost + ScalarizationCost; } diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h index 49276fb1a94d44..8b2c27e7b88820 100644 --- a/llvm/include/llvm/CodeGen/MachinePipeliner.h +++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h @@ -43,6 +43,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include 
"llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -60,6 +61,7 @@ extern cl::opt SwpEnableCopyToPhi; class MachinePipeliner : public MachineFunctionPass { public: MachineFunction *MF = nullptr; + MachineOptimizationRemarkEmitter *ORE = nullptr; const MachineLoopInfo *MLI = nullptr; const MachineDominatorTree *MDT = nullptr; const InstrItineraryData *InstrItins; @@ -96,6 +98,7 @@ class MachinePipeliner : public MachineFunctionPass { AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 743160a26966cb..95b17aa702d089 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -370,8 +370,10 @@ bool TargetTransformInfo::useColdCCForColdCall(Function &F) const { return TTIImpl->useColdCCForColdCall(F); } -unsigned TargetTransformInfo::getScalarizationOverhead( - Type *Ty, const APInt &DemandedElts, bool Insert, bool Extract) const { +unsigned +TargetTransformInfo::getScalarizationOverhead(VectorType *Ty, + const APInt &DemandedElts, + bool Insert, bool Extract) const { return TTIImpl->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); } diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 3465aaada873bf..ef4b02ca9e3ef6 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -217,6 +217,7 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) { MF = &mf; MLI = &getAnalysis(); MDT = &getAnalysis(); + ORE = &getAnalysis().getORE(); TII = MF->getSubtarget().getInstrInfo(); RegClassInfo.runOnMachineFunction(*MF); @@ -248,6 +249,12 @@ bool MachinePipeliner::scheduleLoop(MachineLoop 
&L) { setPragmaPipelineOptions(L); if (!canPipelineLoop(L)) { LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n"); + ORE->emit([&]() { + return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop", + L.getStartLoc(), L.getHeader()) + << "Failed to pipeline loop"; + }); + return Changed; } @@ -309,11 +316,24 @@ void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) { /// restricted to loops with a single basic block. Make sure that the /// branch in the loop can be analyzed. bool MachinePipeliner::canPipelineLoop(MachineLoop &L) { - if (L.getNumBlocks() != 1) + if (L.getNumBlocks() != 1) { + ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop", + L.getStartLoc(), L.getHeader()) + << "Not a single basic block: " + << ore::NV("NumBlocks", L.getNumBlocks()); + }); return false; + } - if (disabledByPragma) + if (disabledByPragma) { + ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop", + L.getStartLoc(), L.getHeader()) + << "Disabled by Pragma."; + }); return false; + } // Check if the branch can't be understood because we can't do pipelining // if that's the case. 
@@ -321,25 +341,37 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) { LI.FBB = nullptr; LI.BrCond.clear(); if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) { - LLVM_DEBUG( - dbgs() << "Unable to analyzeBranch, can NOT pipeline current Loop\n"); + LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n"); NumFailBranch++; + ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop", + L.getStartLoc(), L.getHeader()) + << "The branch can't be understood"; + }); return false; } LI.LoopInductionVar = nullptr; LI.LoopCompare = nullptr; if (!TII->analyzeLoopForPipelining(L.getTopBlock())) { - LLVM_DEBUG( - dbgs() << "Unable to analyzeLoop, can NOT pipeline current Loop\n"); + LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n"); NumFailLoop++; + ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop", + L.getStartLoc(), L.getHeader()) + << "The loop structure is not supported"; + }); return false; } if (!L.getLoopPreheader()) { - LLVM_DEBUG( - dbgs() << "Preheader not found, can NOT pipeline current Loop\n"); + LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n"); NumFailPreheader++; + ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop", + L.getStartLoc(), L.getHeader()) + << "No loop preheader found"; + }); return false; } @@ -457,10 +489,13 @@ void SwingSchedulerDAG::schedule() { // Can't schedule a loop without a valid MII. 
if (MII == 0) { - LLVM_DEBUG( - dbgs() - << "0 is not a valid Minimal Initiation Interval, can NOT schedule\n"); + LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n"); NumFailZeroMII++; + Pass.ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis( + DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader()) + << "Invalid Minimal Initiation Interval: 0"; + }); return; } @@ -469,6 +504,14 @@ void SwingSchedulerDAG::schedule() { LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii << ", we don't pipleline large loops\n"); NumFailLargeMaxMII++; + Pass.ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis( + DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader()) + << "Minimal Initiation Interval too large: " + << ore::NV("MII", (int)MII) << " > " + << ore::NV("SwpMaxMii", SwpMaxMii) << "." + << "Refer to -pipeliner-max-mii."; + }); return; } @@ -511,15 +554,24 @@ void SwingSchedulerDAG::schedule() { if (!Scheduled){ LLVM_DEBUG(dbgs() << "No schedule found, return\n"); NumFailNoSchedule++; + Pass.ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis( + DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader()) + << "Unable to find schedule"; + }); return; } unsigned numStages = Schedule.getMaxStageCount(); // No need to generate pipeline if there are no overlapped iterations. if (numStages == 0) { - LLVM_DEBUG( - dbgs() << "No overlapped iterations, no need to generate pipeline\n"); + LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n"); NumFailZeroStage++; + Pass.ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis( + DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader()) + << "No need to pipeline - no overlapped iterations in schedule."; + }); return; } // Check that the maximum stage count is less than user-defined limit. 
@@ -527,9 +579,23 @@ void SwingSchedulerDAG::schedule() { LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages << " : too many stages, abort\n"); NumFailLargeMaxStage++; + Pass.ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis( + DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader()) + << "Too many stages in schedule: " + << ore::NV("numStages", (int)numStages) << " > " + << ore::NV("SwpMaxStages", SwpMaxStages) + << ". Refer to -pipeliner-max-stages."; + }); return; } + Pass.ORE->emit([&]() { + return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(), + Loop.getHeader()) + << "Pipelined succesfully!"; + }); + // Generate the schedule as a ModuloSchedule. DenseMap Cycles, Stages; std::vector OrderedInsts; @@ -1080,7 +1146,7 @@ unsigned SwingSchedulerDAG::calculateResMII() { } } int Resmii = Resources.size(); - LLVM_DEBUG(dbgs() << "Retrun Res MII:" << Resmii << "\n"); + LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n"); // Delete the memory for each of the DFAs that were created earlier. for (ResourceManager *RI : Resources) { ResourceManager *D = RI; @@ -2052,9 +2118,16 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) { LLVM_DEBUG(dbgs() << "Schedule Found? 
" << scheduleFound << " (II=" << II << ")\n"); - if (scheduleFound) + if (scheduleFound) { Schedule.finalizeSchedule(this); - else + Pass.ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis( + DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader()) + << "Schedule found with Initiation Interval: " << ore::NV("II", II) + << ", MaxStageCount: " + << ore::NV("MaxStageCount", Schedule.getMaxStageCount()); + }); + } else Schedule.reset(); return scheduleFound && Schedule.getMaxStageCount() > 0; diff --git a/llvm/lib/Support/Path.cpp b/llvm/lib/Support/Path.cpp index 0a4d5818703b03..775629074f6c4b 100644 --- a/llvm/lib/Support/Path.cpp +++ b/llvm/lib/Support/Path.cpp @@ -540,15 +540,9 @@ void native(SmallVectorImpl &Path, Style style) { Path = PathHome; } } else { - for (auto PI = Path.begin(), PE = Path.end(); PI < PE; ++PI) { - if (*PI == '\\') { - auto PN = PI + 1; - if (PN < PE && *PN == '\\') - ++PI; // increment once, the for loop will move over the escaped slash - else - *PI = '/'; - } - } + for (auto PI = Path.begin(), PE = Path.end(); PI < PE; ++PI) + if (*PI == '\\') + *PI = '/'; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 2c8a5c40421054..7c83b6dcb44b94 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -89,6 +89,24 @@ def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs< (sequence "VGPR%u", 32, 255) >; +def CSR_AMDGPU_VGPRs : CalleeSavedRegs< + // The CSRs & scratch-registers are interleaved at a split boundary of 8. 
+ (add (sequence "VGPR%u", 40, 47), + (sequence "VGPR%u", 56, 63), + (sequence "VGPR%u", 72, 79), + (sequence "VGPR%u", 88, 95), + (sequence "VGPR%u", 104, 111), + (sequence "VGPR%u", 120, 127), + (sequence "VGPR%u", 136, 143), + (sequence "VGPR%u", 152, 159), + (sequence "VGPR%u", 168, 175), + (sequence "VGPR%u", 184, 191), + (sequence "VGPR%u", 200, 207), + (sequence "VGPR%u", 216, 223), + (sequence "VGPR%u", 232, 239), + (sequence "VGPR%u", 248, 255)) +>; + def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs< (sequence "SGPR%u", 32, 105) >; @@ -104,7 +122,7 @@ def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs< >; def CSR_AMDGPU_HighRegs : CalleeSavedRegs< - (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_105) + (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105) >; // Calling convention for leaf functions diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 2e6f021855f05a..a49b1ddbfe9ded 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2525,15 +2525,40 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Opc = UseMI.getOpcode(); if (Opc == AMDGPU::COPY) { - bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); + Register DstReg = UseMI.getOperand(0).getReg(); + bool Is16Bit = getOpSize(UseMI, 0) == 2; + bool isVGPRCopy = RI.isVGPR(*MRI, DstReg); unsigned NewOpc = isVGPRCopy ? 
AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; - if (RI.isAGPR(*MRI, UseMI.getOperand(0).getReg())) { - if (!isInlineConstant(*ImmOp, AMDGPU::OPERAND_REG_INLINE_AC_INT32)) + APInt Imm(32, ImmOp->getImm()); + + if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16) + Imm = Imm.ashr(16); + + if (RI.isAGPR(*MRI, DstReg)) { + if (!isInlineConstant(Imm)) return false; NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32; } + + if (Is16Bit) { + if (isVGPRCopy) + return false; // Do not clobber vgpr_hi16 + + if (DstReg.isVirtual() && + UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) + return false; + + UseMI.getOperand(0).setSubReg(0); + if (DstReg.isPhysical()) { + DstReg = RI.get32BitRegister(DstReg); + UseMI.getOperand(0).setReg(DstReg); + } + assert(UseMI.getOperand(1).getReg().isVirtual()); + } + UseMI.setDesc(get(NewOpc)); - UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); + UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue()); + UseMI.getOperand(1).setTargetFlags(0); UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); return true; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 9fcc5caf7dfdd2..8231a96f5f6b2c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -827,11 +827,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { const MachineOperand &MO = MI.getOperand(OpNo); if (MO.isReg()) { if (unsigned SubReg = MO.getSubReg()) { - assert(RI.getRegSizeInBits(*RI.getSubClassWithSubReg( - MI.getParent()->getParent()->getRegInfo(). 
- getRegClass(MO.getReg()), SubReg)) >= 32 && - "Sub-dword subregs are not supported"); - return RI.getNumChannelsFromSubReg(SubReg) * 4; + return RI.getSubRegIdxSize(SubReg) / 8; } } return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 1dac45a029b3f8..d6e082d64e7afd 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -807,7 +807,7 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, CostKind); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. - return BaseT::getScalarizationOverhead(Ty, Args) + Num * Cost; + return BaseT::getScalarizationOverhead(VTy, Args) + Num * Cost; } return BaseCost; @@ -899,7 +899,7 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, // The scalarization cost should be a lot higher. We use the number of vector // elements plus the scalarization overhead. 
unsigned ScalarCost = - NumElems * LT.first + BaseT::getScalarizationOverhead(DataTy, {}); + NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, {}); if (Alignment < EltSize / 8) return ScalarCost; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index b8571476d66ae0..99845ae7ca8452 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -115,7 +115,7 @@ unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const { return (8 * ST.getVectorLength()) / ElemWidth; } -unsigned HexagonTTIImpl::getScalarizationOverhead(Type *Ty, +unsigned HexagonTTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract) { return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 4b0625a67ffd50..b2191910a238f9 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -101,7 +101,7 @@ class HexagonTTIImpl : public BasicTTIImplBase { return true; } - unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts, + unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract); unsigned getOperandsScalarizationOverhead(ArrayRef Args, unsigned VF); diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 4bf03da45397e1..9ec7b07fc3f813 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -464,7 +464,8 @@ int SystemZTTIImpl::getArithmeticInstrCost( return DivInstrCost; } else if (ST->hasVector()) { - unsigned VF = cast(Ty)->getNumElements(); + auto *VTy = cast(Ty); + unsigned VF = 
VTy->getNumElements(); unsigned NumVectors = getNumVectorRegs(Ty); // These vector operations are custom handled, but are still supported @@ -477,7 +478,7 @@ int SystemZTTIImpl::getArithmeticInstrCost( if (DivRemConstPow2) return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1)); if (DivRemConst) - return VF * DivMulSeqCost + getScalarizationOverhead(Ty, Args); + return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args); if ((SignedDivRem || UnsignedDivRem) && VF > 4) // Temporary hack: disable high vectorization factors with integer // division/remainder, which will get scalarized and handled with @@ -500,7 +501,7 @@ int SystemZTTIImpl::getArithmeticInstrCost( // inserting and extracting the values. unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind); - unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args); + unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(VTy, Args); // FIXME: VF 2 for these FP operations are currently just as // expensive as for VF 4. if (VF == 2) @@ -517,7 +518,7 @@ int SystemZTTIImpl::getArithmeticInstrCost( // There is no native support for FRem. if (Opcode == Instruction::FRem) { - unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(Ty, Args); + unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args); // FIXME: VF 2 for float is currently just as expensive as for VF 4. 
if (VF == 2 && ScalarBits == 32) Cost *= 2; @@ -724,8 +725,9 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, } } else if (ST->hasVector()) { - assert (Dst->isVectorTy()); - unsigned VF = cast(Src)->getNumElements(); + auto *SrcVecTy = cast(Src); + auto *DstVecTy = cast(Dst); + unsigned VF = SrcVecTy->getNumElements(); unsigned NumDstVectors = getNumVectorRegs(Dst); unsigned NumSrcVectors = getNumVectorRegs(Src); @@ -781,8 +783,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI)) NeedsExtracts = false; - TotCost += getScalarizationOverhead(Src, false, NeedsExtracts); - TotCost += getScalarizationOverhead(Dst, NeedsInserts, false); + TotCost += getScalarizationOverhead(SrcVecTy, false, NeedsExtracts); + TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts, false); // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4. if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32) @@ -793,7 +795,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, if (Opcode == Instruction::FPTrunc) { if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements. - return VF /*ldxbr/lexbr*/ + getScalarizationOverhead(Dst, true, false); + return VF /*ldxbr/lexbr*/ + + getScalarizationOverhead(DstVecTy, true, false); else // double -> float return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/); } @@ -806,7 +809,7 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, return VF * 2; } // -> fp128. VF * lxdb/lxeb + extraction of elements. 
- return VF + getScalarizationOverhead(Src, false, true); + return VF + getScalarizationOverhead(SrcVecTy, false, true); } } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index f2f34f5f0bd10e..98f6988266057d 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2888,10 +2888,9 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; } -unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, +unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract) { - auto* VecTy = cast(Ty); unsigned Cost = 0; // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much @@ -2917,7 +2916,7 @@ unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, // 128-bit vector is free. // NOTE: This assumes legalization widens vXf32 vectors. if (MScalarTy == MVT::f32) - for (unsigned i = 0, e = VecTy->getNumElements(); i < e; i += 4) + for (unsigned i = 0, e = Ty->getNumElements(); i < e; i += 4) if (DemandedElts[i]) Cost--; } @@ -2933,7 +2932,7 @@ unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, // vector elements, which represents the number of unpacks we'll end up // performing. 
unsigned NumElts = LT.second.getVectorNumElements(); - unsigned Pow2Elts = PowerOf2Ceil(VecTy->getNumElements()); + unsigned Pow2Elts = PowerOf2Ceil(Ty->getNumElements()); Cost += (std::min(NumElts, Pow2Elts) - 1) * LT.first; } } @@ -2970,7 +2969,7 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, APInt DemandedElts = APInt::getAllOnesValue(NumElem); int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment, AddressSpace, CostKind); - int SplitCost = getScalarizationOverhead(Src, DemandedElts, + int SplitCost = getScalarizationOverhead(VTy, DemandedElts, Opcode == Instruction::Load, Opcode == Instruction::Store); return NumElem * Cost + SplitCost; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index eabd0f132363c6..ee9f3a67cd3be2 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -135,7 +135,7 @@ class X86TTIImpl : public BasicTTIImplBase { TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts, + unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract); int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp index c588ac83d2adc5..89eb1159c123c8 100644 --- a/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/llvm/lib/Transforms/IPO/Inliner.cpp @@ -93,6 +93,13 @@ static cl::opt DisableInlinedAllocaMerging("disable-inlined-alloca-merging", cl::init(false), cl::Hidden); +// An integer used to limit the cost of inline deferral. The default negative +// number tells shouldBeDeferred to only take the secondary cost into account. 
+static cl::opt + InlineDeferralScale("inline-deferral-scale", + cl::desc("Scale to limit the cost of inline deferral"), + cl::init(-1), cl::Hidden); + namespace { enum class InlinerFunctionImportStatsOpts { @@ -338,12 +345,8 @@ shouldBeDeferred(Function *Caller, InlineCost IC, int &TotalSecondaryCost, bool ApplyLastCallBonus = Caller->hasLocalLinkage() && !Caller->hasOneUse(); // This bool tracks what happens if we DO inline C into B. bool InliningPreventsSomeOuterInline = false; + unsigned NumCallerUsers = 0; for (User *U : Caller->users()) { - // If the caller will not be removed (either because it does not have a - // local linkage or because the LastCallToStaticBonus has been already - // applied), then we can exit the loop early. - if (!ApplyLastCallBonus && TotalSecondaryCost >= IC.getCost()) - return false; CallBase *CS2 = dyn_cast(U); // If this isn't a call to Caller (it could be some other sort @@ -369,8 +372,13 @@ shouldBeDeferred(Function *Caller, InlineCost IC, int &TotalSecondaryCost, if (IC2.getCostDelta() <= CandidateCost) { InliningPreventsSomeOuterInline = true; TotalSecondaryCost += IC2.getCost(); + NumCallerUsers++; } } + + if (!InliningPreventsSomeOuterInline) + return false; + // If all outer calls to Caller would get inlined, the cost for the last // one is set very low by getInlineCost, in anticipation that Caller will // be removed entirely. We did not account for this above unless there @@ -378,7 +386,14 @@ shouldBeDeferred(Function *Caller, InlineCost IC, int &TotalSecondaryCost, if (ApplyLastCallBonus) TotalSecondaryCost -= InlineConstants::LastCallToStaticBonus; - return InliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost(); + // If InlineDeferralScale is negative, then ignore the cost of primary + // inlining -- IC.getCost() multiplied by the number of callers to Caller. 
+ if (InlineDeferralScale < 0) + return TotalSecondaryCost < IC.getCost(); + + int TotalCost = TotalSecondaryCost + IC.getCost() * NumCallerUsers; + int Allowance = IC.getCost() * InlineDeferralScale; + return TotalCost < Allowance; } static std::basic_ostream &operator<<(std::basic_ostream &R, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 612f32ec034bae..b139f8520df321 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5702,9 +5702,9 @@ int LoopVectorizationCostModel::computePredInstDiscount( // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { - ScalarCost += - TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), - APInt::getAllOnesValue(VF), true, false); + ScalarCost += TTI.getScalarizationOverhead( + cast(ToVectorTy(I->getType(), VF)), + APInt::getAllOnesValue(VF), true, false); ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); } @@ -5720,8 +5720,8 @@ int LoopVectorizationCostModel::computePredInstDiscount( Worklist.push_back(J); else if (needsExtract(J, VF)) ScalarCost += TTI.getScalarizationOverhead( - ToVectorTy(J->getType(), VF), APInt::getAllOnesValue(VF), false, - true); + cast(ToVectorTy(J->getType(), VF)), + APInt::getAllOnesValue(VF), false, true); } // Scale the total scalar cost by block probability. @@ -6016,8 +6016,8 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, Type *RetTy = ToVectorTy(I->getType(), VF); if (!RetTy->isVoidTy() && (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) - Cost += TTI.getScalarizationOverhead(RetTy, APInt::getAllOnesValue(VF), - true, false); + Cost += TTI.getScalarizationOverhead( + cast(RetTy), APInt::getAllOnesValue(VF), true, false); // Some targets keep addresses scalar. 
if (isa(I) && !TTI.prefersVectorizedAddressing()) @@ -6222,7 +6222,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (ScalarPredicatedBB) { // Return cost for branches around scalarized and predicated blocks. - Type *Vec_i1Ty = + VectorType *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF), false, true) + diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 235efc450e37aa..008d4002dd835e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -666,6 +666,15 @@ class BoUpSLP { /// may not be necessary. bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const; + /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values + /// can be load combined in the backend. Load combining may not be allowed in + /// the IR optimizer, so we do not want to alter the pattern. For example, + /// partially transforming a scalar bswap() pattern into vector code is + /// effectively impossible for the backend to undo. + /// TODO: If load combining is allowed in the IR optimizer, this analysis + /// may not be necessary. + bool isLoadCombineCandidate() const; + OptimizationRemarkEmitter *getORE() { return ORE; } /// This structure holds any data we need about the edges being traversed @@ -3673,8 +3682,8 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { return true; } -static bool isLoadCombineCandidate(Value *Root, unsigned NumElts, - TargetTransformInfo *TTI) { +static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, + TargetTransformInfo *TTI) { // Look past the root to find a source value. Arbitrarily follow the // path through operand 0 of any 'or'. Also, peek through optional // shift-left-by-constant. 
@@ -3683,9 +3692,9 @@ static bool isLoadCombineCandidate(Value *Root, unsigned NumElts, match(ZextLoad, m_Shl(m_Value(), m_Constant()))) ZextLoad = cast(ZextLoad)->getOperand(0); - // Check if the input is an extended load. + // Check if the input is an extended load of the required or/shift expression. Value *LoadPtr; - if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) + if (ZextLoad == Root || !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) return false; // Require that the total load bit width is a legal integer type. @@ -3710,7 +3719,20 @@ bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const { unsigned NumElts = VectorizableTree[0]->Scalars.size(); Value *FirstReduced = VectorizableTree[0]->Scalars[0]; - return isLoadCombineCandidate(FirstReduced, NumElts, TTI); + return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI); +} + +bool BoUpSLP::isLoadCombineCandidate() const { + // Peek through a final sequence of stores and check if all operations are + // likely to be load-combined. 
+ unsigned NumElts = VectorizableTree[0]->Scalars.size(); + for (Value *Scalar : VectorizableTree[0]->Scalars) { + Value *X; + if (!match(Scalar, m_Store(m_Value(X), m_Value())) || + !isLoadCombineCandidateImpl(X, NumElts, TTI)) + return false; + } + return true; } bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { @@ -5758,6 +5780,8 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, } if (R.isTreeTinyAndNotFullyVectorizable()) return false; + if (R.isLoadCombineCandidate()) + return false; R.computeMinimumValueSizes(); @@ -6010,6 +6034,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, } if (R.isTreeTinyAndNotFullyVectorizable()) continue; + if (R.isLoadCombineCandidate()) + return false; R.computeMinimumValueSizes(); int Cost = R.getTreeCost() - UserCost; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index 7b6863fb17a5f5..7f6114b1e98521 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -727,9 +727,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GPRIDX-NEXT: s_mov_b32 s18, 0 ; GPRIDX-NEXT: s_mov_b32 s19, 0x40200000 -; GPRIDX-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GPRIDX-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GPRIDX-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill ; GPRIDX-NEXT: s_mov_b32 s17, 0x401c0000 ; GPRIDX-NEXT: s_mov_b32 s16, s18 ; GPRIDX-NEXT: s_mov_b32 s15, 0x40180000 @@ -793,9 +790,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off -; GPRIDX-NEXT: buffer_load_dword 
v34, off, s[0:3], s32 ; 4-byte Folded Reload -; GPRIDX-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GPRIDX-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_setpc_b64 s[30:31] ; @@ -816,9 +810,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; MOVREL-NEXT: s_mov_b32 s8, s18 ; MOVREL-NEXT: s_mov_b64 s[6:7], 2.0 ; MOVREL-NEXT: s_mov_b64 s[4:5], 1.0 -; MOVREL-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; MOVREL-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; MOVREL-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill ; MOVREL-NEXT: v_mov_b32_e32 v34, s19 ; MOVREL-NEXT: v_mov_b32_e32 v33, s18 ; MOVREL-NEXT: v_mov_b32_e32 v32, s17 @@ -868,10 +859,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off -; MOVREL-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload -; MOVREL-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; MOVREL-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; MOVREL-NEXT: s_waitcnt vmcnt(0) ; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll index e123d80fb95693..9321ac0f4e6350 100644 --- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -69,12 +69,12 @@ bb1: } ; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf: -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:36 -; GCN-DAG: v_writelane_b32 v33, s34, +; GCN: buffer_store_dword v41, off, s[0:3], s32 offset:36 +; 
GCN-DAG: v_writelane_b32 v41, s34, ; GCN: s_mov_b32 s34, s32 ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}} ; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}} -; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:32 +; GCN-DAG: buffer_store_dword v40, off, s[0:3], s34 offset:32 ; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]] @@ -89,10 +89,10 @@ bb1: ; GCN: v_readlane_b32 ; GCN-NOT: v_readlane_b32 s32 -; GCN-DAG: buffer_load_dword v32, off, s[0:3], s34 offset:32 +; GCN-DAG: buffer_load_dword v40, off, s[0:3], s34 offset:32 ; GCN: s_sub_u32 s32, s32, 0xc00{{$}} -; GCN: v_readlane_b32 s34, v33, -; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN: v_readlane_b32 s34, v41, +; GCN-DAG: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GCN: s_setpc_b64 define void @void_func_byval_struct_non_leaf(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index a9222d7663f521..c1f639fb6f8254 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -762,17 +762,13 @@ entry: ; GCN-LABEL: {{^}}tail_call_byval_align16: ; GCN-NOT: s32 -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GCN: s_getpc_b64 ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}} -; GCN: 
buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { @@ -784,15 +780,11 @@ entry: ; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64: ; GCN-NOT: s32 -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GCN: buffer_load_dword v33, off, s[0:3], s32{{$}} ; GCN: s_getpc_b64 ; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}} ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll index de0086495870cc..79722b090323c2 100644 --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -13,15 +13,15 @@ define void @use_vcc() #1 { } ; GCN-LABEL: {{^}}indirect_use_vcc: -; GCN: v_writelane_b32 v32, s34, 2 -; GCN: v_writelane_b32 v32, s30, 0 -; GCN: v_writelane_b32 v32, s31, 1 +; GCN: v_writelane_b32 v40, s34, 2 +; GCN: v_writelane_b32 v40, s30, 0 +; GCN: v_writelane_b32 v40, s31, 1 ; GCN: s_swappc_b64 -; GCN: v_readlane_b32 s4, v32, 0 -; GCN: v_readlane_b32 s5, v32, 1 -; GCN: v_readlane_b32 s34, v32, 2 +; GCN: v_readlane_b32 s4, v40, 0 +; GCN: v_readlane_b32 s5, v40, 1 +; GCN: v_readlane_b32 s34, v40, 2 ; GCN: ; NumSgprs: 37 -; GCN: ; NumVgprs: 33 +; GCN: ; NumVgprs: 41 define void @indirect_use_vcc() #1 { call void @use_vcc() 
ret void @@ -32,7 +32,7 @@ define void @indirect_use_vcc() #1 { ; CI: ; NumSgprs: 39 ; VI-NOBUG: ; NumSgprs: 41 ; VI-BUG: ; NumSgprs: 96 -; GCN: ; NumVgprs: 33 +; GCN: ; NumVgprs: 41 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(i32 addrspace(1)* %out) #0 { call void @indirect_use_vcc() ret void @@ -50,7 +50,7 @@ define void @use_flat_scratch() #1 { ; GCN-LABEL: {{^}}indirect_use_flat_scratch: ; CI: ; NumSgprs: 39 ; VI: ; NumSgprs: 41 -; GCN: ; NumVgprs: 33 +; GCN: ; NumVgprs: 41 define void @indirect_use_flat_scratch() #1 { call void @use_flat_scratch() ret void @@ -61,7 +61,7 @@ define void @indirect_use_flat_scratch() #1 { ; CI: ; NumSgprs: 39 ; VI-NOBUG: ; NumSgprs: 41 ; VI-BUG: ; NumSgprs: 96 -; GCN: ; NumVgprs: 33 +; GCN: ; NumVgprs: 41 define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(i32 addrspace(1)* %out) #0 { call void @indirect_use_flat_scratch() ret void @@ -76,7 +76,7 @@ define void @use_10_vgpr() #1 { } ; GCN-LABEL: {{^}}indirect_use_10_vgpr: -; GCN: ; NumVgprs: 33 +; GCN: ; NumVgprs: 41 define void @indirect_use_10_vgpr() #0 { call void @use_10_vgpr() ret void @@ -84,23 +84,23 @@ define void @indirect_use_10_vgpr() #0 { ; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr: ; GCN: is_dynamic_callstack = 0 -; GCN: ; NumVgprs: 33 +; GCN: ; NumVgprs: 41 define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 { call void @indirect_use_10_vgpr() ret void } -; GCN-LABEL: {{^}}use_40_vgpr: -; GCN: ; NumVgprs: 40 -define void @use_40_vgpr() #1 { - call void asm sideeffect "", "~{v39}"() #0 +; GCN-LABEL: {{^}}use_50_vgpr: +; GCN: ; NumVgprs: 50 +define void @use_50_vgpr() #1 { + call void asm sideeffect "", "~{v49}"() #0 ret void } -; GCN-LABEL: {{^}}indirect_use_40_vgpr: -; GCN: ; NumVgprs: 40 -define void @indirect_use_40_vgpr() #0 { - call void @use_40_vgpr() +; GCN-LABEL: {{^}}indirect_use_50_vgpr: +; GCN: ; NumVgprs: 50 +define void @indirect_use_50_vgpr() #0 { + call void @use_50_vgpr() ret void } diff --git 
a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll index ee77007ef59edb..05e887345637b3 100644 --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -5,7 +5,6 @@ declare hidden void @external_void_func_void() #0 ; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: -; GCN: s_mov_b32 s33, s7 ; GCN: s_getpc_b64 s[34:35] ; GCN-NEXT: s_add_u32 s34, s34, ; GCN-NEXT: s_addc_u32 s35, s35, @@ -24,22 +23,22 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_ ; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: ; GCN: buffer_store_dword -; GCN: v_writelane_b32 v32, s34, 4 -; GCN: v_writelane_b32 v32, s36, 0 -; GCN: v_writelane_b32 v32, s37, 1 -; GCN: v_writelane_b32 v32, s30, 2 -; GCN: v_writelane_b32 v32, s31, 3 +; GCN: v_writelane_b32 v40, s34, 4 +; GCN: v_writelane_b32 v40, s36, 0 +; GCN: v_writelane_b32 v40, s37, 1 +; GCN: v_writelane_b32 v40, s30, 2 +; GCN: v_writelane_b32 v40, s31, 3 ; GCN: s_swappc_b64 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_swappc_b64 -; GCN-DAG: v_readlane_b32 s4, v32, 2 -; GCN-DAG: v_readlane_b32 s5, v32, 3 -; GCN: v_readlane_b32 s37, v32, 1 -; GCN: v_readlane_b32 s36, v32, 0 +; GCN-DAG: v_readlane_b32 s4, v40, 2 +; GCN-DAG: v_readlane_b32 s5, v40, 3 +; GCN: v_readlane_b32 s37, v40, 1 +; GCN: v_readlane_b32 s36, v40, 0 -; GCN: v_readlane_b32 s34, v32, 4 +; GCN: v_readlane_b32 s34, v40, 4 ; GCN: buffer_load_dword ; GCN: s_setpc_b64 define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { @@ -50,16 +49,16 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa } ; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: -; GCN: buffer_store_dword v32 -; GCN: v_writelane_b32 v32, s34, 4 +; GCN: 
buffer_store_dword v40 +; GCN: v_writelane_b32 v40, s34, 4 ; GCN: s_mov_b32 s34, s32 ; GCN: s_add_u32 s32, s32, 0x400 ; GCN: s_swappc_b64 ; GCN-NEXT: s_swappc_b64 -; GCN: v_readlane_b32 s34, v32, 4 -; GCN: buffer_load_dword v32, +; GCN: v_readlane_b32 s34, v40, 4 +; GCN: buffer_load_dword v40, define void @test_func_call_external_void_funcx2() #0 { call void @external_void_func_void() call void @external_void_func_void() @@ -116,9 +115,9 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace } ; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31: -; GCN: v_mov_b32_e32 v32, v31 +; GCN: v_mov_b32_e32 v40, v31 ; GCN-NEXT: s_swappc_b64 -; GCN-NEXT: v_mov_b32_e32 v31, v32 +; GCN-NEXT: v_mov_b32_e32 v31, v40 define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 { %v31 = call i32 asm sideeffect "; def $0", "={v31}"() call void @external_void_func_void() @@ -129,8 +128,6 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace ; FIXME: What is the expected behavior for reserved registers here? 
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33: -; GCN: s_mov_b32 s33, s9 -; GCN: s_mov_b32 s32, s33 ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4 @@ -150,14 +147,13 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace( ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: -; GCN: s_mov_b32 s33, s9 -; GCN-NOT: s34 +; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}} ; GCN-NOT: s34 ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4 +; GCN: s_mov_b32 s32, s33 ; GCN-NOT: s34 ; GCN: ;;#ASMSTART @@ -180,32 +176,31 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace( ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32: -; GCN: s_mov_b32 s33, s9 +; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}} ; GCN-NOT: v32 ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4 -; GCN-NOT: v32 -; GCN-DAG: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s32, s33 +; GCN-NOT: v40 ; GCN: ;;#ASMSTART -; GCN-NEXT: ; def v32 +; GCN-NEXT: ; def v40 ; GCN-NEXT: ;;#ASMEND ; GCN: s_swappc_b64 s[30:31], s[4:5] -; GCN-NOT: v32 +; GCN-NOT: v40 ; GCN: ;;#ASMSTART -; GCN-NEXT: ; use v32 +; GCN-NEXT: ; use v40 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_endpgm -define amdgpu_kernel void @test_call_void_func_void_preserves_v32(i32 addrspace(1)* %out) #0 { - %v32 = call i32 asm sideeffect "; def $0", "={v32}"() +define amdgpu_kernel void @test_call_void_func_void_preserves_v40(i32 addrspace(1)* %out) #0 { + %v40 = call i32 asm sideeffect "; def $0", "={v40}"() call void @external_void_func_void() - call void asm sideeffect "; use $0", "{v32}"(i32 %v32) + call void asm sideeffect 
"; use $0", "{v40}"(i32 %v40) ret void } @@ -234,8 +229,6 @@ define hidden void @void_func_void_clobber_s34() #2 { } ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33: -; GCN: s_mov_b32 s33, s7 - ; GCN: s_getpc_b64 ; GCN-NEXT: s_add_u32 ; GCN-NEXT: s_addc_u32 @@ -248,7 +241,6 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 { } ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s34: -; GCN: s_mov_b32 s33, s7 ; GCN: s_getpc_b64 ; GCN-NEXT: s_add_u32 ; GCN-NEXT: s_addc_u32 @@ -262,12 +254,12 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 { ; GCN-LABEL: {{^}}callee_saved_sgpr_func: ; GCN-NOT: s40 -; GCN: v_writelane_b32 v32, s40 +; GCN: v_writelane_b32 v40, s40 ; GCN: s_swappc_b64 ; GCN-NOT: s40 ; GCN: ; use s40 ; GCN-NOT: s40 -; GCN: v_readlane_b32 s40, v32 +; GCN: v_readlane_b32 s40, v40 ; GCN-NOT: s40 define void @callee_saved_sgpr_func() #2 { %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 @@ -294,19 +286,19 @@ define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 { ; First call preserved VGPR is used so it can't be used for SGPR spills. 
; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_func: ; GCN-NOT: s40 -; GCN: v_writelane_b32 v33, s40 +; GCN: v_writelane_b32 v41, s40 ; GCN: s_swappc_b64 ; GCN-NOT: s40 ; GCN: ; use s40 ; GCN-NOT: s40 -; GCN: v_readlane_b32 s40, v33 +; GCN: v_readlane_b32 s40, v41 ; GCN-NOT: s40 define void @callee_saved_sgpr_vgpr_func() #2 { %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 - %v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0 + %v40 = call i32 asm sideeffect "; def v40", "={v40}"() #0 call void @external_void_func_void() call void asm sideeffect "; use $0", "s"(i32 %s40) #0 - call void asm sideeffect "; use $0", "v"(i32 %v32) #0 + call void asm sideeffect "; use $0", "v"(i32 %v40) #0 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll index 72423ec4189e5e..7391d7bbdcb0f3 100644 --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -61,11 +61,11 @@ define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32) ; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+4 ; GCN-NEXT: s_mov_b32 s32, s33 -; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NEXT: global_store_dword v[0:1], v32, off +; GCN-NEXT: global_store_dword v[0:1], v40, off ; GCN-NEXT: s_endpgm call void @func(i32 0) store i32 0, i32 addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll index c42cadbc80c570..0331881f01280a 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -127,8 +127,8 @@ define void @callee_with_stack_and_call() #0 { ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 ; GCN: s_swappc_b64 -; GCN-DAG: v_readlane_b32 s4, v32, 0 -; GCN-DAG: v_readlane_b32 s5, v32, 1 
+; GCN-DAG: v_readlane_b32 s4, v40, 0 +; GCN-DAG: v_readlane_b32 s5, v40, 1 ; GCN: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: v_readlane_b32 s34, [[CSR_VGPR]], [[FP_SPILL_LANE]] @@ -168,6 +168,7 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0 call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0 call void asm sideeffect "", "~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() #0 + call void asm sideeffect "", "~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() #0 %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 @@ -207,14 +208,14 @@ define void @spill_only_csr_sgpr() { ; GCN-NEXT:s_mov_b32 [[FP_COPY:s[0-9]+]], s34 ; GCN-NEXT: s_mov_b32 s34, s32 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN-DAG: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v41, off, s[0:3], s34 ; 4-byte Folded Spill ; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s34 offset:8 ; GCN: ;;#ASMSTART -; GCN-NEXT: ; clobber v33 +; GCN-NEXT: ; clobber v41 ; GCN-NEXT: ;;#ASMEND -; GCN: buffer_load_dword v33, off, s[0:3], s34 ; 4-byte Folded Reload +; GCN: buffer_load_dword v41, off, s[0:3], s34 ; 4-byte Folded Reload ; GCN: s_add_u32 s32, s32, 0x300 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300 ; GCN-NEXT: s_mov_b32 s34, s4 @@ -223,7 +224,7 @@ define void @spill_only_csr_sgpr() { define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca - call void asm sideeffect "; clobber v33", "~{v33}"() + call void asm sideeffect "; clobber v41", "~{v41}"() ret void } @@ -232,7 +233,7 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { ; GCN: s_waitcnt ; GCN-NEXT: v_writelane_b32 v1, s34, 63 ; GCN-NEXT: s_mov_b32 s34, s32 -; GCN: buffer_store_dword 
v33, off, s[0:3], s34 ; 4-byte Folded Spill +; GCN: buffer_store_dword v41, off, s[0:3], s34 ; 4-byte Folded Spill ; GCN-COUNT-63: v_writelane_b32 v1 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset:8 ; GCN: ;;#ASMSTART @@ -246,7 +247,7 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { define void @last_lane_vgpr_for_fp_csr() #1 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca - call void asm sideeffect "; clobber v33", "~{v33}"() + call void asm sideeffect "; clobber v41", "~{v41}"() call void asm sideeffect "", "~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49} ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59} @@ -264,14 +265,14 @@ define void @last_lane_vgpr_for_fp_csr() #1 { ; GCN: s_waitcnt ; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s34 ; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s34 ; 4-byte Folded Spill ; GCN-COUNT-64: v_writelane_b32 v1, ; GCN: buffer_store_dword ; GCN: ;;#ASMSTART ; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1 -; GCN: buffer_load_dword v33, off, s[0:3], s34 ; 4-byte Folded Reload +; GCN: buffer_load_dword v41, off, s[0:3], s34 ; 4-byte Folded Reload ; GCN: s_add_u32 s32, s32, 0x300 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300 ; GCN-NEXT: s_mov_b32 s34, [[FP_COPY]] @@ -280,7 +281,7 @@ define void @last_lane_vgpr_for_fp_csr() #1 { define void @no_new_vgpr_for_fp_csr() #1 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca - call void asm sideeffect "; clobber v33", "~{v33}"() + call void asm sideeffect "; clobber v41", "~{v41}"() call void asm sideeffect "", "~{s39},~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49} ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59} @@ -347,20 +348,20 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 { ; GCN-NEXT: s_or_saveexec_b64 
[[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 +; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s34, 2 +; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-DAG: v_writelane_b32 v32, s31, 1 +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 ; GCN-DAG: buffer_store_dword ; GCN: s_add_u32 s32, s32, 0x300{{$}} ; GCN: ;;#ASMSTART -; GCN: v_readlane_b32 s4, v32, 0 -; GCN-NEXT: v_readlane_b32 s5, v32, 1 +; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0 +; GCN-NEXT: v_readlane_b32 s5, [[CSR_VGPR]], 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300{{$}} -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s34, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] @@ -377,11 +378,11 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29} ,~{s30},~{s31}"() #0 - call void asm sideeffect "; clobber nonpreserved VGPRs", + call void asm sideeffect "; clobber nonpreserved initial VGPRs", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} - ,~{v30},~{v31}"() #1 + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() #1 ret void } @@ -394,19 +395,19 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, 
[[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 +; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s34, 2 +; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-DAG: v_writelane_b32 v32, s31, 1 +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 ; GCN-DAG: s_add_u32 s32, s32, 0x40300{{$}} ; GCN-DAG: buffer_store_dword ; GCN: ;;#ASMSTART -; GCN: v_readlane_b32 s4, v32, 0 -; GCN-NEXT: v_readlane_b32 s5, v32, 1 +; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0 +; GCN-NEXT: v_readlane_b32 s5, [[CSR_VGPR]], 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x40300{{$}} -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s34, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Reload @@ -429,7 +430,7 @@ define void @scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval ali "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} - ,~{v30},~{v31}"() #1 + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() #1 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll index 497ea354fc098b..80a0b7892d10f9 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -144,7 +144,7 @@ define hidden void @use_workgroup_id_yz() #1 { ; GCN-NOT: s12 ; GCN-NOT: s13 ; GCN-NOT: s14 -; GCN: v_readlane_b32 s4, v32, 0 +; GCN: v_readlane_b32 s4, v40, 0 define hidden void @func_indirect_use_workgroup_id_x() #1 { call void @use_workgroup_id_x() ret void @@ -152,7 +152,7 @@ define 
hidden void @func_indirect_use_workgroup_id_x() #1 { ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_y: ; GCN-NOT: s4 -; GCN: v_readlane_b32 s4, v32, 0 +; GCN: v_readlane_b32 s4, v40, 0 define hidden void @func_indirect_use_workgroup_id_y() #1 { call void @use_workgroup_id_y() ret void @@ -160,7 +160,7 @@ define hidden void @func_indirect_use_workgroup_id_y() #1 { ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_z: ; GCN-NOT: s4 -; GCN: v_readlane_b32 s4, v32, 0 +; GCN: v_readlane_b32 s4, v40, 0 define hidden void @func_indirect_use_workgroup_id_z() #1 { call void @use_workgroup_id_z() ret void diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll index 601ed9698c6185..dd7ed3bbedf4ba 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -307,7 +307,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_yz() #1 { ; Argument is in right place already ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_x: ; GCN-NOT: s4 -; GCN: v_readlane_b32 s4, v32, 0 +; GCN: v_readlane_b32 s4, v40, 0 define hidden void @func_indirect_use_workgroup_id_x() #1 { call void @use_workgroup_id_x() ret void @@ -315,7 +315,7 @@ define hidden void @func_indirect_use_workgroup_id_x() #1 { ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_y: ; GCN-NOT: s4 -; GCN: v_readlane_b32 s4, v32, 0 +; GCN: v_readlane_b32 s4, v40, 0 define hidden void @func_indirect_use_workgroup_id_y() #1 { call void @use_workgroup_id_y() ret void @@ -323,7 +323,7 @@ define hidden void @func_indirect_use_workgroup_id_y() #1 { ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_z: ; GCN-NOT: s4 -; GCN: v_readlane_b32 s4, v32, 0 +; GCN: v_readlane_b32 s4, v40, 0 define hidden void @func_indirect_use_workgroup_id_z() #1 { call void @use_workgroup_id_z() ret void diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll 
b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll index 421d41294a28d5..c6add9d7c9fd4c 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -396,13 +396,11 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 { } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x: -; VARABI: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}} ; VARABI: v_and_b32_e32 v32, 0x3ff, v32 ; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VARABI-NEXT: s_waitcnt +; VARABI: s_waitcnt ; VARABI-NEXT: s_setpc_b64 ; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31 @@ -457,14 +455,12 @@ define void @too_many_args_use_workitem_id_x( ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x: ; VARABI: enable_vgpr_workitem_id = 0 -; VARABI: s_mov_b32 s33, s7 ; VARABI: s_mov_b32 s32, s33 ; VARABI: buffer_store_dword v0, off, s[0:3], s32{{$}} ; VARABI: s_swappc_b64 ; FIXEDABI: enable_vgpr_workitem_id = 2 -; FIXEDABI: s_mov_b32 s33, s17 ; FIXEDABI-DAG: s_mov_b32 s32, s33 ; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}} ; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 @@ -516,15 +512,15 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { ; Requires loading and storing to stack slot. 
; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x: ; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}} -; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-DAG: buffer_load_dword v32, off, s[0:3], s34{{$}} ; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}} ; GCN: s_swappc_b64 -; GCN: buffer_load_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload ; GCN: s_sub_u32 s32, s32, 0x400{{$}} +; GCN: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN: s_setpc_b64 define void @too_many_args_call_too_many_args_use_workitem_id_x( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, @@ -545,13 +541,11 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x( ; frame[2] = VGPR spill slot ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval: -; VARABI: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VARABI-NEXT: s_waitcnt ; VARABI-NEXT: v_and_b32_e32 v32, 0x3ff, v32 ; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32 ; VARABI: buffer_load_dword v0, off, s[0:3], s32{{$}} -; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VARABI: s_setpc_b64 @@ -616,8 +610,7 @@ define void @too_many_args_use_workitem_id_x_byval( ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval: ; VARABI: enable_vgpr_workitem_id = 0 -; VARABI-DAG: s_mov_b32 s33, s7 -; VARABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} ; VARABI: buffer_store_dword [[K]], off, s[0:3], s33 offset:4 ; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4 ; VARABI: s_add_u32 s32, s33, 0x400{{$}} @@ -630,9 +623,8 @@ define void @too_many_args_use_workitem_id_x_byval( ; VARABI: s_swappc_b64 -; 
FIXEDABI: s_mov_b32 s33, s17 -; FIXEDABI-DAG: s_add_u32 s32, s33, 0x400 -; FIXEDABI-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7 +; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7 +; FIXEDABI: s_add_u32 s32, s33, 0x400{{$}} ; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33 offset:4{{$}} ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140 @@ -703,10 +695,7 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { ret void } -; Only one stack load should be emitted for all 3 values. ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz: -; VARABI: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VARABI: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VARABI-NOT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} ; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}} ; VARABI-NOT: buffer_load_dword @@ -720,9 +709,7 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { ; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]] ; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]] -; VARABI: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VARABI-NEXT: s_waitcnt +; VARABI: s_waitcnt ; VARABI-NEXT: s_setpc_b64 @@ -789,8 +776,6 @@ define void @too_many_args_use_workitem_id_xyz( ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz: ; GCN: enable_vgpr_workitem_id = 2 -; VARABI-DAG: s_mov_b32 s33, s7 -; FIXEDABI-DAG: s_mov_b32 s33, s17 ; GCN-DAG: s_mov_b32 s32, s33 ; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 @@ -831,7 +816,7 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 { ; GCN: s_waitcnt ; GCN-NEXT: s_setpc_b64 -; GCN: ScratchSize: 8 +; GCN: ScratchSize: 0 define void @too_many_args_use_workitem_id_x_stack_yz( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 
%arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, @@ -885,9 +870,6 @@ define void @too_many_args_use_workitem_id_x_stack_yz( ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz: ; GCN: enable_vgpr_workitem_id = 2 -; VARABI: s_mov_b32 s33, s7 -; FIXEDABI: s_mov_b32 s33, s17 - ; GCN-NOT: v0 ; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-DAG: v_or_b32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll index e880d25392d5c3..e5e75c38dad127 100644 --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -28,23 +28,23 @@ define float @call_split_type_used_outside_block_v2f32() #0 { ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s34, 2 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v2f32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func_v2f32@rel32@hi+4 -; GCN-NEXT: v_writelane_b32 v32, s31, 1 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s4, v32, 0 -; GCN-NEXT: v_readlane_b32 s5, v32, 1 +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded 
Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -62,23 +62,23 @@ define float @call_split_type_used_outside_block_v3f32() #0 { ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s34, 2 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v3f32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func_v3f32@rel32@hi+4 -; GCN-NEXT: v_writelane_b32 v32, s31, 1 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s4, v32, 0 -; GCN-NEXT: v_readlane_b32 s5, v32, 1 +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -96,23 +96,23 @@ define half @call_split_type_used_outside_block_v4f16() #0 { ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, 
s[4:5] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s34, 2 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v4f16@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func_v4f16@rel32@hi+4 -; GCN-NEXT: v_writelane_b32 v32, s31, 1 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s4, v32, 0 -; GCN-NEXT: v_readlane_b32 s5, v32, 1 +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -130,24 +130,24 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 { ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s34, 2 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_struct@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func_struct@rel32@hi+4 -; GCN-NEXT: v_writelane_b32 v32, s31, 1 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s4, v32, 0 -; 
GCN-NEXT: v_readlane_b32 s5, v32, 1 +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: v_mov_b32_e32 v1, v4 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/fold_16bit_imm.mir b/llvm/test/CodeGen/AMDGPU/fold_16bit_imm.mir new file mode 100644 index 00000000000000..458bdcef1a584e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fold_16bit_imm.mir @@ -0,0 +1,257 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs -run-pass peephole-opt -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: fold_simm_16_sub_to_lo +body: | + bb.0: + + ; GCN-LABEL: name: fold_simm_16_sub_to_lo + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: [[COPY:%[0-9]+]]:sgpr_lo16 = COPY killed [[S_MOV_B32_]].lo16 + ; GCN: SI_RETURN_TO_EPILOG [[COPY]] + %0:sreg_32 = S_MOV_B32 2048 + %1:sgpr_lo16 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_simm_16_sub_to_sub +body: | + bb.0: + + ; GCN-LABEL: name: fold_simm_16_sub_to_sub + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: SI_RETURN_TO_EPILOG [[S_MOV_B32_1]] + %0:sreg_32 = S_MOV_B32 2048 + %1.lo16:sreg_32 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG %1 + +... 
+ +--- +name: fold_simm_16_sub_to_phys +body: | + bb.0: + + ; GCN-LABEL: name: fold_simm_16_sub_to_phys + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: $sgpr0 = S_MOV_B32 2048 + ; GCN: SI_RETURN_TO_EPILOG $sgpr0_lo16 + %0:sreg_32 = S_MOV_B32 2048 + $sgpr0_lo16 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG $sgpr0_lo16 + +... + +--- +name: fold_aimm_16_sub_to_sub_2048 +body: | + bb.0: + + ; GCN-LABEL: name: fold_aimm_16_sub_to_sub_2048 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: %1.lo16:agpr_32 = COPY killed [[S_MOV_B32_]].lo16 + ; GCN: SI_RETURN_TO_EPILOG %1 + %0:sreg_32 = S_MOV_B32 2048 + %1.lo16:agpr_32 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_aimm_16_sub_to_sub_0 +body: | + bb.0: + + ; GCN-LABEL: name: fold_aimm_16_sub_to_sub_0 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: [[V_ACCVGPR_WRITE_B32_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec + ; GCN: SI_RETURN_TO_EPILOG [[V_ACCVGPR_WRITE_B32_]] + %0:sreg_32 = S_MOV_B32 0 + %1.lo16:agpr_32 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_aimm_16_sub_to_phys +body: | + bb.0: + + ; GCN-LABEL: name: fold_aimm_16_sub_to_phys + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32 0, implicit $exec + ; GCN: SI_RETURN_TO_EPILOG $agpr0_lo16 + %0:sreg_32 = S_MOV_B32 0 + $agpr0_lo16 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG $agpr0_lo16 + +... + +--- +name: fold_vimm_16_sub_to_lo +body: | + bb.0: + + ; GCN-LABEL: name: fold_vimm_16_sub_to_lo + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: [[COPY:%[0-9]+]]:vgpr_lo16 = COPY killed [[S_MOV_B32_]].lo16 + ; GCN: SI_RETURN_TO_EPILOG [[COPY]] + %0:sreg_32 = S_MOV_B32 2048 + %1:vgpr_lo16 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG %1 + +... 
+ +--- +name: fold_vimm_16_sub_to_sub +body: | + bb.0: + + ; GCN-LABEL: name: fold_vimm_16_sub_to_sub + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: %1.lo16:vgpr_32 = COPY killed [[S_MOV_B32_]].lo16 + ; GCN: SI_RETURN_TO_EPILOG %1 + %0:sreg_32 = S_MOV_B32 2048 + %1.lo16:vgpr_32 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_vimm_16_sub_to_phys +body: | + bb.0: + + ; GCN-LABEL: name: fold_vimm_16_sub_to_phys + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: $vgpr0_lo16 = COPY killed [[S_MOV_B32_]].lo16 + ; GCN: SI_RETURN_TO_EPILOG $vgpr0_lo16 + %0:sreg_32 = S_MOV_B32 2048 + $vgpr0_lo16 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG $vgpr0_lo16 + +... + +--- +name: fold_vimm_16_lo_to_hi +body: | + bb.0: + + ; GCN-LABEL: name: fold_vimm_16_lo_to_hi + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: %1.hi16:vgpr_32 = COPY killed [[S_MOV_B32_]].lo16 + ; GCN: SI_RETURN_TO_EPILOG %1 + %0:sreg_32 = S_MOV_B32 2048 + %1.hi16:vgpr_32 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_vimm_16_hi_to_lo +body: | + bb.0: + + ; GCN-LABEL: name: fold_vimm_16_hi_to_lo + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: %1.lo16:vgpr_32 = COPY killed [[S_MOV_B32_]].hi16 + ; GCN: SI_RETURN_TO_EPILOG %1 + %0:sreg_32 = S_MOV_B32 2048 + %1.lo16:vgpr_32 = COPY killed %0.hi16 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_simm_16_sub_to_sub_lo_to_hi +body: | + bb.0: + + ; GCN-LABEL: name: fold_simm_16_sub_to_sub_lo_to_hi + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: %1.hi16:sreg_32 = COPY killed [[S_MOV_B32_]].lo16 + ; GCN: SI_RETURN_TO_EPILOG %1 + %0:sreg_32 = S_MOV_B32 2048 + %1.hi16:sreg_32 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG %1 + +... 
+ +--- +name: fold_simm_16_sub_to_sub_hi_to_lo_2048 +body: | + bb.0: + + ; GCN-LABEL: name: fold_simm_16_sub_to_sub_hi_to_lo_2048 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: SI_RETURN_TO_EPILOG [[S_MOV_B32_1]] + %0:sreg_32 = S_MOV_B32 2048 + %1.lo16:sreg_32 = COPY killed %0.hi16 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_simm_16_sub_to_sub_hi_to_lo_shifted_2048 +body: | + bb.0: + + ; GCN-LABEL: name: fold_simm_16_sub_to_sub_hi_to_lo_shifted_2048 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 134217728 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: SI_RETURN_TO_EPILOG [[S_MOV_B32_1]] + %0:sreg_32 = S_MOV_B32 134217728 + %1.lo16:sreg_32 = COPY killed %0.hi16 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_aimm_16_sub_to_sub_hi_to_lo_2048 +body: | + bb.0: + + ; GCN-LABEL: name: fold_aimm_16_sub_to_sub_hi_to_lo_2048 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: [[V_ACCVGPR_WRITE_B32_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec + ; GCN: SI_RETURN_TO_EPILOG [[V_ACCVGPR_WRITE_B32_]] + %0:sreg_32 = S_MOV_B32 2048 + %1.lo16:agpr_32 = COPY killed %0.hi16 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_aimm_16_sub_to_sub_hi_to_lo_shifted_1 +body: | + bb.0: + + ; GCN-LABEL: name: fold_aimm_16_sub_to_sub_hi_to_lo_shifted_1 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65536 + ; GCN: [[V_ACCVGPR_WRITE_B32_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 1, implicit $exec + ; GCN: SI_RETURN_TO_EPILOG [[V_ACCVGPR_WRITE_B32_]] + %0:sreg_32 = S_MOV_B32 65536 + %1.lo16:agpr_32 = COPY killed %0.hi16 + SI_RETURN_TO_EPILOG %1 + +... 
+ +--- +name: fold_aimm_16_sub_to_sub_hi_to_lo_shifted_2048 +body: | + bb.0: + + ; GCN-LABEL: name: fold_aimm_16_sub_to_sub_hi_to_lo_shifted_2048 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 134217728 + ; GCN: %1.lo16:agpr_32 = COPY killed [[S_MOV_B32_]].hi16 + ; GCN: SI_RETURN_TO_EPILOG %1 + %0:sreg_32 = S_MOV_B32 134217728 + %1.lo16:agpr_32 = COPY killed %0.hi16 + SI_RETURN_TO_EPILOG %1 + +... diff --git a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll index cc3264af104639..a0a78e96b920a6 100644 --- a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll @@ -3,7 +3,7 @@ ; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}} define void @csr() #0 { - call void asm sideeffect "", "~{v0},~{v36},~{v37}"() #0 + call void asm sideeffect "", "~{v0},~{v44},~{v45}"() #0 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index 9722972b4a2ea0..b4925a2e046ba6 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -189,44 +189,44 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; 
GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v35, s34, 4 +; GFX9-NEXT: v_writelane_b32 v43, s34, 4 ; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x800 -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s34 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s34 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v35, s36, 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s34 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s34 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v43, s36, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+4 -; GFX9-NEXT: v_writelane_b32 v35, s37, 1 +; GFX9-NEXT: v_writelane_b32 v43, s37, 1 ; GFX9-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v32, v1 -; GFX9-NEXT: v_mov_b32_e32 v33, v0 -; GFX9-NEXT: v_writelane_b32 v35, s30, 2 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, v33, v32 -; GFX9-NEXT: v_writelane_b32 v35, s31, 3 -; GFX9-NEXT: v_and_b32_e32 v34, 0xffffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v40, v1 +; GFX9-NEXT: v_mov_b32_e32 v41, v0 +; GFX9-NEXT: v_writelane_b32 v43, s30, 2 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 +; GFX9-NEXT: v_writelane_b32 v43, s31, 3 +; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_mad_u32_u24 v32, v33, v32, v34 -; GFX9-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42 +; GFX9-NEXT: v_mov_b32_e32 v0, v40 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_add_u32_e32 v0, v32, v34 +; GFX9-NEXT: v_add_u32_e32 v0, v40, v42 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_readlane_b32 s4, v35, 2 -; 
GFX9-NEXT: v_readlane_b32 s5, v35, 3 -; GFX9-NEXT: v_readlane_b32 s37, v35, 1 -; GFX9-NEXT: v_readlane_b32 s36, v35, 0 -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s34 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s34 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s4, v43, 2 +; GFX9-NEXT: v_readlane_b32 s5, v43, 3 +; GFX9-NEXT: v_readlane_b32 s37, v43, 1 +; GFX9-NEXT: v_readlane_b32 s36, v43, 0 +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s34 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s34 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_sub_u32 s32, s32, 0x800 -; GFX9-NEXT: v_readlane_b32 s34, v35, 4 +; GFX9-NEXT: v_readlane_b32 s34, v43, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll index fdbe3a25e64e87..562e40bc5c6d5e 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -12,23 +12,23 @@ declare void @external_void_func_i32(i32) #0 ; Spill CSR VGPR used for SGPR spilling ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-DAG: v_writelane_b32 v32, s34, 2 +; GCN-DAG: v_writelane_b32 v40, s34, 2 ; GCN-DAG: s_mov_b32 s34, s32 ; GCN-DAG: s_add_u32 s32, s32, 0x400 -; GCN-DAG: v_writelane_b32 v32, 
s30, 0 -; GCN-DAG: v_writelane_b32 v32, s31, 1 +; GCN-DAG: v_writelane_b32 v40, s30, 0 +; GCN-DAG: v_writelane_b32 v40, s31, 1 ; GCN: s_swappc_b64 -; GCN: v_readlane_b32 s4, v32, 0 -; GCN: v_readlane_b32 s5, v32, 1 +; GCN: v_readlane_b32 s4, v40, 0 +; GCN: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir index bec7969382d0ff..0020e17a0b6fe0 100644 --- a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir +++ b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir @@ -254,7 +254,7 @@ body: | ... # GCN-LABEL: csr{{$}} -# GCN: V_AND_B32_e32 $vgpr4, $vgpr0, +# GCN: V_AND_B32_e32 $vgpr37, $vgpr0, --- name: csr tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index f887a959cbd28c..bb03589ec2fb4c 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -152,9 +152,6 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l ; FIXME: Why load and store same location for stack args? 
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32: -; GCN-DAG: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill - ; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}} ; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4 @@ -163,9 +160,6 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l ; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}} ; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload - ; GCN-NOT: s32 ; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { @@ -176,7 +170,7 @@ entry: ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:40 +; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:28 ; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: @@ -203,15 +197,15 @@ entry: ; Have another non-tail in the function ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call: ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec ; GCN: s_mov_b32 s34, s32 ; GCN-DAG: s_add_u32 s32, s32, 0x400 -; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill -; GCN-DAG: v_writelane_b32 v34, s36, 0 -; GCN-DAG: v_writelane_b32 v34, s37, 1 +; 
GCN-DAG: buffer_store_dword v40, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v41, off, s[0:3], s34 ; 4-byte Folded Spill +; GCN-DAG: v_writelane_b32 v42, s36, 0 +; GCN-DAG: v_writelane_b32 v42, s37, 1 ; GCN-DAG: s_getpc_b64 s[4:5] ; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 @@ -220,11 +214,11 @@ entry: ; GCN: s_swappc_b64 -; GCN-DAG: v_readlane_b32 s36, v34, 0 -; GCN-DAG: v_readlane_b32 s37, v34, 1 +; GCN-DAG: v_readlane_b32 s36, v42, 0 +; GCN-DAG: v_readlane_b32 s37, v42, 1 -; GCN: buffer_load_dword v33, off, s[0:3], s34 ; 4-byte Folded Reload -; GCN: buffer_load_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload +; GCN: buffer_load_dword v41, off, s[0:3], s34 ; 4-byte Folded Reload +; GCN: buffer_load_dword v40, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 @@ -233,7 +227,7 @@ entry: ; GCN: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: v_readlane_b32 s34, ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { @@ -248,11 +242,11 @@ entry: ; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: ; GCN-NOT: s33 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset: +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset: ; GCN-NOT: s33 -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset: +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset: ; GCN: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll 
b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll index 69a4d7eac9ea6a..1581482bd020a2 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -2,17 +2,17 @@ ; GCN-LABEL: {{^}}spill_csr_s5_copy: ; GCN: s_or_saveexec_b64 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 v32, s34, 2 +; GCN: v_writelane_b32 v40, s34, 2 ; GCN: s_swappc_b64 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 ; GCN: buffer_store_dword [[K]], off, s[0:3], s34{{$}} -; GCN: v_readlane_b32 s34, v32, 2 +; GCN: v_readlane_b32 s34, v40, 2 ; GCN: s_or_saveexec_b64 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN: s_mov_b64 exec ; GCN: s_setpc_b64 define void @spill_csr_s5_copy() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll index be60a34b420891..2fd5a046fd80de 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -1,6 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GCN %s +; FIXME: The MUBUF loads in this test output are incorrect, their SOffset +; should use the frame offset register, not the ABI stack pointer register. We +; rely on the frame index argument of MUBUF stack accesses to survive until PEI +; so we can fix up the SOffset to use the correct frame register in +; eliminateFrameIndex. Some things like LocalStackSlotAllocation can lift the +; frame index up into something (e.g. 
`v_add_nc_u32`) that we cannot fold back +; into the MUBUF instruction, and so we end up emitting an incorrect offset. +; Fixing this may involve adding stack access pseudos so that we don't have to +; speculatively refer to the ABI stack pointer register at all. + ; An assert was hit when frame offset register was used to address FrameIndex. define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <4 x i32> addrspace(1)* %input, <4 x float> addrspace(1)* %output, i32 %i) { ; GCN-LABEL: kernel_background_evaluate: @@ -18,7 +28,7 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, < ; GCN-NEXT: s_mov_b64 s[2:3], s[38:39] ; GCN-NEXT: v_mov_b32_e32 v4, 0x400000 ; GCN-NEXT: s_add_u32 s32, s33, 0xc0000 -; GCN-NEXT: v_add_nc_u32_e64 v32, 4, 0x4000 +; GCN-NEXT: v_add_nc_u32_e64 v40, 4, 0x4000 ; GCN-NEXT: ; implicit-def: $vcc_hi ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 @@ -30,8 +40,8 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, < ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: s_cbranch_execz BB0_2 ; GCN-NEXT: ; %bb.1: ; %if.then4.i -; GCN-NEXT: buffer_load_dword v0, v32, s[36:39], s32 offen -; GCN-NEXT: buffer_load_dword v1, v32, s[36:39], s32 offen offset:4 +; GCN-NEXT: buffer_load_dword v0, v40, s[36:39], s32 offen +; GCN-NEXT: buffer_load_dword v1, v40, s[36:39], s32 offen offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll new file mode 100644 index 00000000000000..6aa5010b3d78f4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -0,0 +1,170 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GFX10 %s + +declare void @extern_func() + +define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) { +; The vgpr tuple8 operand in image_gather4_c_b_cl instruction need not be +; preserved across the call and should get 8 scratch registers. + +; GFX9-LABEL: non_preserved_vgpr_tuple8: +; GFX9: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9: buffer_store_dword v40, off, s[0:3], s34 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s34 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s34 ; 4-byte Folded Spill + +; GFX9: v_mov_b32_e32 v37, v11 +; GFX9-NEXT: v_mov_b32_e32 v38, v10 +; GFX9-NEXT: v_mov_b32_e32 v49, v9 +; GFX9-NEXT: v_writelane_b32 v44, s30, 0 +; GFX9-NEXT: v_mov_b32_e32 v36, v16 +; GFX9-NEXT: v_mov_b32_e32 v35, v15 +; GFX9-NEXT: v_mov_b32_e32 v34, v14 +; GFX9-NEXT: v_mov_b32_e32 v33, v13 +; GFX9-NEXT: v_mov_b32_e32 v32, v12 + +; GFX9: ;;#ASMSTART +; GFX9-NEXT: ;;#ASMEND + +; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[12:15] dmask:0x1 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] + +; GFX9: buffer_load_dword v43, off, s[0:3], s34 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s34 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s34 offset:12 ; 4-byte Folded Reload +; GFX9: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9: s_setpc_b64 s[4:5] +; +; 
GFX10-LABEL: non_preserved_vgpr_tuple8: +; GFX10: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX10: buffer_store_dword v40, off, s[0:3], s34 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s34 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s34 ; 4-byte Folded Spill + +; GFX10: v_mov_b32_e32 v36, v16 +; GFX10-NEXT: v_mov_b32_e32 v35, v15 +; GFX10-NEXT: v_mov_b32_e32 v34, v14 +; GFX10-NEXT: v_mov_b32_e32 v33, v13 +; GFX10-NEXT: v_mov_b32_e32 v32, v12 + +; GFX10: ;;#ASMSTART +; GFX10-NEXT: ;;#ASMEND + +; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[12:15] dmask:0x1 +; GFX10-NEXT: v_nop +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] + +; GFX10: buffer_load_dword v43, off, s[0:3], s34 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s34 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s34 offset:12 ; 4-byte Folded Reload + +; GFX10: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX10: s_setpc_b64 s[4:5] +main_body: + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 + call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0 + call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0 + call void asm sideeffect "", "~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() #0 + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, 
float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) + call void @extern_func() + ret <4 x float> %v +} + +define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) { +; The vgpr tuple8 operand in image_gather4_c_b_cl instruction needs to be preserved +; across the call and should get allocated to 8 CSRs. +; Only the lower 5 sub-registers of the tuple are preserved. +; The upper 3 sub-registers are unused. + +; GFX9-LABEL: call_preserved_vgpr_tuple8: +; GFX9: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9: buffer_store_dword v40, off, s[0:3], s34 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s34 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s34 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s34 ; 4-byte Folded Spill + +; GFX9: v_mov_b32_e32 v44, v16 +; GFX9-NEXT: v_mov_b32_e32 v43, v15 +; GFX9-NEXT: v_mov_b32_e32 v42, v14 +; GFX9-NEXT: v_mov_b32_e32 v41, v13 +; GFX9-NEXT: v_mov_b32_e32 v40, v12 + +; GFX9: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[44:47] dmask:0x1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[44:47] dmask:0x1 + +; GFX9: buffer_load_dword v44, off, s[0:3], s34 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v42, off, s[0:3], s34 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s34 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s34 offset:16 ; 4-byte Folded Reload + +; GFX9: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: call_preserved_vgpr_tuple8: +; GFX10: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX10: buffer_store_dword v40, off, s[0:3], s34 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s34 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s34 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s34 ; 4-byte Folded Spill + +; GFX10: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4 +; GFX10-NEXT: v_mov_b32_e32 v40, v16 +; GFX10-NEXT: v_mov_b32_e32 v41, v15 +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[44:47] dmask:0x1 +; GFX10-NEXT: v_mov_b32_e32 v42, v14 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v43, v13 +; GFX10-NEXT: v_mov_b32_e32 v44, v12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[44:47] dmask:0x1 + +; GFX10: buffer_load_dword v44, off, s[0:3], s34 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s34 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], 
s34 offset:12 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s34 offset:16 ; 4-byte Folded Reload +; GFX10: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX10: s_setpc_b64 s[4:5] +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) + store <4 x float> %v, <4 x float> addrspace(1)* undef + call void @extern_func() + %v1 = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) + ret <4 x float> %v1 +} + +declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1 + +attributes #0 = { nounwind writeonly } +attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir index a5d5e7c82d70b8..435c36bdedbcde 100644 --- a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir @@ -30,7 +30,7 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: undef_identity_copy - ; CHECK: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = FLAT_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1) + ; CHECK: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = FLAT_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1) ; CHECK: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead $scc ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def 
$sgpr32, implicit $sgpr32, implicit $sgpr95 ; CHECK: $sgpr4 = COPY $sgpr95 @@ -39,9 +39,9 @@ body: | ; CHECK: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @bar + 4, target-flags(amdgpu-rel32-hi) @bar + 4, implicit-def dead $scc ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95 ; CHECK: $sgpr4 = COPY $sgpr95 - ; CHECK: $vgpr0 = COPY renamable $vgpr32 - ; CHECK: $vgpr1 = COPY renamable $vgpr33 - ; CHECK: $vgpr2 = COPY renamable $vgpr34 + ; CHECK: $vgpr0 = COPY renamable $vgpr40 + ; CHECK: $vgpr1 = COPY renamable $vgpr41 + ; CHECK: $vgpr2 = COPY renamable $vgpr42 ; CHECK: $vgpr3 = KILL undef renamable $vgpr3 ; CHECK: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @bar, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0, implicit killed $vgpr1, implicit killed $vgpr2, implicit killed $vgpr3, implicit-def $vgpr0 ; CHECK: ADJCALLSTACKDOWN 0, 4, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index e3149be899c068..ef2cce1202f1e0 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1058,30 +1058,30 @@ declare void @external_void_func_void() #1 ; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: v_nop ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 +; GCN-NEXT: v_writelane_b32 v40, s34, 2 ; GCN: s_mov_b32 s34, s32 ; GFX1064: s_add_u32 s32, s32, 0x400 ; GFX1032: s_add_u32 s32, s32, 0x200 -; GCN-DAG: v_writelane_b32 v32, s30, 0 -; GCN-DAG: 
v_writelane_b32 v32, s31, 1 +; GCN-DAG: v_writelane_b32 v40, s30, 0 +; GCN-DAG: v_writelane_b32 v40, s31, 1 ; GCN: s_swappc_b64 -; GCN-DAG: v_readlane_b32 s4, v32, 0 -; GCN-DAG: v_readlane_b32 s5, v32, 1 +; GCN-DAG: v_readlane_b32 s4, v40, 0 +; GCN-DAG: v_readlane_b32 s5, v40, 1 ; GFX1064: s_sub_u32 s32, s32, 0x400 ; GFX1032: s_sub_u32 s32, s32, 0x200 -; GCN: v_readlane_b32 s34, v32, 2 +; GCN: v_readlane_b32 s34, v40, 2 ; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: v_nop ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]] diff --git a/llvm/test/CodeGen/PowerPC/sms-remark.ll b/llvm/test/CodeGen/PowerPC/sms-remark.ll new file mode 100644 index 00000000000000..647b56fa7fcd3f --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/sms-remark.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -verify-machineinstrs -ppc-asm-full-reg-names -mcpu=pwr9 --ppc-enable-pipeliner \ +; RUN: -pass-remarks-analysis=pipeliner -pass-remarks=pipeliner -o /dev/null 2>&1 \ +; RUN: | FileCheck %s + +@x = dso_local local_unnamed_addr global <{ i32, i32, i32, i32, [1020 x i32] }> <{ i32 1, i32 2, i32 3, i32 4, [1020 x i32] zeroinitializer }>, align 4 +@y = dso_local global [1024 x i32] zeroinitializer, align 4 + +define dso_local i32* @foo() local_unnamed_addr { +;CHECK: Schedule found with Initiation Interval +;CHECK: Pipelined succesfully! 
+entry: + %.pre = load i32, i32* getelementptr inbounds ([1024 x i32], [1024 x i32]* @y, i64 0, i64 0), align 4 + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i32* getelementptr inbounds ([1024 x i32], [1024 x i32]* @y, i64 0, i64 0) + +for.body: ; preds = %for.body, %entry + %0 = phi i32 [ %.pre, %entry ], [ %add.2, %for.body ] + %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next.2, %for.body ] + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %1, %1 + %add = add nsw i32 %mul, %0 + %arrayidx6 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx6, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx2.1 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv.next + %2 = load i32, i32* %arrayidx2.1, align 4 + %mul.1 = mul nsw i32 %2, %2 + %add.1 = add nsw i32 %mul.1, %add + %arrayidx6.1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv.next + store i32 %add.1, i32* %arrayidx6.1, align 4 + %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx2.2 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv.next.1 + %3 = load i32, i32* %arrayidx2.2, align 4 + %mul.2 = mul nsw i32 %3, %3 + %add.2 = add nsw i32 %mul.2, %add.1 + %arrayidx6.2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv.next.1 + store i32 %add.2, i32* %arrayidx6.2, align 4 + %indvars.iv.next.2 = add nuw nsw i64 %indvars.iv, 3 + %exitcond.2 = icmp eq i64 %indvars.iv.next.2, 1024 + br i1 %exitcond.2, label %for.cond.cleanup, label %for.body +} diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll index fb206b84fa936f..319d4775c5ebe2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll @@ -393,34 +393,50 @@ define void @PR39538(i8* %t0, i32* %t1) { ; CHECK-NEXT: [[T63:%.*]] = load i8, i8* [[T62]], align 1 ; CHECK-NEXT: [[T68:%.*]] = load i8, i8* [[T67]], align 1 ; CHECK-NEXT: [[T73:%.*]] = load i8, i8* [[T72]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> undef, i8 [[T3]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[T21]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[T40]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 [[T59]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i8> undef, i8 [[T7]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i8> [[TMP6]], i8 [[T25]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i8> [[TMP7]], i8 [[T44]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i8> [[TMP8]], i8 [[T63]], i32 3 -; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i8> undef, i8 [[T12]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i8> [[TMP11]], i8 [[T30]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i8> [[TMP12]], i8 [[T49]], i32 2 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i8> [[TMP13]], i8 [[T68]], i32 3 -; CHECK-NEXT: [[TMP15:%.*]] = zext <4 x i8> [[TMP14]] to <4 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i8> undef, i8 [[T17]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[T35]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[T54]], i32 2 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i8> 
[[TMP18]], i8 [[T73]], i32 3 -; CHECK-NEXT: [[TMP20:%.*]] = zext <4 x i8> [[TMP19]] to <4 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shl nuw <4 x i32> [[TMP5]], -; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw <4 x i32> [[TMP10]], -; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw <4 x i32> [[TMP15]], -; CHECK-NEXT: [[TMP24:%.*]] = or <4 x i32> [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[TMP25:%.*]] = or <4 x i32> [[TMP24]], [[TMP23]] -; CHECK-NEXT: [[TMP26:%.*]] = or <4 x i32> [[TMP25]], [[TMP20]] -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[T1]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP26]], <4 x i32>* [[TMP27]], align 4 +; CHECK-NEXT: [[T4:%.*]] = zext i8 [[T3]] to i32 +; CHECK-NEXT: [[T8:%.*]] = zext i8 [[T7]] to i32 +; CHECK-NEXT: [[T13:%.*]] = zext i8 [[T12]] to i32 +; CHECK-NEXT: [[T18:%.*]] = zext i8 [[T17]] to i32 +; CHECK-NEXT: [[T22:%.*]] = zext i8 [[T21]] to i32 +; CHECK-NEXT: [[T26:%.*]] = zext i8 [[T25]] to i32 +; CHECK-NEXT: [[T31:%.*]] = zext i8 [[T30]] to i32 +; CHECK-NEXT: [[T36:%.*]] = zext i8 [[T35]] to i32 +; CHECK-NEXT: [[T41:%.*]] = zext i8 [[T40]] to i32 +; CHECK-NEXT: [[T45:%.*]] = zext i8 [[T44]] to i32 +; CHECK-NEXT: [[T50:%.*]] = zext i8 [[T49]] to i32 +; CHECK-NEXT: [[T55:%.*]] = zext i8 [[T54]] to i32 +; CHECK-NEXT: [[T60:%.*]] = zext i8 [[T59]] to i32 +; CHECK-NEXT: [[T64:%.*]] = zext i8 [[T63]] to i32 +; CHECK-NEXT: [[T69:%.*]] = zext i8 [[T68]] to i32 +; CHECK-NEXT: [[T74:%.*]] = zext i8 [[T73]] to i32 +; CHECK-NEXT: [[T5:%.*]] = shl nuw i32 [[T4]], 24 +; CHECK-NEXT: [[T23:%.*]] = shl nuw i32 [[T22]], 24 +; CHECK-NEXT: [[T42:%.*]] = shl nuw i32 [[T41]], 24 +; CHECK-NEXT: [[T61:%.*]] = shl nuw i32 [[T60]], 24 +; CHECK-NEXT: [[T9:%.*]] = shl nuw nsw i32 [[T8]], 16 +; CHECK-NEXT: [[T27:%.*]] = shl nuw nsw i32 [[T26]], 16 +; CHECK-NEXT: [[T46:%.*]] = shl nuw nsw i32 [[T45]], 16 +; CHECK-NEXT: [[T65:%.*]] = shl nuw nsw i32 [[T64]], 16 +; CHECK-NEXT: [[T14:%.*]] = shl nuw nsw i32 [[T13]], 8 +; CHECK-NEXT: [[T32:%.*]] = shl nuw nsw i32 [[T31]], 8 +; 
CHECK-NEXT: [[T51:%.*]] = shl nuw nsw i32 [[T50]], 8 +; CHECK-NEXT: [[T70:%.*]] = shl nuw nsw i32 [[T69]], 8 +; CHECK-NEXT: [[T10:%.*]] = or i32 [[T9]], [[T5]] +; CHECK-NEXT: [[T15:%.*]] = or i32 [[T10]], [[T14]] +; CHECK-NEXT: [[T19:%.*]] = or i32 [[T15]], [[T18]] +; CHECK-NEXT: [[T28:%.*]] = or i32 [[T27]], [[T23]] +; CHECK-NEXT: [[T33:%.*]] = or i32 [[T28]], [[T32]] +; CHECK-NEXT: [[T37:%.*]] = or i32 [[T33]], [[T36]] +; CHECK-NEXT: [[T47:%.*]] = or i32 [[T46]], [[T42]] +; CHECK-NEXT: [[T52:%.*]] = or i32 [[T47]], [[T51]] +; CHECK-NEXT: [[T56:%.*]] = or i32 [[T52]], [[T55]] +; CHECK-NEXT: [[T66:%.*]] = or i32 [[T65]], [[T61]] +; CHECK-NEXT: [[T71:%.*]] = or i32 [[T66]], [[T70]] +; CHECK-NEXT: [[T75:%.*]] = or i32 [[T71]], [[T74]] +; CHECK-NEXT: store i32 [[T19]], i32* [[T1]], align 4 +; CHECK-NEXT: store i32 [[T37]], i32* [[T38]], align 4 +; CHECK-NEXT: store i32 [[T56]], i32* [[T57]], align 4 +; CHECK-NEXT: store i32 [[T75]], i32* [[T76]], align 4 ; CHECK-NEXT: ret void ; %t6 = getelementptr inbounds i8, i8* %t0, i64 1 diff --git a/llvm/unittests/Support/Path.cpp b/llvm/unittests/Support/Path.cpp index b2eddd52e68a7e..a577f1b744bc97 100644 --- a/llvm/unittests/Support/Path.cpp +++ b/llvm/unittests/Support/Path.cpp @@ -1182,9 +1182,10 @@ TEST(Support, NormalizePath) { Tests.emplace_back("a", "a", "a"); Tests.emplace_back("a/b", "a\\b", "a/b"); Tests.emplace_back("a\\b", "a\\b", "a/b"); - Tests.emplace_back("a\\\\b", "a\\\\b", "a\\\\b"); + Tests.emplace_back("a\\\\b", "a\\\\b", "a//b"); Tests.emplace_back("\\a", "\\a", "/a"); Tests.emplace_back("a\\", "a\\", "a/"); + Tests.emplace_back("a\\t", "a\\t", "a/t"); for (auto &T : Tests) { SmallString<64> Win(std::get<0>(T)); diff --git a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h index 51228d3e8437a8..573f9b7c988f17 100644 --- a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h +++ b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h @@ -286,6 +286,7 @@ class DmaWaitOp void 
print(OpAsmPrinter &p); LogicalResult fold(ArrayRef cstOperands, SmallVectorImpl &results); + LogicalResult verify(); }; /// Prints dimension and symbol list. diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h index 2f38a24236e3a5..7a07b6db23fce7 100644 --- a/mlir/include/mlir/Transforms/LoopUtils.h +++ b/mlir/include/mlir/Transforms/LoopUtils.h @@ -38,8 +38,9 @@ LogicalResult loopUnrollFull(AffineForOp forOp); /// Unrolls this for operation by the specified unroll factor. Returns failure /// if the loop cannot be unrolled either due to restrictions or due to invalid -/// unroll factors. +/// unroll factors. Requires positive loop bounds and step. LogicalResult loopUnrollByFactor(AffineForOp forOp, uint64_t unrollFactor); +LogicalResult loopUnrollByFactor(loop::ForOp forOp, uint64_t unrollFactor); /// Unrolls this loop by the specified unroll factor or its trip count, /// whichever is lower. @@ -68,9 +69,10 @@ LogicalResult loopUnrollJamByFactor(AffineForOp forOp, LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor); -/// Promotes the loop body of a AffineForOp to its containing block if the -/// AffineForOp was known to have a single iteration. +/// Promotes the loop body of a AffineForOp/loop::ForOp to its containing block +/// if the loop was known to have a single iteration. LogicalResult promoteIfSingleIteration(AffineForOp forOp); +LogicalResult promoteIfSingleIteration(loop::ForOp forOp); /// Promotes all single iteration AffineForOp's in the Function, i.e., moves /// their body into the containing Block. 
diff --git a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt b/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt index 98b52eb8155274..4696dd65fa62a2 100644 --- a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt @@ -9,10 +9,12 @@ set(SOURCES if (MLIR_CUDA_CONVERSIONS_ENABLED) list(APPEND SOURCES "ConvertKernelFuncToCubin.cpp") set(NVPTX_LIBS + MC NVPTXCodeGen NVPTXDesc NVPTXInfo ) + endif() add_mlir_conversion_library(MLIRGPUtoCUDATransforms @@ -24,7 +26,6 @@ add_mlir_conversion_library(MLIRGPUtoCUDATransforms LINK_COMPONENTS Core - MC ${NVPTX_LIBS} LINK_LIBS PUBLIC diff --git a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp index 8ef24e2391524e..972a37d20f97b1 100644 --- a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp +++ b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp @@ -1444,49 +1444,82 @@ ParseResult DmaStartOp::parse(OpAsmParser &parser, OperationState &result) { parser.resolveOperands(tagIndexInfos, indexType, result.operands)) return failure(); - auto memrefType0 = types[0].dyn_cast(); - if (!memrefType0) - return parser.emitError(parser.getNameLoc(), - "expected source to be of memref type"); - - auto memrefType1 = types[1].dyn_cast(); - if (!memrefType1) - return parser.emitError(parser.getNameLoc(), - "expected destination to be of memref type"); - - auto memrefType2 = types[2].dyn_cast(); - if (!memrefType2) - return parser.emitError(parser.getNameLoc(), - "expected tag to be of memref type"); - if (isStrided) { if (parser.resolveOperands(strideInfo, indexType, result.operands)) return failure(); } - // Check that source/destination index list size matches associated rank. 
- if (static_cast(srcIndexInfos.size()) != memrefType0.getRank() || - static_cast(dstIndexInfos.size()) != memrefType1.getRank()) - return parser.emitError(parser.getNameLoc(), - "memref rank not equal to indices count"); - if (static_cast(tagIndexInfos.size()) != memrefType2.getRank()) - return parser.emitError(parser.getNameLoc(), - "tag memref rank not equal to indices count"); return success(); } LogicalResult DmaStartOp::verify() { + unsigned numOperands = getNumOperands(); + + // Mandatory non-variadic operands are: src memref, dst memref, tag memref and + // the number of elements. + if (numOperands < 4) + return emitOpError("expected at least 4 operands"); + + // Check types of operands. The order of these calls is important: the later + // calls rely on some type properties to compute the operand position. + // 1. Source memref. + if (!getSrcMemRef().getType().isa()) + return emitOpError("expected source to be of memref type"); + if (numOperands < getSrcMemRefRank() + 4) + return emitOpError() << "expected at least " << getSrcMemRefRank() + 4 + << " operands"; + if (!getSrcIndices().empty() && + !llvm::all_of(getSrcIndices().getTypes(), + [](Type t) { return t.isIndex(); })) + return emitOpError("expected source indices to be of index type"); + + // 2. Destination memref. + if (!getDstMemRef().getType().isa()) + return emitOpError("expected destination to be of memref type"); + unsigned numExpectedOperands = getSrcMemRefRank() + getDstMemRefRank() + 4; + if (numOperands < numExpectedOperands) + return emitOpError() << "expected at least " << numExpectedOperands + << " operands"; + if (!getDstIndices().empty() && + !llvm::all_of(getDstIndices().getTypes(), + [](Type t) { return t.isIndex(); })) + return emitOpError("expected destination indices to be of index type"); + + // 3. Number of elements. + if (!getNumElements().getType().isIndex()) + return emitOpError("expected num elements to be of index type"); + + // 4. Tag memref. 
+ if (!getTagMemRef().getType().isa()) + return emitOpError("expected tag to be of memref type"); + numExpectedOperands += getTagMemRefRank(); + if (numOperands < numExpectedOperands) + return emitOpError() << "expected at least " << numExpectedOperands + << " operands"; + if (!getTagIndices().empty() && + !llvm::all_of(getTagIndices().getTypes(), + [](Type t) { return t.isIndex(); })) + return emitOpError("expected tag indices to be of index type"); + // DMAs from different memory spaces supported. if (getSrcMemorySpace() == getDstMemorySpace()) return emitOpError("DMA should be between different memory spaces"); - if (getNumOperands() != getTagMemRefRank() + getSrcMemRefRank() + - getDstMemRefRank() + 3 + 1 && - getNumOperands() != getTagMemRefRank() + getSrcMemRefRank() + - getDstMemRefRank() + 3 + 1 + 2) { + // Optional stride-related operands must be either both present or both + // absent. + if (numOperands != numExpectedOperands && + numOperands != numExpectedOperands + 2) return emitOpError("incorrect number of operands"); + + // 5. Strides. 
+ if (isStrided()) { + if (!getStride().getType().isIndex() || + !getNumElementsPerStride().getType().isIndex()) + return emitOpError( + "expected stride and num elements per stride to be of type index"); } + return success(); } @@ -1536,15 +1569,6 @@ ParseResult DmaWaitOp::parse(OpAsmParser &parser, OperationState &result) { parser.resolveOperand(numElementsInfo, indexType, result.operands)) return failure(); - auto memrefType = type.dyn_cast(); - if (!memrefType) - return parser.emitError(parser.getNameLoc(), - "expected tag to be of memref type"); - - if (static_cast(tagIndexInfos.size()) != memrefType.getRank()) - return parser.emitError(parser.getNameLoc(), - "tag memref rank not equal to indices count"); - return success(); } @@ -1554,6 +1578,32 @@ LogicalResult DmaWaitOp::fold(ArrayRef cstOperands, return foldMemRefCast(*this); } +LogicalResult DmaWaitOp::verify() { + // Mandatory non-variadic operands are tag and the number of elements. + if (getNumOperands() < 2) + return emitOpError() << "expected at least 2 operands"; + + // Check types of operands. The order of these calls is important: the later + // calls rely on some type properties to compute the operand position. 
+ if (!getTagMemRef().getType().isa()) + return emitOpError() << "expected tag to be of memref type"; + + if (getNumOperands() != 2 + getTagMemRefRank()) + return emitOpError() << "expected " << 2 + getTagMemRefRank() + << " operands"; + + if (!getTagIndices().empty() && + !llvm::all_of(getTagIndices().getTypes(), + [](Type t) { return t.isIndex(); })) + return emitOpError() << "expected tag indices to be of index type"; + + if (!getNumElements().getType().isIndex()) + return emitOpError() + << "expected the number of elements to be of index type"; + + return success(); +} + //===----------------------------------------------------------------------===// // ExtractElementOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp index 4b0cd6c8eb1da1..35581eb2a39250 100644 --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -24,6 +24,7 @@ #include "mlir/IR/Function.h" #include "mlir/IR/IntegerSet.h" #include "mlir/IR/PatternMatch.h" +#include "mlir/Support/MathExtras.h" #include "mlir/Transforms/RegionUtils.h" #include "mlir/Transforms/Utils.h" #include "llvm/ADT/DenseMap.h" @@ -118,6 +119,34 @@ static void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor, lb.erase(); } +// Build the IR that performs ceil division of a positive value by a constant: +// ceildiv(a, B) = divis(a + (B-1), B) +// where divis is rounding-to-zero division. 
+static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, + int64_t divisor) { + assert(divisor > 0 && "expected positive divisor"); + assert(dividend.getType().isIndex() && "expected index-typed value"); + + Value divisorMinusOneCst = builder.create(loc, divisor - 1); + Value divisorCst = builder.create(loc, divisor); + Value sum = builder.create(loc, dividend, divisorMinusOneCst); + return builder.create(loc, sum, divisorCst); +} + +// Build the IR that performs ceil division of a positive value by another +// positive value: +// ceildiv(a, b) = divis(a + (b - 1), b) +// where divis is rounding-to-zero division. +static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, + Value divisor) { + assert(dividend.getType().isIndex() && "expected index-typed value"); + + Value cstOne = builder.create(loc, 1); + Value divisorMinusOne = builder.create(loc, divisor, cstOne); + Value sum = builder.create(loc, dividend, divisorMinusOne); + return builder.create(loc, sum, divisor); +} + /// Promotes the loop body of a forOp to its containing block if the forOp /// was known to have a single iteration. // TODO(bondhugula): extend this for arbitrary affine bounds. @@ -161,6 +190,35 @@ LogicalResult mlir::promoteIfSingleIteration(AffineForOp forOp) { return success(); } +/// Promotes the loop body of a forOp to its containing block if +/// it can be determined that the loop has a single iteration.
+LogicalResult mlir::promoteIfSingleIteration(loop::ForOp forOp) { + auto lbCstOp = + dyn_cast_or_null(forOp.lowerBound().getDefiningOp()); + auto ubCstOp = + dyn_cast_or_null(forOp.upperBound().getDefiningOp()); + auto stepCstOp = + dyn_cast_or_null(forOp.step().getDefiningOp()); + if (!lbCstOp || !ubCstOp || !stepCstOp || lbCstOp.getValue() < 0 || + ubCstOp.getValue() < 0 || stepCstOp.getValue() < 0) + return failure(); + int64_t tripCount = mlir::ceilDiv(ubCstOp.getValue() - lbCstOp.getValue(), + stepCstOp.getValue()); + if (tripCount != 1) + return failure(); + auto iv = forOp.getInductionVar(); + iv.replaceAllUsesWith(lbCstOp); + + // Move the loop body operations, except for its terminator, to the loop's + // containing block. + auto *parentBlock = forOp.getOperation()->getBlock(); + forOp.getBody()->back().erase(); + parentBlock->getOperations().splice(Block::iterator(forOp), + forOp.getBody()->getOperations()); + forOp.erase(); + return success(); +} + /// Promotes all single iteration 'for' ops in `f`, i.e., moves /// their body into the containing Block. void mlir::promoteSingleIterationLoops(FuncOp f) { @@ -416,6 +474,37 @@ LogicalResult mlir::loopUnrollUpToFactor(AffineForOp forOp, return loopUnrollByFactor(forOp, unrollFactor); } +// Generates unrolled copies of AffineForOp or loop::ForOp 'loopBodyBlock', with +// associated 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap +// 'forOpIV' for each unrolled body. +static void generateUnrolledLoop( + Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, + function_ref ivRemapFn) { + // Builder to insert unrolled bodies just before the terminator of the body of + // 'forOp'. + auto builder = OpBuilder::atBlockTerminator(loopBodyBlock); + + // Keep a pointer to the last non-terminator operation in the original block + // so that we know what to clone (since we are doing this in-place). 
+ Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2); + + // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies). + for (unsigned i = 1; i < unrollFactor; i++) { + BlockAndValueMapping operandMap; + + // If the induction variable is used, create a remapping to the value for + // this unrolled instance. + if (!forOpIV.use_empty()) { + Value ivUnroll = ivRemapFn(i, forOpIV, builder); + operandMap.map(forOpIV, ivUnroll); + } + + // Clone the original body of 'forOp'. + for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) + builder.clone(*it, operandMap); + } +} + /// Unrolls this loop by the specified factor. Returns success if the loop /// is successfully unrolled. LogicalResult mlir::loopUnrollByFactor(AffineForOp forOp, @@ -467,38 +556,114 @@ LogicalResult mlir::loopUnrollByFactor(AffineForOp forOp, // Scale the step of loop being unrolled by unroll factor. int64_t step = forOp.getStep(); forOp.setStep(step * unrollFactor); + generateUnrolledLoop(forOp.getBody(), forOp.getInductionVar(), unrollFactor, + [&](unsigned i, Value iv, OpBuilder b) { + // iv' = iv + i * step + auto d0 = b.getAffineDimExpr(0); + auto bumpMap = AffineMap::get(1, 0, d0 + i * step); + return b.create(forOp.getLoc(), bumpMap, + iv); + }); - // Builder to insert unrolled bodies just before the terminator of the body of - // 'forOp'. - auto builder = OpBuilder::atBlockTerminator(forOp.getBody()); + // Promote the loop body up if this has turned into a single iteration loop. + promoteIfSingleIteration(forOp); + return success(); +} - // Keep a pointer to the last non-terminator operation in the original block - // so that we know what to clone (since we are doing this in-place). - Block::iterator srcBlockEnd = std::prev(forOp.getBody()->end(), 2); +/// Unrolls 'forOp' by 'unrollFactor', returns success if the loop is unrolled. 
+LogicalResult mlir::loopUnrollByFactor(loop::ForOp forOp, + uint64_t unrollFactor) { + assert(unrollFactor > 0 && "expected positive unroll factor"); + if (unrollFactor == 1) + return promoteIfSingleIteration(forOp); - // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies). - auto forOpIV = forOp.getInductionVar(); - for (unsigned i = 1; i < unrollFactor; i++) { - BlockAndValueMapping operandMap; + // Return if the loop body is empty. + if (llvm::hasSingleElement(forOp.getBody()->getOperations())) + return success(); - // If the induction variable is used, create a remapping to the value for - // this unrolled instance. - if (!forOpIV.use_empty()) { - // iv' = iv + 1/2/3...unrollFactor-1; - auto d0 = builder.getAffineDimExpr(0); - auto bumpMap = AffineMap::get(1, 0, d0 + i * step); - auto ivUnroll = - builder.create(forOp.getLoc(), bumpMap, forOpIV); - operandMap.map(forOpIV, ivUnroll); - } + // Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate + // 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases. + OpBuilder boundsBuilder(forOp); + auto loc = forOp.getLoc(); + auto step = forOp.step(); + Value upperBoundUnrolled; + Value stepUnrolled; + bool generateEpilogueLoop = true; + + auto lbCstOp = + dyn_cast_or_null(forOp.lowerBound().getDefiningOp()); + auto ubCstOp = + dyn_cast_or_null(forOp.upperBound().getDefiningOp()); + auto stepCstOp = + dyn_cast_or_null(forOp.step().getDefiningOp()); + if (lbCstOp && ubCstOp && stepCstOp) { + // Constant loop bounds computation. 
+ int64_t lbCst = lbCstOp.getValue(); + int64_t ubCst = ubCstOp.getValue(); + int64_t stepCst = stepCstOp.getValue(); + assert(lbCst >= 0 && ubCst >= 0 && stepCst >= 0 && + "expected positive loop bounds and step"); + int64_t tripCount = mlir::ceilDiv(ubCst - lbCst, stepCst); + int64_t tripCountEvenMultiple = tripCount - (tripCount % unrollFactor); + int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst; + assert(upperBoundUnrolledCst <= ubCst); + int64_t stepUnrolledCst = stepCst * unrollFactor; + + // Create constant for 'upperBoundUnrolled' and set epilogue loop flag. + generateEpilogueLoop = upperBoundUnrolledCst < ubCst; + if (generateEpilogueLoop) + upperBoundUnrolled = + boundsBuilder.create(loc, upperBoundUnrolledCst); + else + upperBoundUnrolled = ubCstOp; + + // Create constant for 'stepUnrolled'. + stepUnrolled = + stepCst == stepUnrolledCst + ? step + : boundsBuilder.create(loc, stepUnrolledCst); + } else { + // Dynamic loop bounds computation. + // TODO(andydavis) Add dynamic asserts for negative lb/ub/step, or + // consider using ceilDiv from AffineApplyExpander. + auto lowerBound = forOp.lowerBound(); + auto upperBound = forOp.upperBound(); + Value diff = boundsBuilder.create(loc, upperBound, lowerBound); + Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step); + Value unrollFactorCst = + boundsBuilder.create(loc, unrollFactor); + Value tripCountRem = + boundsBuilder.create(loc, tripCount, unrollFactorCst); + // Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor) + Value tripCountEvenMultiple = + boundsBuilder.create(loc, tripCount, tripCountRem); + // Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step + upperBoundUnrolled = boundsBuilder.create( + loc, lowerBound, + boundsBuilder.create(loc, tripCountEvenMultiple, step)); + // Scale 'step' by 'unrollFactor'. + stepUnrolled = boundsBuilder.create(loc, step, unrollFactorCst); + } - // Clone the original body of 'forOp'. 
- for (auto it = forOp.getBody()->begin(); it != std::next(srcBlockEnd); - it++) { - builder.clone(*it, operandMap); - } + // Create epilogue clean up loop starting at 'upperBoundUnrolled'. + if (generateEpilogueLoop) { + OpBuilder epilogueBuilder(forOp.getOperation()->getBlock(), + std::next(Block::iterator(forOp))); + auto epilogueForOp = cast(epilogueBuilder.clone(*forOp)); + epilogueForOp.setLowerBound(upperBoundUnrolled); + promoteIfSingleIteration(epilogueForOp); } + // Create unrolled loop. + forOp.setUpperBound(upperBoundUnrolled); + forOp.setStep(stepUnrolled); + generateUnrolledLoop(forOp.getBody(), forOp.getInductionVar(), unrollFactor, + [&](unsigned i, Value iv, OpBuilder b) { + // iv' = iv + step * i; + auto stride = b.create( + loc, step, b.create(loc, i)); + return b.create(loc, iv, stride); + }); // Promote the loop body up if this has turned into a single iteration loop. promoteIfSingleIteration(forOp); return success(); @@ -1032,34 +1197,6 @@ Loops mlir::tilePerfectlyNested(loop::ForOp rootForOp, ArrayRef sizes) { return ::tile(forOps, sizes, forOps.back()); } -// Build the IR that performs ceil division of a positive value by a constant: -// ceildiv(a, B) = divis(a + (B-1), B) -// where divis is rounding-to-zero division. -static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, - int64_t divisor) { - assert(divisor > 0 && "expected positive divisor"); - assert(dividend.getType().isIndex() && "expected index-typed value"); - - Value divisorMinusOneCst = builder.create(loc, divisor - 1); - Value divisorCst = builder.create(loc, divisor); - Value sum = builder.create(loc, dividend, divisorMinusOneCst); - return builder.create(loc, sum, divisorCst); -} - -// Build the IR that performs ceil division of a positive value by another -// positive value: -// ceildiv(a, b) = divis(a + (b - 1), b) -// where divis is rounding-to-zero division. 
-static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, - Value divisor) { - assert(dividend.getType().isIndex() && "expected index-typed value"); - - Value cstOne = builder.create(loc, 1); - Value divisorMinusOne = builder.create(loc, divisor, cstOne); - Value sum = builder.create(loc, dividend, divisorMinusOne); - return builder.create(loc, sum, divisor); -} - // Hoist the ops within `outer` that appear before `inner`. // Such ops include the ops that have been introduced by parametric tiling. // Ops that come from triangular loops (i.e. that belong to the program slice diff --git a/mlir/test/Dialect/Loops/loop-unroll.mlir b/mlir/test/Dialect/Loops/loop-unroll.mlir new file mode 100644 index 00000000000000..fa3ebc173e510c --- /dev/null +++ b/mlir/test/Dialect/Loops/loop-unroll.mlir @@ -0,0 +1,250 @@ +// RUN: mlir-opt %s -test-loop-unrolling='unroll-factor=2' | FileCheck %s --check-prefix UNROLL-BY-2 +// RUN: mlir-opt %s -test-loop-unrolling='unroll-factor=3' | FileCheck %s --check-prefix UNROLL-BY-3 +// RUN: mlir-opt %s -test-loop-unrolling='unroll-factor=2 loop-depth=0' | FileCheck %s --check-prefix UNROLL-OUTER-BY-2 +// RUN: mlir-opt %s -test-loop-unrolling='unroll-factor=2 loop-depth=1' | FileCheck %s --check-prefix UNROLL-INNER-BY-2 + +func @dynamic_loop_unroll(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3: memref) { + %0 = constant 7.0 : f32 + loop.for %i0 = %arg0 to %arg1 step %arg2 { + store %0, %arg3[%i0] : memref + } + return +} +// UNROLL-BY-2-LABEL: func @dynamic_loop_unroll +// UNROLL-BY-2-SAME: %[[LB:.*0]]: index, +// UNROLL-BY-2-SAME: %[[UB:.*1]]: index, +// UNROLL-BY-2-SAME: %[[STEP:.*2]]: index, +// UNROLL-BY-2-SAME: %[[MEM:.*3]]: memref +// +// UNROLL-BY-2-DAG: %[[V0:.*]] = subi %[[UB]], %[[LB]] : index +// UNROLL-BY-2-DAG: %[[C1:.*]] = constant 1 : index +// UNROLL-BY-2-DAG: %[[V1:.*]] = subi %[[STEP]], %[[C1]] : index +// UNROLL-BY-2-DAG: %[[V2:.*]] = addi %[[V0]], %[[V1]] : index +// Compute trip count in V3. 
+// UNROLL-BY-2-DAG: %[[V3:.*]] = divi_signed %[[V2]], %[[STEP]] : index +// Store unroll factor in C2. +// UNROLL-BY-2-DAG: %[[C2:.*]] = constant 2 : index +// UNROLL-BY-2-DAG: %[[V4:.*]] = remi_signed %[[V3]], %[[C2]] : index +// UNROLL-BY-2-DAG: %[[V5:.*]] = subi %[[V3]], %[[V4]] : index +// UNROLL-BY-2-DAG: %[[V6:.*]] = muli %[[V5]], %[[STEP]] : index +// Compute upper bound of unrolled loop in V7. +// UNROLL-BY-2-DAG: %[[V7:.*]] = addi %[[LB]], %[[V6]] : index +// Compute step of unrolled loop in V8. +// UNROLL-BY-2-DAG: %[[V8:.*]] = muli %[[STEP]], %[[C2]] : index +// UNROLL-BY-2: loop.for %[[IV:.*]] = %[[LB]] to %[[V7]] step %[[V8]] { +// UNROLL-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[IV]]] : memref +// UNROLL-BY-2-NEXT: %[[C1_IV:.*]] = constant 1 : index +// UNROLL-BY-2-NEXT: %[[V9:.*]] = muli %[[STEP]], %[[C1_IV]] : index +// UNROLL-BY-2-NEXT: %[[V10:.*]] = addi %[[IV]], %[[V9]] : index +// UNROLL-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[V10]]] : memref +// UNROLL-BY-2-NEXT: } +// UNROLL-BY-2-NEXT: loop.for %[[IV:.*]] = %[[V7]] to %[[UB]] step %[[STEP]] { +// UNROLL-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[IV]]] : memref +// UNROLL-BY-2-NEXT: } +// UNROLL-BY-2-NEXT: return + +// UNROLL-BY-3-LABEL: func @dynamic_loop_unroll +// UNROLL-BY-3-SAME: %[[LB:.*0]]: index, +// UNROLL-BY-3-SAME: %[[UB:.*1]]: index, +// UNROLL-BY-3-SAME: %[[STEP:.*2]]: index, +// UNROLL-BY-3-SAME: %[[MEM:.*3]]: memref +// +// UNROLL-BY-3-DAG: %[[V0:.*]] = subi %[[UB]], %[[LB]] : index +// UNROLL-BY-3-DAG: %[[C1:.*]] = constant 1 : index +// UNROLL-BY-3-DAG: %[[V1:.*]] = subi %[[STEP]], %[[C1]] : index +// UNROLL-BY-3-DAG: %[[V2:.*]] = addi %[[V0]], %[[V1]] : index +// Compute trip count in V3. +// UNROLL-BY-3-DAG: %[[V3:.*]] = divi_signed %[[V2]], %[[STEP]] : index +// Store unroll factor in C3. 
+// UNROLL-BY-3-DAG: %[[C3:.*]] = constant 3 : index +// UNROLL-BY-3-DAG: %[[V4:.*]] = remi_signed %[[V3]], %[[C3]] : index +// UNROLL-BY-3-DAG: %[[V5:.*]] = subi %[[V3]], %[[V4]] : index +// UNROLL-BY-3-DAG: %[[V6:.*]] = muli %[[V5]], %[[STEP]] : index +// Compute upper bound of unrolled loop in V7. +// UNROLL-BY-3-DAG: %[[V7:.*]] = addi %[[LB]], %[[V6]] : index +// Compute step of unrolled loop in V8. +// UNROLL-BY-3-DAG: %[[V8:.*]] = muli %[[STEP]], %[[C3]] : index +// UNROLL-BY-3: loop.for %[[IV:.*]] = %[[LB]] to %[[V7]] step %[[V8]] { +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[IV]]] : memref +// UNROLL-BY-3-NEXT: %[[C1_IV:.*]] = constant 1 : index +// UNROLL-BY-3-NEXT: %[[V9:.*]] = muli %[[STEP]], %[[C1_IV]] : index +// UNROLL-BY-3-NEXT: %[[V10:.*]] = addi %[[IV]], %[[V9]] : index +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[V10]]] : memref +// UNROLL-BY-3-NEXT: %[[C2_IV:.*]] = constant 2 : index +// UNROLL-BY-3-NEXT: %[[V11:.*]] = muli %[[STEP]], %[[C2_IV]] : index +// UNROLL-BY-3-NEXT: %[[V12:.*]] = addi %[[IV]], %[[V11]] : index +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[V12]]] : memref +// UNROLL-BY-3-NEXT: } +// UNROLL-BY-3-NEXT: loop.for %[[IV:.*]] = %[[V7]] to %[[UB]] step %[[STEP]] { +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[IV]]] : memref +// UNROLL-BY-3-NEXT: } +// UNROLL-BY-3-NEXT: return + +func @dynamic_loop_unroll_outer_by_2( + %arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index, + %arg5 : index, %arg6: memref) { + %0 = constant 7.0 : f32 + loop.for %i0 = %arg0 to %arg1 step %arg2 { + loop.for %i1 = %arg3 to %arg4 step %arg5 { + store %0, %arg6[%i1] : memref + } + } + return +} +// UNROLL-OUTER-BY-2-LABEL: func @dynamic_loop_unroll_outer_by_2 +// UNROLL-OUTER-BY-2-SAME: %[[LB0:.*0]]: index, +// UNROLL-OUTER-BY-2-SAME: %[[UB0:.*1]]: index, +// UNROLL-OUTER-BY-2-SAME: %[[STEP0:.*2]]: index, +// UNROLL-OUTER-BY-2-SAME: %[[LB1:.*3]]: index, +// UNROLL-OUTER-BY-2-SAME: %[[UB1:.*4]]: index, +// 
UNROLL-OUTER-BY-2-SAME: %[[STEP1:.*5]]: index, +// UNROLL-OUTER-BY-2-SAME: %[[MEM:.*6]]: memref +// +// UNROLL-OUTER-BY-2: loop.for %[[IV0:.*]] = %[[LB0]] to %{{.*}} step %{{.*}} { +// UNROLL-OUTER-BY-2-NEXT: loop.for %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] { +// UNROLL-OUTER-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[IV1]]] : memref +// UNROLL-OUTER-BY-2-NEXT: } +// UNROLL-OUTER-BY-2-NEXT: loop.for %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] { +// UNROLL-OUTER-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[IV1]]] : memref +// UNROLL-OUTER-BY-2-NEXT: } +// UNROLL-OUTER-BY-2-NEXT: } +// UNROLL-OUTER-BY-2-NEXT: loop.for %[[IV0:.*]] = %{{.*}} to %[[UB0]] step %[[STEP0]] { +// UNROLL-OUTER-BY-2-NEXT: loop.for %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] { +// UNROLL-OUTER-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[IV1]]] : memref +// UNROLL-OUTER-BY-2-NEXT: } +// UNROLL-OUTER-BY-2-NEXT: } +// UNROLL-OUTER-BY-2-NEXT: return + +func @dynamic_loop_unroll_inner_by_2( + %arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index, + %arg5 : index, %arg6: memref) { + %0 = constant 7.0 : f32 + loop.for %i0 = %arg0 to %arg1 step %arg2 { + loop.for %i1 = %arg3 to %arg4 step %arg5 { + store %0, %arg6[%i1] : memref + } + } + return +} +// UNROLL-INNER-BY-2-LABEL: func @dynamic_loop_unroll_inner_by_2 +// UNROLL-INNER-BY-2-SAME: %[[LB0:.*0]]: index, +// UNROLL-INNER-BY-2-SAME: %[[UB0:.*1]]: index, +// UNROLL-INNER-BY-2-SAME: %[[STEP0:.*2]]: index, +// UNROLL-INNER-BY-2-SAME: %[[LB1:.*3]]: index, +// UNROLL-INNER-BY-2-SAME: %[[UB1:.*4]]: index, +// UNROLL-INNER-BY-2-SAME: %[[STEP1:.*5]]: index, +// UNROLL-INNER-BY-2-SAME: %[[MEM:.*6]]: memref +// +// UNROLL-INNER-BY-2: loop.for %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] { +// UNROLL-INNER-BY-2: loop.for %[[IV1:.*]] = %[[LB1]] to %{{.*}} step %{{.*}} { +// UNROLL-INNER-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[IV1]]] : memref +// UNROLL-INNER-BY-2-NEXT: %[[C1_IV:.*]] = constant 1 : index +// 
UNROLL-INNER-BY-2-NEXT: %[[V0:.*]] = muli %[[STEP1]], %[[C1_IV]] : index +// UNROLL-INNER-BY-2-NEXT: %[[V1:.*]] = addi %[[IV1]], %[[V0]] : index +// UNROLL-INNER-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[V1]]] : memref +// UNROLL-INNER-BY-2-NEXT: } +// UNROLL-INNER-BY-2-NEXT: loop.for %[[IV1:.*]] = %{{.*}} to %[[UB1]] step %[[STEP1]] { +// UNROLL-INNER-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[IV1]]] : memref +// UNROLL-INNER-BY-2-NEXT: } +// UNROLL-INNER-BY-2-NEXT: } +// UNROLL-INNER-BY-2-NEXT: return + +// Test that no epilogue clean-up loop is generated because the trip count is +// a multiple of the unroll factor. +func @static_loop_unroll_by_2(%arg0 : memref) { + %0 = constant 7.0 : f32 + %lb = constant 0 : index + %ub = constant 20 : index + %step = constant 1 : index + loop.for %i0 = %lb to %ub step %step { + store %0, %arg0[%i0] : memref + } + return +} +// UNROLL-BY-2-LABEL: func @static_loop_unroll_by_2 +// UNROLL-BY-2-SAME: %[[MEM:.*0]]: memref +// +// UNROLL-BY-2-DAG: %[[C0:.*]] = constant 0 : index +// UNROLL-BY-2-DAG: %[[C1:.*]] = constant 1 : index +// UNROLL-BY-2-DAG: %[[C20:.*]] = constant 20 : index +// UNROLL-BY-2-DAG: %[[C2:.*]] = constant 2 : index +// UNROLL-BY-2: loop.for %[[IV:.*]] = %[[C0]] to %[[C20]] step %[[C2]] { +// UNROLL-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[IV]]] : memref +// UNROLL-BY-2-NEXT: %[[C1_IV:.*]] = constant 1 : index +// UNROLL-BY-2-NEXT: %[[V0:.*]] = muli %[[C1]], %[[C1_IV]] : index +// UNROLL-BY-2-NEXT: %[[V1:.*]] = addi %[[IV]], %[[V0]] : index +// UNROLL-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[V1]]] : memref +// UNROLL-BY-2-NEXT: } +// UNROLL-BY-2-NEXT: return + +// Test that epilogue clean up loop is generated (trip count is not +// a multiple of unroll factor). 
+func @static_loop_unroll_by_3(%arg0 : memref) { + %0 = constant 7.0 : f32 + %lb = constant 0 : index + %ub = constant 20 : index + %step = constant 1 : index + loop.for %i0 = %lb to %ub step %step { + store %0, %arg0[%i0] : memref + } + return +} + +// UNROLL-BY-3-LABEL: func @static_loop_unroll_by_3 +// UNROLL-BY-3-SAME: %[[MEM:.*0]]: memref +// +// UNROLL-BY-3-DAG: %[[C0:.*]] = constant 0 : index +// UNROLL-BY-3-DAG: %[[C1:.*]] = constant 1 : index +// UNROLL-BY-3-DAG: %[[C20:.*]] = constant 20 : index +// UNROLL-BY-3-DAG: %[[C18:.*]] = constant 18 : index +// UNROLL-BY-3-DAG: %[[C3:.*]] = constant 3 : index +// UNROLL-BY-3: loop.for %[[IV:.*]] = %[[C0]] to %[[C18]] step %[[C3]] { +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[IV]]] : memref +// UNROLL-BY-3-NEXT: %[[C1_IV:.*]] = constant 1 : index +// UNROLL-BY-3-NEXT: %[[V0:.*]] = muli %[[C1]], %[[C1_IV]] : index +// UNROLL-BY-3-NEXT: %[[V1:.*]] = addi %[[IV]], %[[V0]] : index +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[V1]]] : memref +// UNROLL-BY-3-NEXT: %[[C2_IV:.*]] = constant 2 : index +// UNROLL-BY-3-NEXT: %[[V2:.*]] = muli %[[C1]], %[[C2_IV]] : index +// UNROLL-BY-3-NEXT: %[[V3:.*]] = addi %[[IV]], %[[V2]] : index +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[V3]]] : memref +// UNROLL-BY-3-NEXT: } +// UNROLL-BY-3-NEXT: loop.for %[[IV:.*]] = %[[C18]] to %[[C20]] step %[[C1]] { +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[IV]]] : memref +// UNROLL-BY-3-NEXT: } +// UNROLL-BY-3-NEXT: return + +// Test that the single iteration epilogue loop body is promoted to the loop's +// containing block.
+func @static_loop_unroll_by_3_promote_epilogue(%arg0 : memref) { + %0 = constant 7.0 : f32 + %lb = constant 0 : index + %ub = constant 10 : index + %step = constant 1 : index + loop.for %i0 = %lb to %ub step %step { + store %0, %arg0[%i0] : memref + } + return +} +// UNROLL-BY-3-LABEL: func @static_loop_unroll_by_3_promote_epilogue +// UNROLL-BY-3-SAME: %[[MEM:.*0]]: memref +// +// UNROLL-BY-3-DAG: %[[C0:.*]] = constant 0 : index +// UNROLL-BY-3-DAG: %[[C1:.*]] = constant 1 : index +// UNROLL-BY-3-DAG: %[[C10:.*]] = constant 10 : index +// UNROLL-BY-3-DAG: %[[C9:.*]] = constant 9 : index +// UNROLL-BY-3-DAG: %[[C3:.*]] = constant 3 : index +// UNROLL-BY-3: loop.for %[[IV:.*]] = %[[C0]] to %[[C9]] step %[[C3]] { +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[IV]]] : memref +// UNROLL-BY-3-NEXT: %[[C1_IV:.*]] = constant 1 : index +// UNROLL-BY-3-NEXT: %[[V0:.*]] = muli %[[C1]], %[[C1_IV]] : index +// UNROLL-BY-3-NEXT: %[[V1:.*]] = addi %[[IV]], %[[V0]] : index +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[V1]]] : memref +// UNROLL-BY-3-NEXT: %[[C2_IV:.*]] = constant 2 : index +// UNROLL-BY-3-NEXT: %[[V2:.*]] = muli %[[C1]], %[[C2_IV]] : index +// UNROLL-BY-3-NEXT: %[[V3:.*]] = addi %[[IV]], %[[V2]] : index +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[V3]]] : memref +// UNROLL-BY-3-NEXT: } +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[C9]]] : memref +// UNROLL-BY-3-NEXT: return diff --git a/mlir/test/Examples/standalone/lit.local.cfg b/mlir/test/Examples/standalone/lit.local.cfg new file mode 100644 index 00000000000000..481b809a0e486a --- /dev/null +++ b/mlir/test/Examples/standalone/lit.local.cfg @@ -0,0 +1,3 @@ +config.substitutions.append(("%cmake", config.host_cmake)) +config.substitutions.append(("%host_cxx", config.host_cxx)) +config.substitutions.append(("%host_cc", config.host_cc)) diff --git a/mlir/test/Examples/standalone/test.toy b/mlir/test/Examples/standalone/test.toy new file mode 100644 index 00000000000000..4f9ba5cc78e114 --- /dev/null 
+++ b/mlir/test/Examples/standalone/test.toy @@ -0,0 +1,4 @@ +# RUN: %cmake %mlir_src_root/examples/standalone -DCMAKE_CXX_COMPILER=%host_cxx -DCMAKE_C_COMPILER=%host_cc -DMLIR_DIR=%llvm_lib_dir/cmake/mlir ; %cmake --build . --target check-standalone-opt | tee %t | FileCheck %s + +# CHECK: Expected Passes: 1 +# UNSUPPORTED: windows, android diff --git a/mlir/test/IR/invalid-ops.mlir b/mlir/test/IR/invalid-ops.mlir index 2145c1bbc17220..2a14c3ae6c419c 100644 --- a/mlir/test/IR/invalid-ops.mlir +++ b/mlir/test/IR/invalid-ops.mlir @@ -303,6 +303,13 @@ func @invalid_cmp_shape(%idx : () -> ()) { // ----- +func @dma_start_not_enough_operands() { + // expected-error@+1 {{expected at least 4 operands}} + "std.dma_start"() : () -> () +} + +// ----- + func @dma_no_src_memref(%m : f32, %tag : f32, %c0 : index) { // expected-error@+1 {{expected source to be of memref type}} dma_start %m[%c0], %m[%c0], %c0, %tag[%c0] : f32, f32, f32 @@ -310,6 +317,24 @@ func @dma_no_src_memref(%m : f32, %tag : f32, %c0 : index) { // ----- +func @dma_start_not_enough_operands_for_src( + %src: memref<2x2x2xf32>, %idx: index) { + // expected-error@+1 {{expected at least 7 operands}} + "std.dma_start"(%src, %idx, %idx, %idx) : (memref<2x2x2xf32>, index, index, index) -> () +} + +// ----- + +func @dma_start_src_index_wrong_type( + %src: memref<2x2xf32>, %idx: index, %dst: memref<2xf32,1>, + %tag: memref<i32,2>, %flt: f32) { + // expected-error@+1 {{expected source indices to be of index type}} + "std.dma_start"(%src, %idx, %flt, %dst, %idx, %tag, %idx) + : (memref<2x2xf32>, index, f32, memref<2xf32,1>, index, memref<i32,2>, index) -> () +} + +// ----- + func @dma_no_dst_memref(%m : f32, %tag : f32, %c0 : index) { %mref = alloc() : memref<8 x f32> // expected-error@+1 {{expected destination to be of memref type}} @@ -318,6 +343,36 @@ func @dma_no_dst_memref(%m : f32, %tag : f32, %c0 : index) { // ----- +func @dma_start_not_enough_operands_for_dst( + %src: memref<2x2xf32>, %idx: index, %dst: memref<2xf32,1>, + 
%tag: memref<i32,2>) { + // expected-error@+1 {{expected at least 7 operands}} + "std.dma_start"(%src, %idx, %idx, %dst, %idx, %idx) + : (memref<2x2xf32>, index, index, memref<2xf32,1>, index, index) -> () +} + +// ----- + +func @dma_start_dst_index_wrong_type( + %src: memref<2x2xf32>, %idx: index, %dst: memref<2xf32,1>, + %tag: memref<i32,2>, %flt: f32) { + // expected-error@+1 {{expected destination indices to be of index type}} + "std.dma_start"(%src, %idx, %idx, %dst, %flt, %tag, %idx) + : (memref<2x2xf32>, index, index, memref<2xf32,1>, f32, memref<i32,2>, index) -> () +} + +// ----- + +func @dma_start_dst_index_wrong_type( + %src: memref<2x2xf32>, %idx: index, %dst: memref<2xf32,1>, + %tag: memref<i32,2>, %flt: f32) { + // expected-error@+1 {{expected num elements to be of index type}} + "std.dma_start"(%src, %idx, %idx, %dst, %idx, %flt, %tag) + : (memref<2x2xf32>, index, index, memref<2xf32,1>, index, f32, memref<i32,2>) -> () +} + +// ----- + func @dma_no_tag_memref(%tag : f32, %c0 : index) { %mref = alloc() : memref<8 x f32> // expected-error@+1 {{expected tag to be of memref type}} @@ -326,9 +381,80 @@ func @dma_no_tag_memref(%tag : f32, %c0 : index) { // ----- +func @dma_start_not_enough_operands_for_tag( + %src: memref<2x2xf32>, %idx: index, %dst: memref<2xf32,1>, + %tag: memref<2xi32,2>) { + // expected-error@+1 {{expected at least 8 operands}} + "std.dma_start"(%src, %idx, %idx, %dst, %idx, %idx, %tag) + : (memref<2x2xf32>, index, index, memref<2xf32,1>, index, index, memref<2xi32,2>) -> () +} + +// ----- + +func @dma_start_dst_index_wrong_type( + %src: memref<2x2xf32>, %idx: index, %dst: memref<2xf32,1>, + %tag: memref<2xi32,2>, %flt: f32) { + // expected-error@+1 {{expected tag indices to be of index type}} + "std.dma_start"(%src, %idx, %idx, %dst, %idx, %idx, %tag, %flt) + : (memref<2x2xf32>, index, index, memref<2xf32,1>, index, index, memref<2xi32,2>, f32) -> () +} + +// ----- + +func @dma_start_same_space( + %src: memref<2x2xf32>, %idx: index, %dst: memref<2xf32>, + %tag: 
memref<i32,2>) { + // expected-error@+1 {{DMA should be between different memory spaces}} + dma_start %src[%idx, %idx], %dst[%idx], %idx, %tag[] : memref<2x2xf32>, memref<2xf32>, memref<i32,2> +} + +// ----- + +func @dma_start_too_many_operands( + %src: memref<2x2xf32>, %idx: index, %dst: memref<2xf32,1>, + %tag: memref<i32,2>) { + // expected-error@+1 {{incorrect number of operands}} + "std.dma_start"(%src, %idx, %idx, %dst, %idx, %idx, %tag, %idx, %idx, %idx) + : (memref<2x2xf32>, index, index, memref<2xf32,1>, index, index, memref<i32,2>, index, index, index) -> () +} + + +// ----- + +func @dma_start_wrong_stride_type( + %src: memref<2x2xf32>, %idx: index, %dst: memref<2xf32,1>, + %tag: memref<i32,2>, %flt: f32) { + // expected-error@+1 {{expected stride and num elements per stride to be of type index}} + "std.dma_start"(%src, %idx, %idx, %dst, %idx, %idx, %tag, %idx, %flt) + : (memref<2x2xf32>, index, index, memref<2xf32,1>, index, index, memref<i32,2>, index, f32) -> () +} + +// ----- + +func @dma_wait_not_enough_operands() { + // expected-error@+1 {{expected at least 2 operands}} + "std.dma_wait"() : () -> () +} + +// ----- + func @dma_wait_no_tag_memref(%tag : f32, %c0 : index) { // expected-error@+1 {{expected tag to be of memref type}} - dma_wait %tag[%c0], %arg0 : f32 + "std.dma_wait"(%tag, %c0, %c0) : (f32, index, index) -> () +} + +// ----- + +func @dma_wait_wrong_index_type(%tag : memref<2xi32>, %idx: index, %flt: f32) { + // expected-error@+1 {{expected tag indices to be of index type}} + "std.dma_wait"(%tag, %flt, %idx) : (memref<2xi32>, f32, index) -> () +} + +// ----- + +func @dma_wait_wrong_num_elements_type(%tag : memref<2xi32>, %idx: index, %flt: f32) { + // expected-error@+1 {{expected the number of elements to be of index type}} + "std.dma_wait"(%tag, %idx, %flt) : (memref<2xi32>, index, f32) -> () } // ----- diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt index 248da51bcec722..e7b31b3d0bcfec 100644 --- 
a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -14,6 +14,7 @@ add_mlir_library(MLIRTestTransforms TestLiveness.cpp TestLoopMapping.cpp TestLoopParametricTiling.cpp + TestLoopUnrolling.cpp TestOpaqueLoc.cpp TestMemRefBoundCheck.cpp TestMemRefDependenceCheck.cpp diff --git a/mlir/test/lib/Transforms/TestLoopUnrolling.cpp b/mlir/test/lib/Transforms/TestLoopUnrolling.cpp new file mode 100644 index 00000000000000..7cd221f37f8c00 --- /dev/null +++ b/mlir/test/lib/Transforms/TestLoopUnrolling.cpp @@ -0,0 +1,68 @@ +//===-------- TestLoopUnrolling.cpp --- loop unrolling test pass ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass to unroll loops by a specified unroll factor. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/LoopOps/LoopOps.h" +#include "mlir/IR/Builders.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/LoopUtils.h" +#include "mlir/Transforms/Passes.h" + +using namespace mlir; + +namespace { + +static unsigned getNestingDepth(Operation *op) { + Operation *currOp = op; + unsigned depth = 0; + while ((currOp = currOp->getParentOp())) { + if (isa<loop::ForOp>(currOp)) + depth++; + } + return depth; +} + +class TestLoopUnrollingPass + : public PassWrapper<TestLoopUnrollingPass, FunctionPass> { +public: + TestLoopUnrollingPass() = default; + TestLoopUnrollingPass(const TestLoopUnrollingPass &) {} + explicit TestLoopUnrollingPass(uint64_t unrollFactorParam, + unsigned loopDepthParam) { + unrollFactor = unrollFactorParam; + loopDepth = loopDepthParam; + } + + void runOnFunction() override { + FuncOp func = getFunction(); + SmallVector<loop::ForOp, 4> loops; + func.walk([&](loop::ForOp forOp) { + if (getNestingDepth(forOp) == loopDepth) + loops.push_back(forOp); + }); + for (auto loop : loops) { + loopUnrollByFactor(loop, unrollFactor); + } + } + Option<uint64_t> unrollFactor{*this, "unroll-factor", + llvm::cl::desc("Loop unroll factor."), + llvm::cl::init(1)}; + Option<unsigned> loopDepth{*this, "loop-depth", llvm::cl::desc("Loop depth."), + llvm::cl::init(0)}; +}; +} // end namespace + +namespace mlir { +void registerTestLoopUnrollingPass() { + PassRegistration<TestLoopUnrollingPass>( + "test-loop-unrolling", "Tests loop unrolling transformation"); +} +} // namespace mlir diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index 65f80315d57aa4..e78c82815b15a6 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -31,6 +31,7 @@ config.substitutions.append(('%PATH%', config.environment['PATH'])) config.substitutions.append(('%shlibext', config.llvm_shlib_ext)) +config.substitutions.append(("%mlir_src_root", config.mlir_src_root)) llvm_config.with_system_environment( ['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP']) diff --git a/mlir/test/lit.site.cfg.py.in 
b/mlir/test/lit.site.cfg.py.in index dafb1c9a3eb861..dc6286a827bb73 100644 --- a/mlir/test/lit.site.cfg.py.in +++ b/mlir/test/lit.site.cfg.py.in @@ -23,6 +23,7 @@ config.llvm_bindings = "@LLVM_BINDINGS@".split(' ') config.host_os = "@HOST_OS@" config.host_cc = "@HOST_CC@" config.host_cxx = "@HOST_CXX@" +config.host_cmake = "@CMAKE_COMMAND@" # Note: ldflags can contain double-quoted paths, so must use single quotes here. config.host_ldflags = '@HOST_LDFLAGS@' config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" diff --git a/mlir/tools/mlir-opt/CMakeLists.txt b/mlir/tools/mlir-opt/CMakeLists.txt index 2504b04420b79a..ca39f37a8d8d03 100644 --- a/mlir/tools/mlir-opt/CMakeLists.txt +++ b/mlir/tools/mlir-opt/CMakeLists.txt @@ -30,7 +30,6 @@ set(LIBS MLIRTestTransforms MLIRSupport MLIRIR - MLIROptLib ) # Exclude from libMLIR.so because this has static options intended for @@ -42,6 +41,10 @@ add_mlir_library(MLIRMlirOptMain LINK_LIBS ${LIBS} + + DEPENDS + intrinsics_gen + mlir-headers ) add_llvm_tool(mlir-opt diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index c5cc533ab1199a..9d583dc2a3198a 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -53,6 +53,7 @@ void registerTestLinalgTransforms(); void registerTestLivenessPass(); void registerTestLoopFusion(); void registerTestLoopMappingPass(); +void registerTestLoopUnrollingPass(); void registerTestMatchers(); void registerTestMemRefDependenceCheck(); void registerTestMemRefStrideCalculation(); @@ -119,6 +120,7 @@ void registerTestPasses() { registerTestLivenessPass(); registerTestLoopFusion(); registerTestLoopMappingPass(); + registerTestLoopUnrollingPass(); registerTestMatchers(); registerTestMemRefDependenceCheck(); registerTestMemRefStrideCalculation();