@@ -2505,38 +2505,44 @@ class SyclKernelBodyCreator : public SyclKernelFieldHandler {
25052505 return CompoundStmt::Create (SemaRef.getASTContext (), BodyStmts, {}, {});
25062506 }
25072507
2508- void markParallelWorkItemCalls () {
2509- if (getKernelInvocationKind (KernelCallerFunc) ==
2510- InvokeParallelForWorkGroup) {
2511- // Fetch the kernel object and the associated call operator
2512- // (of either the lambda or the function object).
2513- CXXRecordDecl *KernelObj =
2514- GetSYCLKernelObjectType (KernelCallerFunc)->getAsCXXRecordDecl ();
2515- CXXMethodDecl *WGLambdaFn = nullptr ;
2516- if (KernelObj->isLambda ())
2517- WGLambdaFn = KernelObj->getLambdaCallOperator ();
2518- else
2519- WGLambdaFn = getOperatorParens (KernelObj);
2520- assert (WGLambdaFn && " non callable object is passed as kernel obj" );
2521- // Mark the function that it "works" in a work group scope:
2522- // NOTE: In case of parallel_for_work_item the marker call itself is
2523- // marked with work item scope attribute, here the '()' operator of the
2524- // object passed as parameter is marked. This is an optimization -
2525- // there are a lot of locals created at parallel_for_work_group
2526- // scope before calling the lambda - it is more efficient to have
2527- // all of them in the private address space rather than sharing via
2528- // the local AS. See parallel_for_work_group implementation in the
2529- // SYCL headers.
2530- if (!WGLambdaFn->hasAttr <SYCLScopeAttr>()) {
2531- WGLambdaFn->addAttr (SYCLScopeAttr::CreateImplicit (
2532- SemaRef.getASTContext (), SYCLScopeAttr::Level::WorkGroup));
2533- // Search and mark parallel_for_work_item calls:
2534- MarkWIScopeFnVisitor MarkWIScope (SemaRef.getASTContext ());
2535- MarkWIScope.TraverseDecl (WGLambdaFn);
2536- // Now mark local variables declared in the PFWG lambda with work group
2537- // scope attribute
2538- addScopeAttrToLocalVars (*WGLambdaFn);
2539- }
2508+ void annotateHierarchicalParallelismAPICalls () {
2509+ // Annotate only InvokeParallelForWorkGroup kernels; any other kind is left untouched.
2510+ if (getKernelInvocationKind (KernelCallerFunc) != InvokeParallelForWorkGroup)
2511+ return ;
2512+
2513+ // Attach an implicit work-group scope attribute to the kernel object clone
2514+ // to avoid work-item scope memory allocation for it.
2515+ KernelObjClone->addAttr (SYCLScopeAttr::CreateImplicit (
2516+ SemaRef.getASTContext (), SYCLScopeAttr::Level::WorkGroup));
2517+
2518+ // Fetch the kernel object's record and its call operator: the lambda call
2519+ // operator for a lambda, otherwise the function object's operator().
2520+ CXXRecordDecl *KernelObj =
2521+ GetSYCLKernelObjectType (KernelCallerFunc)->getAsCXXRecordDecl ();
2522+ CXXMethodDecl *WGLambdaFn = nullptr ;
2523+ if (KernelObj->isLambda ())
2524+ WGLambdaFn = KernelObj->getLambdaCallOperator ();
2525+ else
2526+ WGLambdaFn = getOperatorParens (KernelObj);
2527+ assert (WGLambdaFn && " non callable object is passed as kernel obj" );
2528+ // Mark the call operator as executing in work-group scope.
2529+ // NOTE: For parallel_for_work_item the marker call itself is marked
2530+ // with the work-item scope attribute; here the '()' operator of the
2531+ // object passed as parameter is marked. This is an optimization:
2532+ // there are a lot of locals created at parallel_for_work_group
2533+ // scope before calling the lambda, and it is more efficient to have
2534+ // all of them in the private address space rather than sharing via
2535+ // the local AS. See the parallel_for_work_group implementation in the
2536+ // SYCL headers.
2537+ if (!WGLambdaFn->hasAttr <SYCLScopeAttr>()) {
2538+ WGLambdaFn->addAttr (SYCLScopeAttr::CreateImplicit (
2539+ SemaRef.getASTContext (), SYCLScopeAttr::Level::WorkGroup));
2540+ // Traverse the call operator to find and mark parallel_for_work_item calls:
2541+ MarkWIScopeFnVisitor MarkWIScope (SemaRef.getASTContext ());
2542+ MarkWIScope.TraverseDecl (WGLambdaFn);
2543+ // Now mark local variables declared in the PFWG lambda with the work-group
2544+ // scope attribute.
2545+ addScopeAttrToLocalVars (*WGLambdaFn);
25402546 }
25412547 }
25422548
@@ -2768,13 +2774,11 @@ class SyclKernelBodyCreator : public SyclKernelFieldHandler {
27682774 TypeSourceInfo *TSInfo =
27692775 KernelObj->isLambda () ? KernelObj->getLambdaTypeInfo () : nullptr ;
27702776 auto Type = QualType (KernelObj->getTypeForDecl (), 0 );
2771- Type->getAsRecordDecl ()->setAnonymousStructOrUnion (true );
2777+ if (KernelObj->isLambda ())
2778+ Type->getAsRecordDecl ()->setAnonymousStructOrUnion (true );
27722779 VarDecl *VD = VarDecl::Create (
27732780 Ctx, DC, KernelObj->getLocation (), KernelObj->getLocation (),
27742781 KernelObj->getIdentifier (), Type, TSInfo, SC_None);
2775- if (getKernelInvocationKind (KernelCallerFunc) == InvokeParallelForWorkGroup)
2776- VD->addAttr (
2777- SYCLScopeAttr::CreateImplicit (Ctx, SYCLScopeAttr::Level::WorkGroup));
27782782 return VD;
27792783 }
27802784
@@ -2856,7 +2860,7 @@ class SyclKernelBodyCreator : public SyclKernelFieldHandler {
28562860 KernelObj(KernelObj), KernelCallerFunc(KernelCallerFunc),
28572861 KernelCallerSrcLoc(KernelCallerFunc->getLocation ()) {
28582862 CollectionInitExprs.push_back (createInitListExpr (KernelObj));
2859- markParallelWorkItemCalls ();
2863+ annotateHierarchicalParallelismAPICalls ();
28602864
28612865 Stmt *DS = new (S.Context ) DeclStmt (DeclGroupRef (KernelObjClone),
28622866 KernelCallerSrcLoc, KernelCallerSrcLoc);
0 commit comments