Skip to content

Improving performance of FPTL algorithm by 0.3 ms on console. #5866

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 18, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,22 @@ int GetCoarseLightIndex(int l, int iNrCoarseLights)
return l < iNrCoarseLights ? GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, unity_StereoEyeIndex) : 0;
}

groupshared uint s_lightVolumesCache[LIGHT_LIST_MAX_COARSE_ENTRIES];

// Packs one coarse light entry into a single uint and writes it to slot
// `lightIndex` of the groupshared s_lightVolumesCache array.
//   lightIndex  - slot in s_lightVolumesCache to write.
//   coarseIndex - coarse light index, placed in the bits above the low three.
//   volumeType  - light volume type, only its low three bits are kept.
void StoreLightVolumeCache(int lightIndex, int coarseIndex, uint volumeType)
{
    // Bits [0..2]: volume type. Masking keeps a corrupted type value confined to
    // three bits so the consumer can still reject it and early out.
    const uint typeBits = volumeType & 0x7;

    // Bits [3..31]: coarse light index (29 bits).
    s_lightVolumesCache[lightIndex] = typeBits | (coarseIndex << 3);
}

// Reads slot `lightIndex` of the groupshared s_lightVolumesCache array and
// splits the packed word back into its two fields (the inverse of the layout
// written by StoreLightVolumeCache).
//   lightIndex  - slot in s_lightVolumesCache to read.
//   coarseIndex - receives the bits above the low three (the coarse light index).
//   volumeType  - receives the low three bits (the light volume type).
void LoadLightVolumeCache(int lightIndex, out int coarseIndex, out int volumeType)
{
    // The cached word is read into a signed int, so the shift below operates on
    // a signed value.
    int packedEntry = s_lightVolumesCache[lightIndex];

    volumeType = packedEntry & 0x7;
    coarseIndex = packedEntry >> 3;
}

// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
#if PIXEL_PER_THREAD == 4
Expand All @@ -414,22 +430,31 @@ void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vL
int l=0;
// need this outer loop even on xb1 and ps4 since direct lights and
// reflection lights are kept in separate regions.
while(l<iNrCoarseLights)

if (threadID < (uint)iNrCoarseLights)
{
// fetch light
int idxCoarse = GetCoarseLightIndex(l, iNrCoarseLights);
uint uLightVolume = l<iNrCoarseLights ? _LightVolumeData[idxCoarse].lightVolume : 0;
int idxCoarse = GetCoarseLightIndex(threadID, iNrCoarseLights);
uint uLightVolume = _LightVolumeData[idxCoarse].lightVolume;
StoreLightVolumeCache(threadID, idxCoarse, uLightVolume);
}

// spot
while(l<iNrCoarseLights && uLightVolume==LIGHTVOLUMETYPE_CONE)
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif

//When using LDS to cache the volume data, this produces the most optimal code.
//Doing a manual loop like the one below adds an extra cost of .1 ms on ps4 if we use LDS.
for (; l < iNrCoarseLights; ++l)
{
int idxCoarse;
uint uLightVolume;
LoadLightVolumeCache(l, idxCoarse, uLightVolume);
bool lightValid = false;
if (uLightVolume == LIGHTVOLUMETYPE_CONE)
{
LightVolumeData lightData = _LightVolumeData[idxCoarse];
// TODO: Change by SebL
const bool bIsSpotDisc = true; // (lightData.flags&IS_CIRCULAR_SPOT_SHAPE) != 0;

// serially check PIXEL_PER_THREAD pixels
uint uVal = 0;
for(int i=0; i<PIXEL_PER_THREAD; i++)
for(int i=0; i<PIXEL_PER_THREAD && !lightValid; i++)
{
int idx = t + i*NR_THREADS;

Expand All @@ -444,22 +469,20 @@ void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vL
float2 V = abs( float2( dot(fromLight, lightData.lightAxisX.xyz), dot(fromLight, lightData.lightAxisY.xyz) ) );

float fDist2D = bIsSpotDisc ? length(V) : max(V.x,V.y);
if( all( float2(lightData.radiusSq, fSclProj) > float2(distSq, fDist2D*lightData.cotan) ) ) uVal = 1;
bool isValid = all( float2(lightData.radiusSq, fSclProj) > float2(distSq, fDist2D*lightData.cotan) );
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
//a wave is on the same tile, and the loop is uniform for the wave.
// thus we early out if at least 1 thread in the wave passed this light, saving some ALU.
lightValid = WaveActiveAnyTrue(isValid);
#else
lightValid = isValid;
#endif
}

uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31));
++l; idxCoarse = GetCoarseLightIndex(l, iNrCoarseLights);
uLightVolume = l<iNrCoarseLights ? _LightVolumeData[idxCoarse].lightVolume : 0;
}

// sphere
while(l<iNrCoarseLights && uLightVolume==LIGHTVOLUMETYPE_SPHERE)
else if (uLightVolume == LIGHTVOLUMETYPE_SPHERE)
{
LightVolumeData lightData = _LightVolumeData[idxCoarse];

// serially check PIXEL_PER_THREAD pixels
uint uVal = 0;
for(int i=0; i<PIXEL_PER_THREAD; i++)
for(int i=0; i<PIXEL_PER_THREAD && !lightValid; i++)
{
int idx = t + i*NR_THREADS;

Expand All @@ -471,22 +494,18 @@ void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vL
float3 toLight = vLp - vVPos;
float distSq = dot(toLight,toLight);

if(lightData.radiusSq>distSq) uVal = 1;
}

uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31));
++l; idxCoarse = GetCoarseLightIndex(l, iNrCoarseLights);
uLightVolume = l<iNrCoarseLights ? _LightVolumeData[idxCoarse].lightVolume : 0;
bool isValid = lightData.radiusSq>distSq;
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
lightValid = WaveActiveAnyTrue(isValid);
#else
lightValid = isValid;
#endif
}
}

// Box
while(l<iNrCoarseLights && uLightVolume==LIGHTVOLUMETYPE_BOX)
else if (uLightVolume == LIGHTVOLUMETYPE_BOX)
{
LightVolumeData lightData = _LightVolumeData[idxCoarse];

// serially check PIXEL_PER_THREAD pixels
uint uVal = 0;
for(int i=0; i<PIXEL_PER_THREAD; i++)
for(int i=0; i<PIXEL_PER_THREAD && !lightValid; i++)
{
int idx = t + i*NR_THREADS;

Expand All @@ -498,16 +517,18 @@ void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vL

float3 dist = float3( dot(toLight, lightData.lightAxisX), dot(toLight, lightData.lightAxisY), dot(toLight, lightData.lightAxisZ) );
dist = (abs(dist) - lightData.boxInnerDist) * lightData.boxInvRange; // not as efficient as it could be
if( max(max(dist.x, dist.y), dist.z)<1 ) uVal = 1; // but allows us to not write out OuterDists
bool isValid = max(max(dist.x, dist.y), dist.z)<1; // but allows us to not write out OuterDists
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
lightValid = WaveActiveAnyTrue(isValid);
#else
lightValid = isValid;
#endif
}

uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31));
++l; idxCoarse = GetCoarseLightIndex(l, iNrCoarseLights);
uLightVolume = l<iNrCoarseLights ? _LightVolumeData[idxCoarse].lightVolume : 0;
}
else
break;

// in case we have some corrupt data make sure we terminate
if(uLightVolume >=LIGHTVOLUMETYPE_COUNT) ++l;
uLightsFlags[l<32 ? 0 : 1] |= ((lightValid ? 1 : 0)<<(l&31));
}

InterlockedOr(ldsDoesLightIntersect[0], uLightsFlags[0]);
Expand Down