-
Notifications
You must be signed in to change notification settings - Fork 16
Decompiler Intermediate Representation Design
- Design Considerations
- Open Questions
-
Details
- Dynamic indexing can be complicated
- Object methods may have out variables and complicated overloading
- Object methods without variables need variable for unused parameters
- Object methods have template overload methods that depend on the type
- Multiple variables can share one register and be written to in one instruction
- Interfaces and Classes
- Instruction Lifting
- Loops
- Geometry Shaders can merge multiple output signatures into a single register based on semantic
- Emit and cut operate on implicit registers
- Hull Shaders
- Method signatures need to be derived from a mixture of declaration tokens and input/output signature chunks
- SM6 operations are scalarized
- SM6 operates on resource handles rather than resource registers
- SM6 supports function calls in library shaders
- [SM6 only allows for unstructured control flow](#sm6-only-allows-for-unstructured-control-flow)
DXDecompiler should support the SM2, SM3, SM4 and SM5 shader models. SM1 and SM6 support may be considered in the future, so the decompiler should allow for the possibility. HLSL output should be correct and readable. To implement this, the architecture should be structured as follows:
- Parse Bytecode into a Bytecode Container (SM2-3 and SM4-5 have their own individual containers).
- Convert the Bytecode Container into an intermediate representation.
- Convert intermediate representation to HLSL source.
There are a number of issues and goals that make directly translating shader bytecode difficult.
- Type information is stripped from instruction operands and needs to be reconstructed
- The ability to decompile multiple shader stages in a single batch should be supported in order to decompile effect shaders and to merge shared resource definitions of separate stages.
- Some bytecode instructions do not correspond cleanly to HLSL and need transformation to better represent HLSL semantics
- Some bytecode structures do not correspond cleanly to HLSL and need transformation to better represent HLSL semantics, such as hull shaders and classes & interfaces.
- Instruction lifting needs to be performed, to transform low level bytecode instructions into high level HLSL instructions. This may be a one to many or many to one process.
- Information is spread across instruction declarations and bytecode chunks and should be consolidated.
- SM6 shaders scalarize all instructions. A single vector instruction such as
vec1.xyzw + vec2.xyzw
translates into 4 scalar instructions, one for each component.
- SM6 only supports unstructured control flow (jump and label statements), which needs to be converted into structured control flow constructs (if statements, while loops).
- How should analysis such as type reconstruction and instruction lifting be done in a way that supports operand swizzles?
- How closely should the IR format resemble HLSL vs bytecode?
- Should the format be an AST or a list of instructions that operate on registers?
- Should destination registers be part of the IR format?
- Should the format be a tree structure or a list?
- How should declarations be implemented?
- How should IR passes be structured (lifting passes, type reconstruction passes)?
- How should a mapping between assembly instructions and HLSL instructions be maintained during transforms?
struct foo {
float4 sValue1[5];
float4 sValue2[5];
};
StructuredBuffer<struct foo> tex;
int index1;
int index2;
float4 main() : SV_Target
{
float4 result = 0;
result += tex.Load(index1).sValue2[index2];
return result;
}
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_constantbuffer CB0[1], immediateIndexed
dcl_resource_structured t0, 160
dcl_output o0.xyzw
dcl_temps 3
ld_structured_indexable(structured_buffer, stride=160)(mixed,mixed,mixed,mixed) r0.xyzw, cb0[0].x, l(80), t0.xyzw
ieq r1.x, cb0[0].y, l(0)
and r0.xyzw, r0.xyzw, r1.xxxx
ld_structured_indexable(structured_buffer, stride=160)(mixed,mixed,mixed,mixed) r1.xyzw, cb0[0].x, l(96), t0.xyzw
ieq r2.xyzw, cb0[0].yyyy, l(1, 2, 3, 4)
and r1.xyzw, r1.xyzw, r2.xxxx
or r0.xyzw, r0.xyzw, r1.xyzw
ld_structured_indexable(structured_buffer, stride=160)(mixed,mixed,mixed,mixed) r1.xyzw, cb0[0].x, l(112), t0.xyzw
and r1.xyzw, r1.xyzw, r2.yyyy
or r0.xyzw, r0.xyzw, r1.xyzw
ld_structured_indexable(structured_buffer, stride=160)(mixed,mixed,mixed,mixed) r1.xyzw, cb0[0].x, l(128), t0.xyzw
and r1.xyzw, r1.xyzw, r2.zzzz
or r0.xyzw, r0.xyzw, r1.xyzw
ld_structured_indexable(structured_buffer, stride=160)(mixed,mixed,mixed,mixed) r1.xyzw, cb0[0].x, l(144), t0.xyzw
and r1.xyzw, r1.xyzw, r2.wwww
or o0.xyzw, r0.xyzw, r1.xyzw
ret
Texture2D tex;
float4 main() : SV_Target
{
float4 result = 0;
//In Variables
uint mipLevel = 1;
//Out Variables
uint width;
uint height;
uint numberOfLevels;
tex.GetDimensions(width, height);
result += width;
result += height;
tex.GetDimensions(mipLevel, width, height, numberOfLevels);
result += width;
result += height;
result += numberOfLevels;
return result;
}
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_resource_texture2d (float,float,float,float) t0
dcl_output o0.xyzw
dcl_temps 1
resinfo_indexable(texture2d)(float,float,float,float)_uint r0.xy, l(0), t0.xyzw
utof r0.xy, r0.xyxx
add r0.x, r0.y, r0.x
resinfo_indexable(texture2d)(float,float,float,float)_uint r0.yzw, l(1), t0.zxyw
utof r0.yzw, r0.yyzw
add r0.x, r0.y, r0.x
add r0.x, r0.z, r0.x
add o0.xyzw, r0.wwww, r0.xxxx
ret
dcl_resource_structured t111, 16
bufinfo_indexable(structured_buffer, stride=16)(mixed,mixed,mixed,mixed) r0.y, t111.yxzw
StructuredBuffer<struct foo> struct_buf;
void main(){
uint udim, numStructs, stride;
struct_buf.GetDimensions(numStructs, stride);
}
dcl_resource_buffer (float,float,float,float) t109
dcl_resource_structured t111, 16
bufinfo_indexable(buffer)(float,float,float,float) r0.x, t109.xyzw
bufinfo_indexable(structured_buffer, stride=16)(mixed,mixed,mixed,mixed) r0.y, t111.yxzw
Buffer<float4> buf_resource;
StructuredBuffer<struct foo> struct_buf;
void main(){
uint udim, numStructs, stride;
buf_resource.GetDimensions(udim);
struct_buf.GetDimensions(numStructs, stride);
}
float g1;
float3 g2;
struct Input {
float f1 : TEXCOORD0;
float3 f2 : TEXCOORD1;
};
float4 main(Input input) : SV_TARGET {
float4 result = 0;
result += g1 + input.f1;
result += float4(g2.xyz, 0) + float4(input.f2.xyz, 0);
return result;
}
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_constantbuffer CB0[1], immediateIndexed
dcl_input_ps linear v0.x
dcl_input_ps linear v0.yzw
dcl_output o0.xyzw
dcl_temps 2
mov r0.xyz, v0.yzwy
add r0.xyz, r0.xyzx, cb0[0].yzwy
add r1.x, v0.x, cb0[0].x
mov r0.w, l(0)
add o0.xyzw, r0.xyzw, r1.xxxx
ret
- Methods can have multiple bodies
- Assembly does not specify parameters
interface iBaseLight
{
float3 IlluminateAmbient(float3 vNormal);
float3 IlluminateDiffuse(float3 vNormal);
};
class cAmbientLight : iBaseLight
{
float3 m_vLightColor;
bool m_bEnable;
float3 IlluminateAmbient(float3 vNormal) {
return vNormal + 2;
}
float3 IlluminateDiffuse(float3 vNormal) {
return vNormal + 3;
}
};
class dAmbientLight : iBaseLight
{
float3 m_vLightColor;
bool m_bEnable;
float3 IlluminateAmbient(float3 vNormal) {
return vNormal + 4;
}
float3 IlluminateDiffuse(float3 vNormal) {
return vNormal + 5;
}
};
struct PS_INPUT
{
float4 vPosition : SV_POSITION;
float3 vNormal : NORMAL;
float2 vTexcoord : TEXCOORD0;
};
iBaseLight g_abstractAmbientLighting;
iBaseLight g_abstractDirectLighting;
float4 main( PS_INPUT Input ) : SV_TARGET {
float4 result = (float4)0;
float3 temp = (float3)0.0f;
temp += g_abstractAmbientLighting.IlluminateAmbient( Input.vNormal );
temp += g_abstractAmbientLighting.IlluminateAmbient( Input.vNormal );
temp += g_abstractAmbientLighting.IlluminateDiffuse( Input.vNormal );
result += float4(temp, 0);
return result;
}
//
// Generated by Microsoft (R) HLSL Shader Compiler 10.1
//
//
// Buffer Definitions:
//
// interfaces $ThisPointer
// {
//
// interface iBaseLight g_abstractAmbientLighting;// Offset: 0 Size: 1
// interface iBaseLight g_abstractDirectLighting;// Offset: N/A Size: N/A [unused]
//
// }
//
//
//
// Input signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// SV_POSITION 0 xyzw 0 POS float
// NORMAL 0 xyz 1 NONE float xyz
// TEXCOORD 0 xy 2 NONE float
//
//
// Output signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// SV_TARGET 0 xyzw 0 TARGET float xyzw
//
//
// Available Class Types:
//
// Name ID CB Stride Texture Sampler
// ------------------------------ ---- --------- ------- -------
// dAmbientLight 0 0 0 0
// cAmbientLight 1 0 0 0
//
// Interface slots, 1 total:
//
// Slots
// +----------+---------+---------------------------------------
// | Type ID | 0 |0 1
// | Table ID | |0 1
// +----------+---------+---------------------------------------
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_function_body fb0
dcl_function_body fb1
dcl_function_body fb2
dcl_function_body fb3
dcl_function_body fb4
dcl_function_body fb5
dcl_function_table ft0 = {fb0, fb2, fb4}
dcl_function_table ft1 = {fb1, fb3, fb5}
dcl_interface fp0[1][3] = {ft0, ft1}
dcl_input_ps linear v1.xyz
dcl_output o0.xyzw
dcl_temps 2
fcall fp0[0][0]
fcall fp0[0][1]
add r0.xyz, r0.xyzx, r1.xyzx
fcall fp0[0][2]
add o0.xyz, r0.xyzx, r1.xyzx
mov o0.w, l(0)
ret
label fb0
add r0.xyz, v1.xyzx, l(4.000000, 4.000000, 4.000000, 0.000000)
ret
label fb1
add r0.xyz, v1.xyzx, l(2.000000, 2.000000, 2.000000, 0.000000)
ret
label fb2
add r1.xyz, v1.xyzx, l(4.000000, 4.000000, 4.000000, 0.000000)
ret
label fb3
add r1.xyz, v1.xyzx, l(2.000000, 2.000000, 2.000000, 0.000000)
ret
label fb4
add r1.xyz, v1.xyzx, l(5.000000, 5.000000, 5.000000, 0.000000)
ret
label fb5
add r1.xyz, v1.xyzx, l(3.000000, 3.000000, 3.000000, 0.000000)
ret
// Approximately 19 instruction slots used
Matrix operations are converted to vector operations
VS_OUTPUT VSMain(VS_INPUT Input)
{
VS_OUTPUT Output;
Output.vPosition = mul(Input.vPosition, g_mWorldViewProjection);
return Output;
}
vs_5_0
dcl_globalFlags refactoringAllowed
dcl_constantbuffer CB0[4], immediateIndexed
dcl_input v0.xyzw
dcl_output_siv o0.xyzw, position
dp4 o0.x, v0.xyzw, cb0[0].xyzw
dp4 o0.y, v0.xyzw, cb0[1].xyzw
dp4 o0.z, v0.xyzw, cb0[2].xyzw
dp4 o0.w, v0.xyzw, cb0[3].xyzw
ret
For example
normalize(value)
is converted to three instructions
dp4 r0.x, cb0[0].xyzw, cb0[0].xyzw
rsq r0.x, r0.x
mul o0.xyzw, r0.xxxx, cb0[0].xyzw
The syntax for instructions is generally
add[_sat] dest[.mask], [-]src0[_abs][.swizzle], [-]src1[_abs][.swizzle]
which allows for modifiers such as saturate, absolute value and negation that should be handled somehow.
UBFE, IBFE and BFI are defined as:
Given width, offset:
if( width == 0 )
{
dest = 0
}
else if( width + offset < 32 )
{
shl dest, src2, 32-(width+offset)
ushr dest, dest, 32-width
}
else
{
ushr dest, src2, offset
}
UAddc and USubb store output in two destination registers, one for the main output and one for the result of carry/borrow.
UDiv stores output in two destination registers, one for the quotient and one for the remainder
IMul and UMul store the output in two destination registers, one for the high 32 bits and one for the low 32 bits.
swapc dest0[.mask] dest1[.mask] src0[.swizzle] src1[.swizzle] src2[.swizzle]
; expands to:
movc temp[dest0's mask] src0[.swizzle] src2[.swizzle] src1[.swizzle]
movc dest1[.mask] src0[.swizzle] src1[.swizzle] src2[.swizzle]
mov dest0.mask, temp
Sin and Cos are implemented as a single instruction that has two destination registers
return sin(x) + cos(x);
sincos r0.xyzw, r1.xyzw, cb0[0].xyzw
add o0.xyzw, r0.xyzw, r1.xyzw
Convert loops into something more easily readable
float g1;
float4 main() : SV_TARGET {
float4 result = 10;
for(int i = 0; i < 3; i++){
result /= g1;
}
while(true){
if(result.x > 10) break;
result /= g1;
}
return result;
}
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_constantbuffer CB0[1], immediateIndexed
dcl_output o0.xyzw
dcl_temps 3
mov r0.xyzw, l(10.000000,10.000000,10.000000,10.000000)
mov r1.x, l(0)
loop
ige r1.y, r1.x, l(3)
breakc_nz r1.y
div r0.xyzw, r0.xyzw, cb0[0].xxxx
iadd r1.x, r1.x, l(1)
endloop
mov r1.xyzw, r0.xyzw
loop
lt r2.x, l(10.000000), r1.x
if_nz r2.x
break
endif
div r1.xyzw, r1.xyzw, cb0[0].xxxx
endloop
mov o0.xyzw, r1.xyzw
ret
struct InVertex
{
float3 pos : POSITION;
float3 scale : NORMAL;
};
struct OutVertex1
{
float3 pos : OUT_POSITION;
};
struct OutVertex2
{
int3 pos : OUT_POSITION;
};
[maxvertexcount(18)]
void main(
line InVertex verts[2],
inout PointStream<OutVertex1> myStream1,
inout PointStream<OutVertex2> myStream2 )
{
OutVertex1 myVert1;
OutVertex2 myVert2;
myVert1.pos = verts[0].pos * verts[0].scale;
myVert2.pos = verts[1].pos * verts[1].scale;
myStream1.Append( myVert1 );
myStream2.Append( myVert2 );
}
//
// Generated by Microsoft (R) HLSL Shader Compiler 10.1
//
//
//
// Input signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// POSITION 0 xyz 0 NONE float xyz
// NORMAL 0 xyz 1 NONE float xyz
//
//
// Output signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// m0:OUT_POSITION 0 xyz 0 NONE float xyz
// m1:OUT_POSITION 0 xyz 0 NONE int xyz
//
gs_5_0
dcl_globalFlags refactoringAllowed
dcl_input v[2][0].xyz
dcl_input v[2][1].xyz
dcl_temps 1
dcl_inputprimitive line
dcl_stream m0
dcl_outputtopology pointlist
dcl_output o0.xyz
dcl_stream m1
dcl_outputtopology pointlist
dcl_output o0.xyz
dcl_maxout 18
mul r0.xyz, v[0][0].xyzx, v[0][1].xyzx
mov o0.xyz, r0.xyzx
emit_stream m0
mul r0.xyz, v[1][0].xyzx, v[1][1].xyzx
ftoi r0.xyz, r0.xyzx
mov o0.xyz, r0.xyzx
emit_stream m1
ret
// Approximately 8 instruction slots used
emit causes all declared o# registers to be read out of the Geometry Shader to generate a vertex.
It is only valid if streams have not been declared; when streams are declared, emit_stream is used instead (as in the example above).
Hull shaders are made up of two phases: a control point phase and a fork/join phase. The control point phase is optional.
hs_5_0
hs_decls
; dcls start
hs_control_point_phase
; dcls end
; function body here
ret
hs_fork_phase
dcl_hs_fork_phase_instance_count 4
; dcls here
; function body here
ret
hs_fork_phase
dcl_hs_fork_phase_instance_count 2
; dcls here
; function body here
ret
The original patch constant function cannot be recovered, and the instructions of each fork and join phase need to be wrapped in a loop that iterates as many times as that phase's declared instance count.
[maxvertexcount(3)]
void main(
triangle float4 ipos[3] : SV_Position,
inout TriangleStream<PSSceneIn> OutputStream,
uint id : SV_GSInstanceID,
uint prim : SV_PrimitiveID
)
{
PSSceneIn o = (PSSceneIn)0;
OutputStream.Append(o);
}
dcl_globalFlags refactoringAllowed
dcl_input_siv v[3][0].xyzw, position
dcl_input vGSInstanceID
dcl_input vPrim
dcl_temps 1
dcl_inputprimitive triangle
dcl_stream m0
dcl_outputtopology trianglestrip
dcl_output_siv o0.xyzw, position
dcl_output o1.xyzw
dcl_output_siv o2.x, clip_distance
dcl_output_siv o2.y, cull_distance
dcl_output_sgv o3.x, is_front_face
dcl_output_siv o3.y, rendertarget_array_index
dcl_output_siv o3.z, viewport_array_index
dcl_maxout 3
HS_CONSTANT_DATA_OUTPUT SubDToBezierConstantsHS(
InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip,
uint PatchID : SV_PrimitiveID )
{
HS_CONSTANT_DATA_OUTPUT Output;
return Output;
}
[domain("quad")]
[partitioning("integer")]
[outputtopology("triangle_cw")]
[outputcontrolpoints(16)]
[patchconstantfunc("SubDToBezierConstantsHS")]
BEZIER_CONTROL_POINT main(
InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip,
uint i : SV_OutputControlPointID,
uint PatchID : SV_PrimitiveID )
{
BEZIER_CONTROL_POINT Output;
return Output;
}
hs_5_0
hs_decls
dcl_input_control_point_count 32
dcl_output_control_point_count 16
dcl_tessellator_domain domain_quad
dcl_tessellator_partitioning partitioning_integer
dcl_tessellator_output_primitive output_triangle_cw
dcl_globalFlags refactoringAllowed
hs_control_point_phase
dcl_input vPrim
dcl_output o0.x
ret
hs_fork_phase
dcl_hs_fork_phase_instance_count 4
dcl_input vForkInstanceID
dcl_output_siv o0.x, finalQuadUeq0EdgeTessFactor
dcl_output_siv o1.x, finalQuadVeq0EdgeTessFactor
dcl_output_siv o2.x, finalQuadUeq1EdgeTessFactor
dcl_output_siv o3.x, finalQuadVeq1EdgeTessFactor
dcl_temps 1
dcl_indexrange o0.x 4
ret
hs_fork_phase
dcl_hs_fork_phase_instance_count 2
dcl_input vForkInstanceID
dcl_output_siv o4.x, finalQuadUInsideTessFactor
dcl_output_siv o5.x, finalQuadVInsideTessFactor
dcl_temps 1
dcl_indexrange o4.x 2
ret
[domain("quad")]
DS_OUTPUT main( HS_CONSTANT_DATA_OUTPUT input,
float2 UV : SV_DomainLocation,
const OutputPatch<BEZIER_CONTROL_POINT, 16> bezpatch )
{
DS_OUTPUT Output = (DS_OUTPUT)0;
return Output;
}
dcl_input_control_point_count 16
dcl_tessellator_domain domain_quad
dcl_globalFlags refactoringAllowed
dcl_output o0.xyz
dcl_output o1.xy
dcl_output o2.xyz
dcl_output o3.xyz
dcl_output_siv o4.xyzw, position
RWByteAddressBuffer buf : register(u0);
[numthreads(4, 2, 1)]
void main(
uint3 tid : SV_DispatchThreadID,
uint3 gid : SV_GroupID,
uint gi : SV_GroupIndex,
uint3 gtid: SV_GroupThreadID
)
{
buf.InterlockedCompareStore(0, 1, (uint)(tid + gid + gi + gtid));
}
cs_5_0
dcl_globalFlags refactoringAllowed
dcl_uav_raw u0
dcl_input vThreadIDInGroupFlattened
dcl_input vThreadGroupID.x
dcl_input vThreadIDInGroup.x
dcl_input vThreadID.x
dcl_temps 1
dcl_thread_group 4, 2, 1
void main(
nointerpolation float texcoord0 : TEXCOORD0,
linear float texcoord1 : TEXCOORD1,
linear centroid float texcoord2 : TEXCOORD2,
linear noperspective float texcoord3 : TEXCOORD3,
linear noperspective centroid float texcoord4 : TEXCOORD4,
linear sample float texcoord5 : TEXCOORD5,
linear noperspective sample float texcoord6 : TEXCOORD6,
linear sample float4 ipos : SV_Position,
out float4 target0 : SV_Target0,
uint coverage_in : SV_Coverage,
out uint coverage_out : SV_Coverage,
float clip : SV_ClipDistance0,
uint vi : SV_ViewportArrayIndex,
out float depth_out : SV_Depth,
uint rt : SV_RenderTargetArrayIndex,
uint prim : SV_PrimitiveID,
uint si : SV_SampleIndex
)
{}
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_input_ps constant v0.x
dcl_input_ps_siv constant v0.y, viewport_array_index
dcl_input_ps_siv constant v0.z, rendertarget_array_index
dcl_input_ps linear v1.x
dcl_input_ps linear centroid v2.x
dcl_input_ps linear noperspective v3.x
dcl_input_ps linear noperspective centroid v4.x
dcl_input_ps linear sample v5.x
dcl_input_ps linear noperspective sample v6.x
dcl_input_ps_siv linear noperspective sample v7.xyzw, position
dcl_input_ps_siv linear v8.x, clip_distance
dcl_input_ps_sgv constant v9.x, sampleIndex
dcl_input vCoverage
dcl_output o0.xyzw
dcl_output oMask
dcl_output oDepth
dcl_temps 2
void main(
float4 ipos : SV_Position,
uint id : SV_VertexID,
out float4 opos : SV_Position,
out float clip0 : SV_ClipDistance0,
out float cull0 : SV_CullDistance0,
out min16float v0 : TEXCOORD0,
out min10float v1 : TEXCOORD1,
out min16int v2 : TEXCOORD2,
out min12int v3 : TEXCOORD3,
out min16uint v4 : TEXCOORD4
)
{ }
vs_5_0
dcl_globalFlags refactoringAllowed | enableMinimumPrecision
dcl_input v0.xyzw
dcl_input_sgv v1.x, vertex_id
dcl_output_siv o0.xyzw, position
dcl_output_siv o1.x, clip_distance
dcl_output_siv o1.y, cull_distance
dcl_temps 1
float4 PSMain(VS_OUTPUT Input) : SV_TARGET
{
float4 vDiffuse = g_txDiffuse.Sample(g_samLinear, Input.vTexcoord);
float fLighting = saturate(dot(g_vLightDir, Input.vNormal));
fLighting = max(fLighting, g_fAmbient);
return vDiffuse * fLighting;
}
define void @PSMain() {
%1 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 0, i32 0, i32 0, i1 false) ; CreateHandle(resourceClass,rangeId,index,nonUniformIndex)
%2 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 3, i32 0, i32 0, i1 false) ; CreateHandle(resourceClass,rangeId,index,nonUniformIndex)
%3 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 1, i1 false) ; CreateHandle(resourceClass,rangeId,index,nonUniformIndex)
%4 = call float @dx.op.loadInput.f32(i32 4, i32 1, i32 0, i8 0, i32 undef) ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
%5 = call float @dx.op.loadInput.f32(i32 4, i32 1, i32 0, i8 1, i32 undef) ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
%6 = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 0, i32 undef) ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
%7 = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 1, i32 undef) ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
%8 = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 2, i32 undef) ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
%9 = call %dx.types.ResRet.f32 @dx.op.sample.f32(i32 60, %dx.types.Handle %1, %dx.types.Handle %2, float %4, float %5, float undef, float undef, i32 0, i32 0, i32 undef, float undef) ; Sample(srv,sampler,coord0,coord1,coord2,coord3,offset0,offset1,offset2,clamp)
%10 = extractvalue %dx.types.ResRet.f32 %9, 0
%11 = extractvalue %dx.types.ResRet.f32 %9, 1
%12 = extractvalue %dx.types.ResRet.f32 %9, 2
%13 = extractvalue %dx.types.ResRet.f32 %9, 3
%14 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %3, i32 0) ; CBufferLoadLegacy(handle,regIndex)
%15 = extractvalue %dx.types.CBufRet.f32 %14, 0
%16 = extractvalue %dx.types.CBufRet.f32 %14, 1
%17 = extractvalue %dx.types.CBufRet.f32 %14, 2
%18 = call float @dx.op.dot3.f32(i32 55, float %15, float %16, float %17, float %6, float %7, float %8) ; Dot3(ax,ay,az,bx,by,bz)
%19 = call float @dx.op.unary.f32(i32 7, float %18) ; Saturate(value)
%20 = extractvalue %dx.types.CBufRet.f32 %14, 3
%21 = call float @dx.op.binary.f32(i32 35, float %19, float %20) ; FMax(a,b)
%22 = fmul fast float %21, %10
%23 = fmul fast float %21, %11
%24 = fmul fast float %21, %12
%25 = fmul fast float %21, %13
call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float %22) ; StoreOutput(outputSigId,rowIndex,colIndex,value)
call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float %23) ; StoreOutput(outputSigId,rowIndex,colIndex,value)
call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 2, float %24) ; StoreOutput(outputSigId,rowIndex,colIndex,value)
call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 3, float %25) ; StoreOutput(outputSigId,rowIndex,colIndex,value)
ret void
}
The difference is most prominent with the ResourceDescriptorHeap syntax
uint ID;
float main(uint i:I): SV_Target {
Buffer<float> buf = ResourceDescriptorHeap[ID];
return buf[i];
}
define void @main() {
%1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 2 }, i32 0, i1 false) ; CreateHandleFromBinding(bind,index,nonUniformIndex)
%2 = call i32 @dx.op.loadInput.i32(i32 4, i32 0, i32 0, i8 0, i32 undef) ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
%3 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 13, i32 4 }) ; AnnotateHandle(res,props) resource: CBuffer
%4 = call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32(i32 59, %dx.types.Handle %3, i32 0) ; CBufferLoadLegacy(handle,regIndex)
%5 = extractvalue %dx.types.CBufRet.i32 %4, 0
%6 = call %dx.types.Handle @dx.op.createHandleFromHeap(i32 218, i32 %5, i1 false, i1 false) ; CreateHandleFromHeap(index,samplerHeap,nonUniformIndex)
%7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %6, %dx.types.ResourceProperties { i32 10, i32 265 }) ; AnnotateHandle(res,props) resource: TypedBuffer<F32>
%8 = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle %7, i32 %2, i32 undef) ; BufferLoad(srv,index,wot)
%9 = extractvalue %dx.types.ResRet.f32 %8, 0
call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float %9) ; StoreOutput(outputSigId,rowIndex,colIndex,value)
ret void
}
cbuffer A {
float a;
}
void StoreOutputMat(float2x2 m, uint gidx);
float2x2 LoadInputMat(uint x, uint y);
float2x2 RotateMat(float2x2 m, uint x, uint y);
[numthreads(8,8,1)]
void entry( uint2 tid : SV_DispatchThreadID, uint2 gid : SV_GroupID, uint2 gtid : SV_GroupThreadID, uint gidx : SV_GroupIndex )
{
float2x2 f2x2 = LoadInputMat(gid.x, gid.y);
f2x2 = RotateMat(f2x2, tid.x, tid.y) + a;
StoreOutputMat(f2x2, gidx);
}
define void @entry() {
%1 = load %A, %A* @A, align 4
%2 = call %dx.types.Handle @dx.op.createHandleForLib.A(i32 160, %A %1) ; CreateHandleForLib(Resource)
%3 = call i32 @dx.op.threadId.i32(i32 93, i32 0) ; ThreadId(component)
%4 = call i32 @dx.op.threadId.i32(i32 93, i32 1) ; ThreadId(component)
%5 = call i32 @dx.op.groupId.i32(i32 94, i32 0) ; GroupId(component)
%6 = call i32 @dx.op.groupId.i32(i32 94, i32 1) ; GroupId(component)
%7 = call i32 @dx.op.flattenedThreadIdInGroup.i32(i32 96) ; FlattenedThreadIdInGroup()
%8 = alloca <4 x float>, align 8
%9 = alloca <4 x float>, align 8
%10 = alloca <4 x float>, align 8
%11 = alloca <4 x float>, align 8
%12 = call %class.matrix.float.2.2 @"\01?LoadInputMat@@YA?AV?$matrix@M$01$01@@II@Z"(i32 %5, i32 %6) #3
%13 = bitcast <4 x float>* %11 to %class.matrix.float.2.2*
store %class.matrix.float.2.2 %12, %class.matrix.float.2.2* %13, align 8
%14 = load <4 x float>, <4 x float>* %11, align 8
%15 = bitcast <4 x float>* %10 to %class.matrix.float.2.2*
store <4 x float> %14, <4 x float>* %10, align 8
%16 = load %class.matrix.float.2.2, %class.matrix.float.2.2* %15, align 8
%17 = call %class.matrix.float.2.2 @"\01?RotateMat@@YA?AV?$matrix@M$01$01@@V1@II@Z"(%class.matrix.float.2.2 %16, i32 %3, i32 %4) #3
%18 = bitcast <4 x float>* %9 to %class.matrix.float.2.2*
store %class.matrix.float.2.2 %17, %class.matrix.float.2.2* %18, align 8
%19 = load <4 x float>, <4 x float>* %9, align 8
%20 = extractelement <4 x float> %19, i32 0
%21 = extractelement <4 x float> %19, i32 1
%22 = extractelement <4 x float> %19, i32 2
%23 = extractelement <4 x float> %19, i32 3
%24 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %2, i32 0) ; CBufferLoadLegacy(handle,regIndex)
%25 = extractvalue %dx.types.CBufRet.f32 %24, 0
%26 = fadd fast float %25, %20
%27 = fadd fast float %25, %21
%28 = fadd fast float %25, %22
%29 = fadd fast float %25, %23
%30 = insertelement <4 x float> undef, float %26, i32 0
%31 = insertelement <4 x float> %30, float %27, i32 1
%32 = insertelement <4 x float> %31, float %28, i32 2
%33 = insertelement <4 x float> %32, float %29, i32 3
%34 = bitcast <4 x float>* %8 to %class.matrix.float.2.2*
store <4 x float> %33, <4 x float>* %8, align 8
%35 = load %class.matrix.float.2.2, %class.matrix.float.2.2* %34, align 8
call void @"\01?StoreOutputMat@@YAXV?$matrix@M$01$01@@I@Z"(%class.matrix.float.2.2 %35, i32 %7) #3
ret void
}
declare %class.matrix.float.2.2 @"\01?LoadInputMat@@YA?AV?$matrix@M$01$01@@II@Z"(i32, i32) #0
declare %class.matrix.float.2.2 @"\01?RotateMat@@YA?AV?$matrix@M$01$01@@V1@II@Z"(%class.matrix.float.2.2, i32, i32) #0
declare void @"\01?StoreOutputMat@@YAXV?$matrix@M$01$01@@I@Z"(%class.matrix.float.2.2, i32) #0
SM6 only allows for jumps (br statements) and switch statements. Loops and if statements are converted to these primitives.
float main(float2 a : A, int3 b : B) : SV_Target
{
float s = 0;
[loop]
for(int i = 0; i < b.x; i++) {
s += a.x;
}
return s;
}
define void @main() {
%1 = call i32 @dx.op.loadInput.i32(i32 4, i32 1, i32 0, i8 0, i32 undef) ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
%2 = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 0, i32 undef) ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
%3 = icmp sgt i32 %1, 0
br i1 %3, label %4, label %12
; <label>:4 ; preds = %0
br label %5
; <label>:5 ; preds = %5, %4
%6 = phi float [ %8, %5 ], [ 0.000000e+00, %4 ]
%7 = phi i32 [ %9, %5 ], [ 0, %4 ]
%8 = fadd fast float %6, %2
%9 = add nuw nsw i32 %7, 1
%10 = icmp eq i32 %9, %1
br i1 %10, label %11, label %5, !llvm.loop !14
; <label>:11 ; preds = %5
br label %12
; <label>:12 ; preds = %11, %0
%13 = phi float [ 0.000000e+00, %0 ], [ %8, %11 ]
call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float %13) ; StoreOutput(outputSigId,rowIndex,colIndex,value)
ret void
}