Skip to content

Decompiler Intermediate Representation Design

spacehamster edited this page Mar 4, 2021 · 5 revisions

Design Considerations

DXDecompiler should support the SM2, SM3, SM4 and SM5 shader models. SM1 and SM6 support may be considered in the future, so the decompiler should allow for the possibility. HLSL output should be correct and readable. To implement this, the architecture should be designed as follows:

  1. Parse Bytecode into a Bytecode Container (SM2-3 and SM4-5 have their own individual containers).
  2. Convert Bytecode Container into an intermediate representation
  3. Convert intermediate representation to HLSL source.

There are a number of issues and goals that make directly translating shader bytecode difficult.

  • Type information is stripped from instruction operands and needs to be reconstructed
  • The ability to decompile multiple shader stages in a single batch should be supported in order to decompile effect shaders and to merge shared resource definitions of separate stages.
  • Some bytecode instructions do not correspond cleanly to HLSL and need transformation to better represent HLSL semantics
  • Some bytecode structures do not correspond cleanly to HLSL and need transformation to better represent HLSL semantics, such as hull shaders and classes & interfaces.
  • Instruction lifting needs to be performed, to transform low level bytecode instructions into high level HLSL instructions. This may be a one to many or many to one process.
  • Information is spread across instruction declarations and bytecode chunks and should be consolidated.
  • SM6 shaders scalarize all instructions. A single vector instruction such as vec1.xyzw + vec2.xyzw translates into 4 scalar instructions, one for each component.
  • SM6 only supports unstructured control flow (jump and label statements), which needs to be converted into structured control flow constructs (if statements, while loops).

Open Questions

  • How should analysis such as type reconstruction and instruction lifting be done in a way that supports operand swizzles?
  • How closely should the IR format resemble HLSL vs bytecode?
  • Should the format be an AST or a list of instructions that operate on registers?
  • Should destination registers be part of the IR format?
  • Should the format be a tree structure or a list?
  • How should declarations be implemented?
  • How should IR passes be structured (lifting passes, type reconstruction passes)?
  • How should a mapping between assembly instructions and HLSL instructions be maintained during transforms?

Details

Dynamic indexing can be complicated

struct foo {
	float4 sValue1[5];
	float4 sValue2[5];
};
StructuredBuffer<struct foo> tex;
int index1;
int index2;
float4 main() : SV_Target
{
	float4 result = 0;
	result += tex.Load(index1).sValue2[index2];
	return result;
}
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_constantbuffer CB0[1], immediateIndexed
dcl_resource_structured t0, 160
dcl_output o0.xyzw
dcl_temps 3
ld_structured_indexable(structured_buffer, stride=160)(mixed,mixed,mixed,mixed) r0.xyzw, cb0[0].x, l(80), t0.xyzw
ieq r1.x, cb0[0].y, l(0)
and r0.xyzw, r0.xyzw, r1.xxxx
ld_structured_indexable(structured_buffer, stride=160)(mixed,mixed,mixed,mixed) r1.xyzw, cb0[0].x, l(96), t0.xyzw
ieq r2.xyzw, cb0[0].yyyy, l(1, 2, 3, 4)
and r1.xyzw, r1.xyzw, r2.xxxx
or r0.xyzw, r0.xyzw, r1.xyzw
ld_structured_indexable(structured_buffer, stride=160)(mixed,mixed,mixed,mixed) r1.xyzw, cb0[0].x, l(112), t0.xyzw
and r1.xyzw, r1.xyzw, r2.yyyy
or r0.xyzw, r0.xyzw, r1.xyzw
ld_structured_indexable(structured_buffer, stride=160)(mixed,mixed,mixed,mixed) r1.xyzw, cb0[0].x, l(128), t0.xyzw
and r1.xyzw, r1.xyzw, r2.zzzz
or r0.xyzw, r0.xyzw, r1.xyzw
ld_structured_indexable(structured_buffer, stride=160)(mixed,mixed,mixed,mixed) r1.xyzw, cb0[0].x, l(144), t0.xyzw
and r1.xyzw, r1.xyzw, r2.wwww
or o0.xyzw, r0.xyzw, r1.xyzw
ret 

Object methods may have out variables and complicated overloading

Texture2D tex;
float4 main() : SV_Target
{
	float4 result = 0;
    //In Variables
    uint mipLevel = 1;
    //Out Variables
    uint width;
    uint height;
    uint numberOfLevels;
	tex.GetDimensions(width, height);
    result += width;
    result += height;
    tex.GetDimensions(mipLevel, width, height, numberOfLevels);
    result += width;
    result += height;
    result += numberOfLevels;
	return result;
}
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_resource_texture2d (float,float,float,float) t0
dcl_output o0.xyzw
dcl_temps 1
resinfo_indexable(texture2d)(float,float,float,float)_uint r0.xy, l(0), t0.xyzw
utof r0.xy, r0.xyxx
add r0.x, r0.y, r0.x
resinfo_indexable(texture2d)(float,float,float,float)_uint r0.yzw, l(1), t0.zxyw
utof r0.yzw, r0.yyzw
add r0.x, r0.y, r0.x
add r0.x, r0.z, r0.x
add o0.xyzw, r0.wwww, r0.xxxx
ret 

Object methods need placeholder variables for unused out parameters

dcl_resource_structured t111, 16
bufinfo_indexable(structured_buffer, stride=16)(mixed,mixed,mixed,mixed) r0.y, t111.yxzw
StructuredBuffer<struct foo> struct_buf;
void main(){
    uint udim, numStructs, stride;
    struct_buf.GetDimensions(numStructs, stride);
}

Object methods have template overload methods that depend on the type

dcl_resource_buffer (float,float,float,float) t109
dcl_resource_structured t111, 16
bufinfo_indexable(buffer)(float,float,float,float) r0.x, t109.xyzw
bufinfo_indexable(structured_buffer, stride=16)(mixed,mixed,mixed,mixed) r0.y, t111.yxzw
Buffer<float4> buf_resource;
StructuredBuffer<struct foo> struct_buf;
void main(){
    uint udim, numStructs, stride;
    buf_resource.GetDimensions(udim);
    struct_buf.GetDimensions(numStructs, stride);
}

Multiple variables can share one register and be written to in one instruction

float g1;
float3 g2;
struct Input {
    float f1 : TEXCOORD0;
    float3 f2 : TEXCOORD1;
};
float4 main(Input input) : SV_TARGET {
    float4 result = 0;
    result += g1 + input.f1;
    result += float4(g2.xyz, 0) + float4(input.f2.xyz, 0);

    return result;
}
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_constantbuffer CB0[1], immediateIndexed
dcl_input_ps linear v0.x
dcl_input_ps linear v0.yzw
dcl_output o0.xyzw
dcl_temps 2
mov r0.xyz, v0.yzwy
add r0.xyz, r0.xyzx, cb0[0].yzwy
add r1.x, v0.x, cb0[0].x
mov r0.w, l(0)
add o0.xyzw, r0.xyzw, r1.xxxx
ret 

Interfaces and Classes

  • Methods can have multiple bodies
  • Assembly does not specify parameters
interface iBaseLight
{
   float3 IlluminateAmbient(float3 vNormal);
   float3 IlluminateDiffuse(float3 vNormal);
};
class cAmbientLight : iBaseLight
{
   float3            m_vLightColor;     
   bool     m_bEnable;
   float3 IlluminateAmbient(float3 vNormal) {
	   return vNormal + 2;
   }
   float3 IlluminateDiffuse(float3 vNormal) {
	   return vNormal + 3;
   }
};
class dAmbientLight : iBaseLight
{
   float3            m_vLightColor;     
   bool     m_bEnable;
   float3 IlluminateAmbient(float3 vNormal) {
	   return vNormal + 4;
   }
   float3 IlluminateDiffuse(float3 vNormal) {
	   return vNormal + 5;
   }
};
struct PS_INPUT
{
    float4 vPosition : SV_POSITION;
    float3 vNormal   : NORMAL;
    float2 vTexcoord : TEXCOORD0;
};
iBaseLight     g_abstractAmbientLighting;
iBaseLight     g_abstractDirectLighting;
float4 main( PS_INPUT Input ) : SV_TARGET {
	float4 result = (float4)0;
	float3 temp = (float3)0.0f;       
    temp += g_abstractAmbientLighting.IlluminateAmbient( Input.vNormal );
	temp += g_abstractAmbientLighting.IlluminateAmbient( Input.vNormal );
	temp += g_abstractAmbientLighting.IlluminateDiffuse( Input.vNormal );

	result += float4(temp, 0);
	return result;
}
//
// Generated by Microsoft (R) HLSL Shader Compiler 10.1
//
//
// Buffer Definitions: 
//
// interfaces $ThisPointer
// {
//
//   interface iBaseLight g_abstractAmbientLighting;// Offset:    0 Size:     1
//   interface iBaseLight g_abstractDirectLighting;// Offset:  N/A Size:   N/A [unused]
//
// }
//
//
//
// Input signature:
//
// Name                 Index   Mask Register SysValue  Format   Used
// -------------------- ----- ------ -------- -------- ------- ------
// SV_POSITION              0   xyzw        0      POS   float       
// NORMAL                   0   xyz         1     NONE   float   xyz 
// TEXCOORD                 0   xy          2     NONE   float       
//
//
// Output signature:
//
// Name                 Index   Mask Register SysValue  Format   Used
// -------------------- ----- ------ -------- -------- ------- ------
// SV_TARGET                0   xyzw        0   TARGET   float   xyzw
//
//
// Available Class Types:
//
// Name                             ID CB Stride Texture Sampler
// ------------------------------ ---- --------- ------- -------
// dAmbientLight                     0         0       0       0
// cAmbientLight                     1         0       0       0
//
// Interface slots, 1 total:
//
//             Slots
// +----------+---------+---------------------------------------
// | Type ID  |   0     |0    1    
// | Table ID |         |0    1    
// +----------+---------+---------------------------------------
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_function_body fb0
dcl_function_body fb1
dcl_function_body fb2
dcl_function_body fb3
dcl_function_body fb4
dcl_function_body fb5
dcl_function_table ft0 = {fb0, fb2, fb4}
dcl_function_table ft1 = {fb1, fb3, fb5}
dcl_interface fp0[1][3] = {ft0, ft1}
dcl_input_ps linear v1.xyz
dcl_output o0.xyzw
dcl_temps 2
fcall fp0[0][0]
fcall fp0[0][1]
add r0.xyz, r0.xyzx, r1.xyzx
fcall fp0[0][2]
add o0.xyz, r0.xyzx, r1.xyzx
mov o0.w, l(0)
ret 
label fb0
add r0.xyz, v1.xyzx, l(4.000000, 4.000000, 4.000000, 0.000000)
ret 
label fb1
add r0.xyz, v1.xyzx, l(2.000000, 2.000000, 2.000000, 0.000000)
ret 
label fb2
add r1.xyz, v1.xyzx, l(4.000000, 4.000000, 4.000000, 0.000000)
ret 
label fb3
add r1.xyz, v1.xyzx, l(2.000000, 2.000000, 2.000000, 0.000000)
ret 
label fb4
add r1.xyz, v1.xyzx, l(5.000000, 5.000000, 5.000000, 0.000000)
ret 
label fb5
add r1.xyz, v1.xyzx, l(3.000000, 3.000000, 3.000000, 0.000000)
ret 
// Approximately 19 instruction slots used

Instruction Lifting

Matrix operations

Matrix operations are converted to vector operations

VS_OUTPUT VSMain(VS_INPUT Input)
{
	VS_OUTPUT Output;
	Output.vPosition = mul(Input.vPosition, g_mWorldViewProjection);
	return Output;
}
vs_5_0
dcl_globalFlags refactoringAllowed
dcl_constantbuffer CB0[4], immediateIndexed
dcl_input v0.xyzw
dcl_output_siv o0.xyzw, position
dp4 o0.x, v0.xyzw, cb0[0].xyzw
dp4 o0.y, v0.xyzw, cb0[1].xyzw
dp4 o0.z, v0.xyzw, cb0[2].xyzw
dp4 o0.w, v0.xyzw, cb0[3].xyzw
ret 

Many operations can be merged into a single high level function

For example

normalize(value)

is converted to three instructions

dp4 r0.x, cb0[0].xyzw, cb0[0].xyzw
rsq r0.x, r0.x
mul o0.xyzw, r0.xxxx, cb0[0].xyzw

Instructions include operand modifiers

The syntax for instructions is generally

add[_sat] dest[.mask], [-]src0[_abs][.swizzle], [-]src1[_abs][.swizzle]

which allows for modifiers such as saturate, absolute value and negation that should be handled somehow.

UBFE, IBFE and BFI

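UBFE is defined as follows; IBFE has the same structure but uses ishr (arithmetic shift) in place of ushr, and BFI has its own separate bitfield-insert definition: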

Given width, offset:
if( width == 0 )
{
    dest = 0
}
else if( width + offset < 32 )
{
    shl dest, src2, 32-(width+offset)
    ushr dest, dest, 32-width
}
else
{
	ushr dest, src2, offset
}

UAddc & USubb

UAddc and USubb store output in two destination registers, one for the main output and one for the result of carry/borrow.

UDiv

UDiv stores output in two destination registers, one for the quotient and one for the remainder

IMul & UMul

IMul and UMul store the output in two destination registers, one for the high 32 bits and one for the low 32 bits.

SwapC needs to create temps

swapc dest0[.mask] dest1[.mask] src0[.swizzle] src1[.swizzle] src2[.swizzle]

; expands to:

movc temp[dest0's mask] src0[.swizzle] src2[.swizzle] src1[.swizzle]
movc dest1[.mask] src0[.swizzle] src1[.swizzle] src2[.swizzle]
mov  dest0.mask, temp

SinCos

Sin and Cos are implemented as a single instruction that has two destination registers

return sin(x) + cos(x);
sincos r0.xyzw, r1.xyzw, cb0[0].xyzw
add o0.xyzw, r0.xyzw, r1.xyzw

Loops

Convert loops into something more easily readable

float g1;
float4 main() : SV_TARGET {
    float4 result = 10;
    for(int i = 0; i < 3; i++){
        result /= g1;
    }
    while(true){
        if(result.x > 10) break;
        result /= g1;
    }
    return result;
}
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_constantbuffer CB0[1], immediateIndexed
dcl_output o0.xyzw
dcl_temps 3
mov r0.xyzw, l(10.000000,10.000000,10.000000,10.000000)
mov r1.x, l(0)
loop 
  ige r1.y, r1.x, l(3)
  breakc_nz r1.y
  div r0.xyzw, r0.xyzw, cb0[0].xxxx
  iadd r1.x, r1.x, l(1)
endloop 
mov r1.xyzw, r0.xyzw
loop 
  lt r2.x, l(10.000000), r1.x
  if_nz r2.x
    break 
  endif 
  div r1.xyzw, r1.xyzw, cb0[0].xxxx
endloop 
mov o0.xyzw, r1.xyzw
ret 

Geometry Shaders can merge multiple output signatures into a single register based on semantic

struct InVertex
{
    float3 pos          : POSITION;
    float3 scale         : NORMAL;
};
struct OutVertex1
{
    float3 pos          : OUT_POSITION;
};

struct OutVertex2
{
    int3 pos          : OUT_POSITION;
};

[maxvertexcount(18)]
void main( 
    line InVertex verts[2], 
    inout PointStream<OutVertex1> myStream1, 
    inout PointStream<OutVertex2> myStream2 )
{
	OutVertex1 myVert1;
	OutVertex2 myVert2;
    myVert1.pos = verts[0].pos * verts[0].scale;
    myVert2.pos = verts[1].pos * verts[1].scale;
    myStream1.Append( myVert1 );
    myStream2.Append( myVert2 );
}
//
// Generated by Microsoft (R) HLSL Shader Compiler 10.1
//
//
//
// Input signature:
//
// Name                 Index   Mask Register SysValue  Format   Used
// -------------------- ----- ------ -------- -------- ------- ------
// POSITION                 0   xyz         0     NONE   float   xyz 
// NORMAL                   0   xyz         1     NONE   float   xyz 
//
//
// Output signature:
//
// Name                 Index   Mask Register SysValue  Format   Used
// -------------------- ----- ------ -------- -------- ------- ------
// m0:OUT_POSITION          0   xyz         0     NONE   float   xyz 
// m1:OUT_POSITION          0   xyz         0     NONE     int   xyz 
//
gs_5_0
dcl_globalFlags refactoringAllowed
dcl_input v[2][0].xyz
dcl_input v[2][1].xyz
dcl_temps 1
dcl_inputprimitive line 
dcl_stream m0
dcl_outputtopology pointlist 
dcl_output o0.xyz
dcl_stream m1
dcl_outputtopology pointlist 
dcl_output o0.xyz
dcl_maxout 18
mul r0.xyz, v[0][0].xyzx, v[0][1].xyzx
mov o0.xyz, r0.xyzx
emit_stream m0
mul r0.xyz, v[1][0].xyzx, v[1][1].xyzx
ftoi r0.xyz, r0.xyzx
mov o0.xyz, r0.xyzx
emit_stream m1
ret 
// Approximately 8 instruction slots used

Emit and cut operate on implicit registers

emit causes all declared o# registers to be read out of the Geometry Shader to generate a vertex.

This is only valid if streams have not been declared.

Hull Shaders

Hull shaders are made up of 2 phases, a control point phase and a fork/join phase. The control point phase is optional

hs_5_0
hs_decls
; dcls start
hs_control_point_phase
; dcls end
; function body here
ret
hs_fork_phase
dcl_hs_fork_phase_instance_count 4
; dcls here
; function body here
ret
hs_fork_phase
dcl_hs_fork_phase_instance_count 2
; dcls here
; function body here
ret

The original patch constant function cannot be recovered, and the instructions of each fork and join phase need to be wrapped in a loop that iterates the number of times given by the corresponding phase instance count.

Method signatures need to be derived from a mixture of declaration tokens and input/output chunks

Geometry Shader

[maxvertexcount(3)]
void main(
	triangle float4 ipos[3] : SV_Position,
	inout TriangleStream<PSSceneIn> OutputStream,
	uint id : SV_GSInstanceID,
	uint prim : SV_PrimitiveID
	)
{
	PSSceneIn o = (PSSceneIn)0;
	OutputStream.Append(o);
}
dcl_globalFlags refactoringAllowed
dcl_input_siv v[3][0].xyzw, position
dcl_input vGSInstanceID
dcl_input vPrim
dcl_temps 1
dcl_inputprimitive triangle
dcl_stream m0
dcl_outputtopology trianglestrip
dcl_output_siv o0.xyzw, position
dcl_output o1.xyzw
dcl_output_siv o2.x, clip_distance
dcl_output_siv o2.y, cull_distance
dcl_output_sgv o3.x, is_front_face
dcl_output_siv o3.y, rendertarget_array_index
dcl_output_siv o3.z, viewport_array_index
dcl_maxout 3

Hull Shader

HS_CONSTANT_DATA_OUTPUT SubDToBezierConstantsHS(
		InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip,
		uint PatchID : SV_PrimitiveID )
{
	HS_CONSTANT_DATA_OUTPUT Output;		
	return Output;
}
[domain("quad")] 
[partitioning("integer")] 
[outputtopology("triangle_cw")] 
[outputcontrolpoints(16)] 
[patchconstantfunc("SubDToBezierConstantsHS")] 
BEZIER_CONTROL_POINT main(  
		InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> ip,  
		uint i : SV_OutputControlPointID, 
		uint PatchID : SV_PrimitiveID ) 
{ 
	BEZIER_CONTROL_POINT Output; 
	return Output; 
}
hs_5_0
hs_decls 
dcl_input_control_point_count 32
dcl_output_control_point_count 16
dcl_tessellator_domain domain_quad
dcl_tessellator_partitioning partitioning_integer
dcl_tessellator_output_primitive output_triangle_cw
dcl_globalFlags refactoringAllowed
hs_control_point_phase 
dcl_input vPrim
dcl_output o0.x
ret 
hs_fork_phase 
dcl_hs_fork_phase_instance_count 4
dcl_input vForkInstanceID
dcl_output_siv o0.x, finalQuadUeq0EdgeTessFactor
dcl_output_siv o1.x, finalQuadVeq0EdgeTessFactor
dcl_output_siv o2.x, finalQuadUeq1EdgeTessFactor
dcl_output_siv o3.x, finalQuadVeq1EdgeTessFactor
dcl_temps 1
dcl_indexrange o0.x 4
ret 
hs_fork_phase 
dcl_hs_fork_phase_instance_count 2
dcl_input vForkInstanceID
dcl_output_siv o4.x, finalQuadUInsideTessFactor
dcl_output_siv o5.x, finalQuadVInsideTessFactor
dcl_temps 1
dcl_indexrange o4.x 2
ret 

Domain Shader

[domain("quad")]
DS_OUTPUT main( HS_CONSTANT_DATA_OUTPUT input, 
		float2 UV : SV_DomainLocation,
		const OutputPatch<BEZIER_CONTROL_POINT, 16> bezpatch )
{
	DS_OUTPUT Output = (DS_OUTPUT)0;
	return Output;    
}
dcl_input_control_point_count 16
dcl_tessellator_domain domain_quad
dcl_globalFlags refactoringAllowed
dcl_output o0.xyz
dcl_output o1.xy
dcl_output o2.xyz
dcl_output o3.xyz
dcl_output_siv o4.xyzw, position

Compute Shader

RWByteAddressBuffer buf : register(u0);
[numthreads(4, 2, 1)]
void main(
	uint3 tid : SV_DispatchThreadID,
	uint3 gid : SV_GroupID,
	uint gi : SV_GroupIndex,
	uint3 gtid: SV_GroupThreadID
	)
{
	buf.InterlockedCompareStore(0, 1, (uint)(tid + gid + gi + gtid));
}
cs_5_0
dcl_globalFlags refactoringAllowed
dcl_uav_raw u0
dcl_input vThreadIDInGroupFlattened
dcl_input vThreadGroupID.x
dcl_input vThreadIDInGroup.x
dcl_input vThreadID.x
dcl_temps 1
dcl_thread_group 4, 2, 1

Pixel Shader

void main(
	nointerpolation float texcoord0 : TEXCOORD0,
	linear float texcoord1 : TEXCOORD1,
	linear centroid float texcoord2 : TEXCOORD2,
	linear noperspective float texcoord3 : TEXCOORD3,
	linear noperspective centroid float texcoord4 : TEXCOORD4,
	linear sample float texcoord5 : TEXCOORD5,
	linear noperspective sample float texcoord6 : TEXCOORD6,

	linear sample float4 ipos : SV_Position,
	out float4 target0 : SV_Target0,
	uint coverage_in : SV_Coverage,
	out uint coverage_out : SV_Coverage,
	float clip : SV_ClipDistance0,
	uint vi : SV_ViewportArrayIndex,
	out float depth_out : SV_Depth,
	uint rt : SV_RenderTargetArrayIndex,
	uint prim : SV_PrimitiveID,
	uint si : SV_SampleIndex
	)
{}
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_input_ps constant v0.x
dcl_input_ps_siv constant v0.y, viewport_array_index
dcl_input_ps_siv constant v0.z, rendertarget_array_index
dcl_input_ps linear v1.x
dcl_input_ps linear centroid v2.x
dcl_input_ps linear noperspective v3.x
dcl_input_ps linear noperspective centroid v4.x
dcl_input_ps linear sample v5.x
dcl_input_ps linear noperspective sample v6.x
dcl_input_ps_siv linear noperspective sample v7.xyzw, position
dcl_input_ps_siv linear v8.x, clip_distance
dcl_input_ps_sgv constant v9.x, sampleIndex
dcl_input vCoverage
dcl_output o0.xyzw
dcl_output oMask
dcl_output oDepth
dcl_temps 2

Vertex Shader

void main(
	float4 ipos : SV_Position,
	uint id : SV_VertexID,
	out float4 opos : SV_Position,
	out float clip0 : SV_ClipDistance0,
	out float cull0 : SV_CullDistance0,
	out min16float v0 : TEXCOORD0,
	out min10float v1 : TEXCOORD1,
	out min16int v2 : TEXCOORD2,
	out min12int v3 : TEXCOORD3,
	out min16uint v4 : TEXCOORD4
	)
{ }
vs_5_0
dcl_globalFlags refactoringAllowed | enableMinimumPrecision
dcl_input v0.xyzw
dcl_input_sgv v1.x, vertex_id
dcl_output_siv o0.xyzw, position
dcl_output_siv o1.x, clip_distance
dcl_output_siv o1.y, cull_distance
dcl_temps 1

SM6 operations are scalarized

float4 PSMain(VS_OUTPUT Input) : SV_TARGET
{
	float4 vDiffuse = g_txDiffuse.Sample(g_samLinear, Input.vTexcoord);
	float fLighting = saturate(dot(g_vLightDir, Input.vNormal));
	fLighting = max(fLighting, g_fAmbient);
	return vDiffuse * fLighting;
}
define void @PSMain() {
  %1 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 0, i32 0, i32 0, i1 false)  ; CreateHandle(resourceClass,rangeId,index,nonUniformIndex)
  %2 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 3, i32 0, i32 0, i1 false)  ; CreateHandle(resourceClass,rangeId,index,nonUniformIndex)
  %3 = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 1, i1 false)  ; CreateHandle(resourceClass,rangeId,index,nonUniformIndex)
  %4 = call float @dx.op.loadInput.f32(i32 4, i32 1, i32 0, i8 0, i32 undef)  ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
  %5 = call float @dx.op.loadInput.f32(i32 4, i32 1, i32 0, i8 1, i32 undef)  ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
  %6 = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 0, i32 undef)  ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
  %7 = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 1, i32 undef)  ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
  %8 = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 2, i32 undef)  ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
  %9 = call %dx.types.ResRet.f32 @dx.op.sample.f32(i32 60, %dx.types.Handle %1, %dx.types.Handle %2, float %4, float %5, float undef, float undef, i32 0, i32 0, i32 undef, float undef)  ; Sample(srv,sampler,coord0,coord1,coord2,coord3,offset0,offset1,offset2,clamp)
  %10 = extractvalue %dx.types.ResRet.f32 %9, 0
  %11 = extractvalue %dx.types.ResRet.f32 %9, 1
  %12 = extractvalue %dx.types.ResRet.f32 %9, 2
  %13 = extractvalue %dx.types.ResRet.f32 %9, 3
  %14 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %3, i32 0)  ; CBufferLoadLegacy(handle,regIndex)
  %15 = extractvalue %dx.types.CBufRet.f32 %14, 0
  %16 = extractvalue %dx.types.CBufRet.f32 %14, 1
  %17 = extractvalue %dx.types.CBufRet.f32 %14, 2
  %18 = call float @dx.op.dot3.f32(i32 55, float %15, float %16, float %17, float %6, float %7, float %8)  ; Dot3(ax,ay,az,bx,by,bz)
  %19 = call float @dx.op.unary.f32(i32 7, float %18)  ; Saturate(value)
  %20 = extractvalue %dx.types.CBufRet.f32 %14, 3
  %21 = call float @dx.op.binary.f32(i32 35, float %19, float %20)  ; FMax(a,b)
  %22 = fmul fast float %21, %10
  %23 = fmul fast float %21, %11
  %24 = fmul fast float %21, %12
  %25 = fmul fast float %21, %13
  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float %22)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float %23)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 2, float %24)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 3, float %25)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  ret void
}

SM6 operates on resource handles rather than resource registers

The difference is most prominent with the ResourceDescriptorHeap syntax

uint ID;
float main(uint i:I): SV_Target {
  Buffer<float> buf = ResourceDescriptorHeap[ID];
  return buf[i];
}
define void @main() {
  %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 2 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
  %2 = call i32 @dx.op.loadInput.i32(i32 4, i32 0, i32 0, i8 0, i32 undef)  ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
  %3 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 13, i32 4 })  ; AnnotateHandle(res,props)  resource: CBuffer
  %4 = call %dx.types.CBufRet.i32 @dx.op.cbufferLoadLegacy.i32(i32 59, %dx.types.Handle %3, i32 0)  ; CBufferLoadLegacy(handle,regIndex)
  %5 = extractvalue %dx.types.CBufRet.i32 %4, 0
  %6 = call %dx.types.Handle @dx.op.createHandleFromHeap(i32 218, i32 %5, i1 false, i1 false)  ; CreateHandleFromHeap(index,samplerHeap,nonUniformIndex)
  %7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %6, %dx.types.ResourceProperties { i32 10, i32 265 })  ; AnnotateHandle(res,props)  resource: TypedBuffer<F32>
  %8 = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle %7, i32 %2, i32 undef)  ; BufferLoad(srv,index,wot)
  %9 = extractvalue %dx.types.ResRet.f32 %8, 0
  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float %9)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  ret void
}

SM6 supports function calls in library shaders

cbuffer A {
  float a;
}

void StoreOutputMat(float2x2  m, uint gidx);
float2x2 LoadInputMat(uint x, uint y);
float2x2 RotateMat(float2x2 m, uint x, uint y);

[numthreads(8,8,1)]
void entry( uint2 tid : SV_DispatchThreadID, uint2 gid : SV_GroupID, uint2 gtid : SV_GroupThreadID, uint gidx : SV_GroupIndex )
{
    float2x2 f2x2 = LoadInputMat(gid.x, gid.y);
    f2x2 = RotateMat(f2x2, tid.x, tid.y) + a;
    StoreOutputMat(f2x2, gidx);
}
define void @entry() {
  %1 = load %A, %A* @A, align 4
  %2 = call %dx.types.Handle @dx.op.createHandleForLib.A(i32 160, %A %1)  ; CreateHandleForLib(Resource)
  %3 = call i32 @dx.op.threadId.i32(i32 93, i32 0)  ; ThreadId(component)
  %4 = call i32 @dx.op.threadId.i32(i32 93, i32 1)  ; ThreadId(component)
  %5 = call i32 @dx.op.groupId.i32(i32 94, i32 0)  ; GroupId(component)
  %6 = call i32 @dx.op.groupId.i32(i32 94, i32 1)  ; GroupId(component)
  %7 = call i32 @dx.op.flattenedThreadIdInGroup.i32(i32 96)  ; FlattenedThreadIdInGroup()
  %8 = alloca <4 x float>, align 8
  %9 = alloca <4 x float>, align 8
  %10 = alloca <4 x float>, align 8
  %11 = alloca <4 x float>, align 8
  %12 = call %class.matrix.float.2.2 @"\01?LoadInputMat@@YA?AV?$matrix@M$01$01@@II@Z"(i32 %5, i32 %6) #3
  %13 = bitcast <4 x float>* %11 to %class.matrix.float.2.2*
  store %class.matrix.float.2.2 %12, %class.matrix.float.2.2* %13, align 8
  %14 = load <4 x float>, <4 x float>* %11, align 8
  %15 = bitcast <4 x float>* %10 to %class.matrix.float.2.2*
  store <4 x float> %14, <4 x float>* %10, align 8
  %16 = load %class.matrix.float.2.2, %class.matrix.float.2.2* %15, align 8
  %17 = call %class.matrix.float.2.2 @"\01?RotateMat@@YA?AV?$matrix@M$01$01@@V1@II@Z"(%class.matrix.float.2.2 %16, i32 %3, i32 %4) #3
  %18 = bitcast <4 x float>* %9 to %class.matrix.float.2.2*
  store %class.matrix.float.2.2 %17, %class.matrix.float.2.2* %18, align 8
  %19 = load <4 x float>, <4 x float>* %9, align 8
  %20 = extractelement <4 x float> %19, i32 0
  %21 = extractelement <4 x float> %19, i32 1
  %22 = extractelement <4 x float> %19, i32 2
  %23 = extractelement <4 x float> %19, i32 3
  %24 = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %2, i32 0)  ; CBufferLoadLegacy(handle,regIndex)
  %25 = extractvalue %dx.types.CBufRet.f32 %24, 0
  %26 = fadd fast float %25, %20
  %27 = fadd fast float %25, %21
  %28 = fadd fast float %25, %22
  %29 = fadd fast float %25, %23
  %30 = insertelement <4 x float> undef, float %26, i32 0
  %31 = insertelement <4 x float> %30, float %27, i32 1
  %32 = insertelement <4 x float> %31, float %28, i32 2
  %33 = insertelement <4 x float> %32, float %29, i32 3
  %34 = bitcast <4 x float>* %8 to %class.matrix.float.2.2*
  store <4 x float> %33, <4 x float>* %8, align 8
  %35 = load %class.matrix.float.2.2, %class.matrix.float.2.2* %34, align 8
  call void @"\01?StoreOutputMat@@YAXV?$matrix@M$01$01@@I@Z"(%class.matrix.float.2.2 %35, i32 %7) #3
  ret void
}

declare %class.matrix.float.2.2 @"\01?LoadInputMat@@YA?AV?$matrix@M$01$01@@II@Z"(i32, i32) #0

declare %class.matrix.float.2.2 @"\01?RotateMat@@YA?AV?$matrix@M$01$01@@V1@II@Z"(%class.matrix.float.2.2, i32, i32) #0

declare void @"\01?StoreOutputMat@@YAXV?$matrix@M$01$01@@I@Z"(%class.matrix.float.2.2, i32) #0

SM6 only allows for unstructured control flow

SM6 only allows for jumps (br statements) and switch statements. Loops and if statements are converted to these primitives.

float main(float2 a : A, int3 b : B) : SV_Target
{
  float s = 0;
  [loop]
  for(int i = 0; i < b.x; i++) {
    s += a.x;
  }

  return s;
}
define void @main() {
  %1 = call i32 @dx.op.loadInput.i32(i32 4, i32 1, i32 0, i8 0, i32 undef)  ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
  %2 = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 0, i32 undef)  ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
  %3 = icmp sgt i32 %1, 0
  br i1 %3, label %4, label %12

; <label>:4                                       ; preds = %0
  br label %5

; <label>:5                                       ; preds = %5, %4
  %6 = phi float [ %8, %5 ], [ 0.000000e+00, %4 ]
  %7 = phi i32 [ %9, %5 ], [ 0, %4 ]
  %8 = fadd fast float %6, %2
  %9 = add nuw nsw i32 %7, 1
  %10 = icmp eq i32 %9, %1
  br i1 %10, label %11, label %5, !llvm.loop !14

; <label>:11                                      ; preds = %5
  br label %12

; <label>:12                                      ; preds = %11, %0
  %13 = phi float [ 0.000000e+00, %0 ], [ %8, %11 ]
  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float %13)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
  ret void
}