@@ -2782,7 +2782,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
         assert(src->IsIntegralConst(0));
         assert(willUseSimdMov);
 #ifdef TARGET_AMD64
-        assert(size % 16 == 0);
+        assert(size >= XMM_REGSIZE_BYTES);
 #else
         assert(size % 8 == 0);
 #endif
@@ -2797,24 +2797,33 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
     {
         regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT);
 
+        unsigned regSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
+                               ? YMM_REGSIZE_BYTES
+                               : XMM_REGSIZE_BYTES;
+
         if (src->gtSkipReloadOrCopy()->IsIntegralConst(0))
         {
             // If the source is constant 0 then always use xorps, it's faster
             // than copying the constant from a GPR to a XMM register.
-            emit->emitIns_R_R(INS_xorps, EA_16BYTE, srcXmmReg, srcXmmReg);
+            emit->emitIns_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg);
         }
         else
         {
             emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false);
             emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg);
+
+            if (regSize == YMM_REGSIZE_BYTES)
+            {
+                // Extend the bytes in the lower lanes to the upper lanes
+                emit->emitIns_R_R_R_I(INS_vinsertf128, EA_32BYTE, srcXmmReg, srcXmmReg, srcXmmReg, 1);
+            }
 #ifdef TARGET_X86
             // For x86, we need one more to convert it from 8 bytes to 16 bytes.
             emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg);
 #endif
         }
 
         instruction simdMov = simdUnalignedMovIns();
-        unsigned regSize = XMM_REGSIZE_BYTES;
         unsigned bytesWritten = 0;
 
         while (bytesWritten < size)
@@ -2828,8 +2837,21 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 #endif
             if (bytesWritten + regSize > size)
             {
+#ifdef TARGET_AMD64
+                if (size - bytesWritten <= XMM_REGSIZE_BYTES)
+                {
+                    regSize = XMM_REGSIZE_BYTES;
+                }
+
+                // Shift dstOffset back to use full SIMD move
+                unsigned shiftBack = regSize - (size - bytesWritten);
+                assert(shiftBack <= regSize);
+                bytesWritten -= shiftBack;
+                dstOffset -= shiftBack;
+#else
                 assert(srcIntReg != REG_NA);
                 break;
+#endif
             }
 
             if (dstLclNum != BAD_VAR_NUM)
@@ -2849,14 +2871,51 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
         size -= bytesWritten;
     }
 
-    // Fill the remainder using normal stores.
+    // Fill the remainder using normal stores.
+#ifdef TARGET_AMD64
+    unsigned regSize = REGSIZE_BYTES;
+
+    while (regSize > size)
+    {
+        regSize /= 2;
+    }
+
+    for (; size > regSize; size -= regSize, dstOffset += regSize)
+    {
+        if (dstLclNum != BAD_VAR_NUM)
+        {
+            emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
+        }
+        else
+        {
+            emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
+                                dstAddrIndexScale, dstOffset);
+        }
+    }
+
+    if (size > 0)
+    {
+        unsigned shiftBack = regSize - size;
+        assert(shiftBack <= regSize);
+        dstOffset -= shiftBack;
+
+        if (dstLclNum != BAD_VAR_NUM)
+        {
+            emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
+        }
+        else
+        {
+            emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
+                                dstAddrIndexScale, dstOffset);
+        }
+    }
+#else // TARGET_X86
     for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, dstOffset += regSize)
     {
         while (regSize > size)
         {
             regSize /= 2;
         }
-
         if (dstLclNum != BAD_VAR_NUM)
         {
             emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
@@ -2867,6 +2926,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
                                 dstAddrIndexScale, dstOffset);
         }
     }
+#endif
 }
 
 #ifdef TARGET_AMD64
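
For reference, a minimal standalone C++ sketch of the overlapping "shift back" store trick that the new TARGET_AMD64 InitBlk path above emits: fill with full-width SIMD stores, then move the final store back so it ends exactly at the end of the buffer instead of falling back to scalar moves. This is illustrative only, not JIT code; it assumes SSE2 and size >= 16, and the helper name ZeroFillUnrolled is made up for this example.

// Illustrative sketch (not from the runtime): zero-fill `size` bytes (size >= 16)
// with unaligned 16-byte stores, handling the tail with one overlapping store.
#include <cstddef>
#include <emmintrin.h> // SSE2

static void ZeroFillUnrolled(unsigned char* dst, size_t size)
{
    const __m128i zero   = _mm_setzero_si128(); // analogous to the xorps-zeroed XMM register
    size_t        offset = 0;

    // Full 16-byte stores while a whole register still fits.
    while (offset + 16 <= size)
    {
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + offset), zero);
        offset += 16;
    }

    // Tail: shift the last store back so it covers [size - 16, size),
    // overlapping bytes that were already written.
    if (offset < size)
    {
        size_t shiftBack = 16 - (size - offset);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + offset - shiftBack), zero);
    }
}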
@@ -3017,8 +3077,13 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
         regNumber tempReg = node->GetSingleTempReg(RBM_ALLFLOAT);
 
         instruction simdMov = simdUnalignedMovIns();
-        for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize;
-             size -= regSize, srcOffset += regSize, dstOffset += regSize)
+
+        // Get the largest SIMD register available if the size is large enough
+        unsigned regSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
+                               ? YMM_REGSIZE_BYTES
+                               : XMM_REGSIZE_BYTES;
+
+        for (; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize)
         {
             if (srcLclNum != BAD_VAR_NUM)
             {
@@ -3041,15 +3106,109 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
             }
         }
 
-        // TODO-CQ-XArch: On x86 we could copy 8 byte at once by using MOVQ instead of four 4 byte MOV stores.
-        // On x64 it may also be worth copying a 4/8 byte remainder using MOVD/MOVQ, that avoids the need to
-        // allocate a GPR just for the remainder.
+        if (size > 0)
+        {
+            if (size <= XMM_REGSIZE_BYTES)
+            {
+                regSize = XMM_REGSIZE_BYTES;
+            }
+
+            // Copy the remainder by moving the last regSize bytes of the buffer
+            unsigned shiftBack = regSize - size;
+            assert(shiftBack <= regSize);
+
+            srcOffset -= shiftBack;
+            dstOffset -= shiftBack;
+
+            if (srcLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
+            }
+            else
+            {
+                emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
+                                    srcAddrIndexScale, srcOffset);
+            }
+
+            if (dstLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
+            }
+            else
+            {
+                emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
+                                    dstAddrIndexScale, dstOffset);
+            }
+        }
+
+        return;
     }
 
+    // Fill the remainder with normal loads/stores
     if (size > 0)
     {
         regNumber tempReg = node->GetSingleTempReg(RBM_ALLINT);
 
+#ifdef TARGET_AMD64
+        unsigned regSize = REGSIZE_BYTES;
+
+        while (regSize > size)
+        {
+            regSize /= 2;
+        }
+
+        for (; size > regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize)
+        {
+            if (srcLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
+            }
+            else
+            {
+                emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
+                                    srcAddrIndexScale, srcOffset);
+            }
+
+            if (dstLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
+            }
+            else
+            {
+                emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
+                                    dstAddrIndexScale, dstOffset);
+            }
+        }
+
+        if (size > 0)
+        {
+            unsigned shiftBack = regSize - size;
+            assert(shiftBack <= regSize);
+
+            srcOffset -= shiftBack;
+            dstOffset -= shiftBack;
+
+            if (srcLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
+            }
+            else
+            {
+                emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
+                                    srcAddrIndexScale, srcOffset);
+            }
+
+            if (dstLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
+            }
+            else
+            {
+                emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
+                                    dstAddrIndexScale, dstOffset);
+            }
+        }
+#else // TARGET_X86
         for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, srcOffset += regSize, dstOffset += regSize)
         {
             while (regSize > size)
@@ -3077,6 +3236,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
                                     dstAddrIndexScale, dstOffset);
             }
         }
+#endif
     }
 }
 
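The scalar remainder handling added to both functions follows the same pattern once the SIMD loop is done: halve the chunk size until it fits, copy whole chunks, then let the final chunk overlap so it ends exactly at the end of the buffer. Below is a standalone C++ sketch of that strategy, again purely illustrative rather than JIT code; the helper name CopyTailUnrolled is made up, and std::memcpy stands in for the emitted GPR mov instructions.

// Illustrative sketch (not from the runtime): copy `size` bytes using the
// "largest power-of-two chunk plus overlapping tail" strategy.
#include <cstddef>
#include <cstdint>
#include <cstring>

static void CopyTailUnrolled(unsigned char* dst, const unsigned char* src, size_t size)
{
    if (size == 0)
    {
        return;
    }

    // Halve the chunk size (8 -> 4 -> 2 -> 1) until it fits,
    // mirroring "while (regSize > size) regSize /= 2;".
    size_t chunk = sizeof(uint64_t);
    while (chunk > size)
    {
        chunk /= 2;
    }

    size_t offset = 0;

    // Whole chunks, stopping while at least one byte remains for the tail.
    for (; size - offset > chunk; offset += chunk)
    {
        std::memcpy(dst + offset, src + offset, chunk);
    }

    // Final chunk: shift back so it ends exactly at `size`, overlapping the previous copy.
    size_t shiftBack = chunk - (size - offset);
    offset -= shiftBack;
    std::memcpy(dst + offset, src + offset, chunk);
}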