Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 11 additions & 16 deletions src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,6 @@
#include <cmath>
#include <limits>

// Disabling SIMD for now to do scalar math verification.
// FIXME: Re-enable SIMD and verify SIMD math is correct.
#undef OCIO_USE_SSE2
#undef OCIO_USE_AVX

#if OCIO_USE_SSE2
#include "SSE2.h"
#endif

#if OCIO_USE_AVX
#include "AVX.h"
#endif

namespace OCIO_NAMESPACE
{

Expand Down Expand Up @@ -265,6 +252,7 @@ f3 JMh_to_RGB(const f3 &JMh, const JMhParams &p)
// Tonescale / Chroma compress
//

/*
//TODO: move to header
// https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026
// https://gcc.godbolt.org/#compilers:!((compiler:g6,options:%27-xc+-O3+-Wall+-fverbose-asm+-march%3Dhaswell+-mno-avx%27,sourcez:MQSwdgxgNgrgJgUwAQB4QFt3gC4CdwB0AFgHwBQoksiqAztnDseWQPStLZEi1I9JQQAa2QBZAIa5x2AOS0ANEgBGMbJwDuCcSLicA9kgh70ABxBRk2A0oTZsCXIb2ICbDtHFgA5khhgAZnq42H7SCFAAnkiIECCIvFa%2BtMjoegBu4ia0rLREMP4muuD0Wrp6/kipaURQWWT%2BUHrSSES0MOgA%2BlkdtMkAjAAUHR3ofQBMABxIaQCUSADeSEvLK6tr62vs00gAvEgA2kgAIkgAwkgAPkgAQkgAgkgAumQrw6OTSLn5y3sjnV/%2BBoILq0AZpRTgpAdUSiDoAZQAEgBVABiKIAMgBRAZjRQAZkUAAZFH0ZjMANxLLaHc4nK63B7PV4jcZTNroXhLX6YDriOBwEFgxQAimbDjs3h7Q5HADU51OMrpNxlDzuMtuTOWAI2XKhPKqCBqgoBwvatFFqy2B2W5yWStl8sVTypHAsai4yCMpnMDiQ4jSejifsq6WUUVwCBgtHAPgBLy1Zp1uz1nT5At6Awlwry/gp8aWEZCuDArx5EDS2F6HX8eLGmbNeYAvmR6o1mq12iCesk8UMWR9Zgt81D%2B2yc8m/iN0oa4DATILZpT1lslLgmnAIOJ6Ehwgh0AgwJWkAS%2BvokLjCcO3qzPonuan%2BQvs/k8yttet71O0oaoMac6aOVFLZuC8IgWnEKAKgAWhIAQ9HUcDIOHCUNk/NNu3rDln1zclh0LGBiw2Sdy0rWhq1rTDzVw5sWy2Wh0AgixHCMGhowALwQRQVDUKMYEYqJaEaTRHD8CxeicRAoPY5B%2BHEJATDXJQLHQVsmjUDtOm6Ig0z7d4pkHeYr1HW8OQnHltMfLIhWmV8E1Mj8Uw6CyBSsrMJVsgtbAIkt1mIisqxrOt3OoltQH8RAKmGO4ADUAA1hlU9t2TGABWAA2Lt/QAD101K0psodmT06YhN1P48o6Td6CyCqslZMEPJHYrqhAUCzPQCqECyvBxAgbB/FZJ8kFJSlgNasDWSM5qb3a3lLNBNIhIhECiAtZYtjTTgiGQUrJpWfDCI0rtegQXs0lZNakw2sAijAQQwEsbbPh7aYHGjPQwEUdRuAgMD%2BD0ExsAwCCkECRwYti4ctgGMADHULd7Fu%2BhcBgPqQA%2BhQ/Ruv0oCgJByi2vc/QjQmkAAFluLAwAwdoZjIZtgAPRh/BbNx2HZrY4D0GAlIQMguZ5iwWnZLp02SOtr0mXQ0jgOZFiTBWtjSZNDluK5GSmm8/AisqeW1hB/HABAXNBS6FY2TmPpkNR1CCXAIi%2B5APsiJJkDTOEjkUB7v0cKxUe4bxSa8SQlHELxkCUEAj3UKOwM8JAUQABT9flNY%2BAFsFMT8DSNKz9f8RRiIRwoF1li0tg9En%2BFhkNqlqOA04mXR30/KrsG6QpMxzTOTEag6fL8ysBX8NKyaGHl0NoOAwTgbCyRC2iOY5vhD1rMgcGFzsEDMWtuwQCXRxAJAsrligQHCg2RwhhKitZI/uFH3XOj8ExeqEbgOm3kBR4GLLFBPpcOoth4iggDBwnhdCwzAFBeIeBUZA2/PcOKAhbC8CjJ8f0bswz2CQDHLg3M1APWNjGYMVQKDhGSI3e%2B38yZPx6DmIEn8d51j/lCGE8JkRoixAMPoRJ8SKDGPPchN1z5UJMo/NCj4v4UQfmTf%2BjVJYTGobWOhAIgSNCYSAPoaVMKj0LuwxEqIMTYl4UgYkx4BHzxWFsOE8MTCk1KtgW2O5lIHkrGI9kKjJECmkUFdoeiWggFrH3LyhFB7RkGtGCinjBGAOsRwOEcJMRjFrg3eJ/dHJdR6n1TRMioBEjNj
YpJcidzfhLPgmAbUo5OG9GJM8VQkb2D5PjCoclBD2CkHjEwWTcC6FwLWeQWVMDyEvDRDeR1fHdh7D0ISPQGK4xYggXSd9j4zEMisLKs1nK5JYfI3CGytnoV8b/PZeFQklnCVoyYPQgksKbKzReKdMiI1BmudASAJBSFkLwCGLQ7AmAAFzsHoG/acuAGjwQIF6VgABHGACB6DozANkPotY%2BgAE4%2BhkzSqwIg8EoJWCku0KCHV0pQTxfgNiH1sD8VYKisYDKyZ9EJMAeljLmWJXUiLboDEvnLI%2BCfQqywjDIrUIokyABFPiug9hZX2cKjGYrjKNCOHxPGex2RSr5PKl0gTVUgz2AMJAUFFAmuPniJAMpj4AHZ/4pKtVlfKdMVgiu3OK7g%2Br1WORzr%2BVy7QtWz0ldKxqdF2ieuTEas1ZqsqngdRah1KVLU2v/oSJNWV7XH1oQ6p1w5XVKuKuycN3jBQqrVYoD1aqQ2ugMIa41pr63H1TQ6jNWUs3HxzS6xVTUbyNGTIWtVOqkBjQjXW0d0bY3mrTYmh11qkDOoVaK7tHxuCzTURYY0Yay0mU9USGNVaTIjqjQ2rKTbj4TvTWm%2BNmap1pvyjOuduau3ivZLNSeoJGjlpAOXDg6EsjRANlob5hgoCeC8HIT4DChYAyBlgDiuAzlFguWWfyZFAqYXuUAA%3D%3D)),filterAsm:(binary:!t,commentOnly:!t,directives:!t,intel:!t,labels:!t),version:3
Expand All @@ -279,6 +267,7 @@ inline float hsum_ps_sse1(__m128 v) { // v = [
return _mm_cvtss_f32(sums); // A+B+C+D
}
#endif

#ifdef OCIO_USE_AVX
inline float hsum256_ps_avx(__m256 v) { // v = [ H G | F E | D C | B A ]
__m128 vlow = _mm256_castps256_ps128(v); // vlow = [ D C | B A ]
Expand All @@ -287,7 +276,8 @@ inline float hsum256_ps_avx(__m256 v) { // v = [ H G | F E |
return hsum_ps_sse1(v128);
}
#endif

#endif
*/

float chroma_compress_norm(float cos_hr1, float sin_hr1, float chroma_compress_scale)
{
Expand All @@ -302,9 +292,11 @@ float chroma_compress_norm(float cos_hr1, float sin_hr1, float chroma_compress_s
sin_hr1, sin_hr2, sin_hr3, 1.0f
};
alignas(AVX_ALIGNMENT) static constexpr float weights[8] = { // TODO: investigate reordering of the entries so we are summing equal magnitude values first?
11.34072f, 16.46899f, 14.66441f, 0.0f,
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on https://draftdocs.acescentral.com/output-transforms/technical-details/chroma-compression/#normalization, the GPU side seems to have the correct weights, so I updated the CPU side to match. Please verify.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this looks like an editing mistake on my part.

4.66441f, -6.37224f, 9.19364f, 77.12896f
11.34072f, 16.46899f, 7.88380f, 0.0f,
14.66441f, -6.37224f, 9.19364f, 77.12896f
};

/*
// TODO: benchmark this across multiple platforms to justify the multiple code paths.
#if OCIO_USE_SSE2
#if OCIO_USE_AVX
Expand All @@ -324,14 +316,17 @@ float chroma_compress_norm(float cos_hr1, float sin_hr1, float chroma_compress_s
const float M = hsum_ps_sse1(t3);
#endif
#else
*/
const float M = weights[0] * trig_angles_hr[0] +
weights[1] * trig_angles_hr[1] +
weights[2] * trig_angles_hr[2] +
weights[4] * trig_angles_hr[4] +
weights[5] * trig_angles_hr[5] +
weights[6] * trig_angles_hr[6] +
weights[7];
/*
#endif
*/

return M * chroma_compress_scale; // TODO: is it worth prescaling the above weights?
}
Expand Down
8 changes: 5 additions & 3 deletions src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,7 @@ std::string _Add_Tonescale_func(
}

ss.newLine() << ss.floatDecl("J_ts") << " = " << ACES2::J_scale << " * pow((F_L_Y / ( " << ACES2::cam_nl_offset << " + F_L_Y)) * " << p.inv_A_w_J << ", " << p.cz << ");";
// TODO: copysign is missing here. /coz
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see

ss.newLine() << "return J_ts;";

ss.dedent();
Expand All @@ -682,14 +683,15 @@ void _Add_ChromaCompressionNorm_Shader(
ss.newLine() << "{";
ss.indent();

ss.newLine() << ss.floatDecl("cos_hr2") << " = cos_hr * cos_hr - sin_hr * sin_hr;";
// TODO: optimization: can bake weights into terms and convert dotprods to addition. /coz
ss.newLine() << ss.floatDecl("cos_hr2") << " = 2.0 * cos_hr * cos_hr - 1.0;";
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be slightly better in terms of robustness and speed, I guess.

ss.newLine() << ss.floatDecl("sin_hr2") << " = 2.0 * cos_hr * sin_hr;";
ss.newLine() << ss.floatDecl("cos_hr3") << " = 4.0 * cos_hr * cos_hr * cos_hr - 3.0 * cos_hr;";
ss.newLine() << ss.floatDecl("sin_hr3") << " = 3.0 * sin_hr - 4.0 * sin_hr * sin_hr * sin_hr;";
ss.newLine() << ss.float3Decl("cosines") << " = " << ss.float3Const("cos_hr", "cos_hr2", "cos_hr3") <<";";
ss.newLine() << ss.float3Decl("cosine_weights") << " = " << ss.float3Const(11.34072 * c.chroma_compress_scale,
16.46899 * c.chroma_compress_scale,
7.88380 * c.chroma_compress_scale) <<";";
7.88380 * c.chroma_compress_scale) <<";";
ss.newLine() << ss.float3Decl("sines") << " = " << ss.float3Const("sin_hr", "sin_hr2", "sin_hr3") <<";";
ss.newLine() << ss.float3Decl("sine_weights") << " = " << ss.float3Const(14.66441 * c.chroma_compress_scale,
-6.37224 * c.chroma_compress_scale,
Expand Down Expand Up @@ -1396,7 +1398,7 @@ void Add_ACES_OutputTransform_Inv_Shader(
ss.newLine() << "";
ss.newLine() << "// Add ToneScale and ChromaCompress (inv)";
ss.newLine() << "";
ss.newLine() << ss.floatDecl("J") << " = " << tonescaleName_Inv << "(" << pxl << ".b);";
ss.newLine() << ss.floatDecl("J") << " = " << tonescaleName_Inv << "(" << pxl << ".r);";
ss.newLine() << "{";
ss.indent();
_Add_Tonescale_Compress_Inv_Shader(shaderCreator, ss, resourceIndex, s, c);
Expand Down
124 changes: 62 additions & 62 deletions tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -467,41 +467,41 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_output_transform_20)

const float expected_32f[num_samples*4] = {
// ACEScg primaries and secondaries scaled by 4
4.966040611f, -0.032990534f, 0.041587759f, 1.0f,
3.969455719f, 3.825795889f, -0.056159109f, 1.0f,
-0.075445443f, 3.689064741f, 0.270243138f, 1.0f,
-0.095422804f, 3.650515079f, 3.459970713f, 1.0f,
-0.029242843f, 0.196083903f, 2.797096968f, 1.0f,
4.900825977f, -0.064415932f, 3.838272572f, 1.0f,
4.966013432f, -0.033002287f, 0.041583523f, 1.0f,
3.969460726f, 3.825797558f, -0.056160748f, 1.0f,
-0.075460039f, 3.689072609f, 0.270235062f, 1.0f,
-0.095436633f, 3.650521517f, 3.459975719f, 1.0f,
-0.028881177f, 0.196473420f, 2.796123743f, 1.0f,
4.900828362f, -0.064385533f, 3.838270903f, 1.0f,
// OCIO test values
0.096831776f, -0.001114858f, 0.018976377f, 0.5f,
0.811647296f, 0.478211939f, 0.816507518f, 1.0f,
0.110244252f, 0.919241786f, 0.726084292f, 0.0f,
0.096890487f, -0.001135427f, 0.018971475f, 0.5f,
0.809613585f, 0.479857147f, 0.814239979f, 1.0f,
0.107417941f, 0.920530438f, 0.726379037f, 0.0f,
// ColorChecker24 (SMPTE 2065-1 2021)
0.115581684f, 0.050785132f, 0.030158322f, 1.0f,
0.482630610f, 0.301559567f, 0.228200614f, 1.0f,
0.097509719f, 0.160682827f, 0.278755993f, 1.0f,
0.071118668f, 0.107350536f, 0.035066456f, 1.0f,
0.206827119f, 0.198065758f, 0.376981646f, 1.0f,
0.197157621f, 0.480333209f, 0.393290222f, 1.0f,
0.570664287f, 0.197219044f, 0.042163782f, 1.0f,
0.045591675f, 0.069720201f, 0.292005479f, 1.0f,
0.425108939f, 0.083108872f, 0.102091998f, 1.0f,
0.059560396f, 0.022268835f, 0.091132581f, 1.0f,
0.360384226f, 0.478674322f, 0.086890966f, 1.0f,
0.691989481f, 0.372686356f, 0.070826821f, 1.0f,
0.012042155f, 0.021904279f, 0.198501319f, 1.0f,
0.076645926f, 0.256147027f, 0.060666814f, 1.0f,
0.300039411f, 0.023424838f, 0.030365985f, 1.0f,
0.803476214f, 0.596933603f, 0.085341305f, 1.0f,
0.388712883f, 0.079724148f, 0.245922253f, 1.0f,
0.011061139f, 0.196086824f, 0.307065904f, 1.0f,
0.921007156f, 0.921683431f, 0.912948132f, 1.0f,
0.590166390f, 0.588430583f, 0.587841213f, 1.0f,
0.337742388f, 0.337684810f, 0.338159621f, 1.0f,
0.169266224f, 0.169178173f, 0.169558540f, 1.0f,
0.058399219f, 0.059382606f, 0.060239695f, 1.0f,
0.012618840f, 0.012950940f, 0.013591323f, 1.0f,
0.115475342f, 0.050812997f, 0.030212998f, 1.0f,
0.484880149f, 0.301042914f, 0.226769030f, 1.0f,
0.098463453f, 0.160814837f, 0.277010798f, 1.0f,
0.071130276f, 0.107334509f, 0.035097614f, 1.0f,
0.207111374f, 0.198474824f, 0.375326097f, 1.0f,
0.195447117f, 0.481112540f, 0.393299103f, 1.0f,
0.571913302f, 0.196873263f, 0.041634843f, 1.0f,
0.045791976f, 0.069875412f, 0.291233569f, 1.0f,
0.424848884f, 0.083199054f, 0.102153927f, 1.0f,
0.059589352f, 0.022219239f, 0.091246955f, 1.0f,
0.360364884f, 0.478741497f, 0.086726815f, 1.0f,
0.695661962f, 0.371994466f, 0.068298057f, 1.0f,
0.011806240f, 0.021665439f, 0.199594870f, 1.0f,
0.076526135f, 0.256237596f, 0.060564563f, 1.0f,
0.300064713f, 0.023416281f, 0.030360531f, 1.0f,
0.805483222f, 0.596904039f, 0.082996234f, 1.0f,
0.388385385f, 0.079899333f, 0.245818958f, 1.0f,
0.010951802f, 0.196106046f, 0.307181537f, 1.0f,
0.921020269f, 0.921707630f, 0.912857533f, 1.0f,
0.590191603f, 0.588424563f, 0.587825298f, 1.0f,
0.337743223f, 0.337686002f, 0.338155240f, 1.0f,
0.169266403f, 0.169178575f, 0.169557154f, 1.0f,
0.058346011f, 0.059387885f, 0.060296256f, 1.0f,
0.012581199f, 0.012947144f, 0.013654212f, 1.0f,
// Spectrally non-selective 18 % reflecting diffuser
0.145115077f, 0.145115703f, 0.145115480f, 1.0f,
// Perfect reflecting diffuser
Expand All @@ -524,7 +524,7 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_output_transform_20)
__LINE__);

#if DUMP_RESULT
std::cout << "Results: \n" << std::setprecision(9) << std::fixed;
std::cout << "aces_output_transform_20 results: \n" << std::setprecision(9) << std::fixed;
for (unsigned i = 0; i < num_samples; ++i)
{
std::cout << input2_32f[i * 4 + 0] << "f, "
Expand Down Expand Up @@ -794,35 +794,35 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_tonescale_compress_20)

const float expected_32f[num_samples*4] = {
// ACEScg primaries and secondaries scaled by 4
110.702453613f, 211.242279053f, 25.025110245f, 1.0f,
168.016815186f, 129.795593262f, 106.183448792f, 1.0f,
140.814849854f, 193.450653076f, 147.056488037f, 1.0f,
156.429504395f, 110.935348511f, 192.204727173f, 1.0f,
80.456558228f, 98.743263245f, 268.442108154f, 1.0f,
135.172225952f, 175.572814941f, 341.715240479f, 1.0f,
110.702453613f, 211.251770020f, 25.025110245f, 1.0f,
168.016815186f, 129.796249390f, 106.183448792f, 1.0f,
140.814849854f, 193.459197998f, 147.056488037f, 1.0f,
156.429504395f, 110.938423157f, 192.204727173f, 1.0f,
80.456558228f, 98.490531921f, 268.442108154f, 1.0f,
135.172225952f, 175.559326172f, 341.715240479f, 1.0f,
// OCIO test values
18.187316895f, 33.767055511f, 4.173158169f, 0.5f,
80.413101196f, 21.547714233f, 332.159759521f, 1.0f,
83.447883606f, 37.597621918f, 182.925750732f, 0.0f,
18.187316895f, 33.819190979f, 4.173158169f, 0.5f,
80.413101196f, 21.309329987f, 332.159759521f, 1.0f,
83.447883606f, 37.852523804f, 182.925750732f, 0.0f,
// ColorChecker24 (SMPTE 2065-1 2021)
27.411968231f, 13.410449982f, 38.146659851f, 1.0f,
59.987659454f, 14.175936699f, 39.841842651f, 1.0f,
43.298923492f, 12.367712021f, 249.107116699f, 1.0f,
31.489654541f, 14.086299896f, 128.878036499f, 1.0f,
50.749198914f, 12.862657547f, 285.658966064f, 1.0f,
64.728637695f, 18.433788300f, 179.324264526f, 1.0f,
53.399444580f, 37.239288330f, 50.924011230f, 1.0f,
34.719596863f, 21.685737610f, 271.008331299f, 1.0f,
43.910709381f, 36.826980591f, 13.975610733f, 1.0f,
23.196529388f, 15.087531090f, 317.544281006f, 1.0f,
63.348682404f, 33.255519867f, 119.145133972f, 1.0f,
64.908874512f, 34.922687531f, 70.842193604f, 1.0f,
24.876913071f, 23.019479752f, 273.228973389f, 1.0f,
44.203376770f, 28.884298325f, 144.154159546f, 1.0f,
32.824359894f, 43.442367554f, 17.892261505f, 1.0f,
75.830871582f, 39.538505554f, 90.752044678f, 1.0f,
45.823120117f, 34.710170746f, 348.832092285f, 1.0f,
43.597236633f, 23.048465729f, 218.454376221f, 1.0f,
27.411968231f, 13.382784843f, 38.146659851f, 1.0f,
59.987659454f, 14.391894341f, 39.841842651f, 1.0f,
43.298923492f, 12.199877739f, 249.107116699f, 1.0f,
31.489654541f, 14.075141907f, 128.878036499f, 1.0f,
50.749198914f, 12.731806755f, 285.658966064f, 1.0f,
64.728637695f, 18.593791962f, 179.324264526f, 1.0f,
53.399444580f, 37.394416809f, 50.924011230f, 1.0f,
34.719596863f, 21.616765976f, 271.008331299f, 1.0f,
43.910709381f, 36.788166046f, 13.975610733f, 1.0f,
23.196529388f, 15.118354797f, 317.544281006f, 1.0f,
63.348682404f, 33.283519745f, 119.145133972f, 1.0f,
64.908874512f, 35.371063232f, 70.842193604f, 1.0f,
24.876913071f, 23.143159866f, 273.228973389f, 1.0f,
44.203376770f, 28.918329239f, 144.154159546f, 1.0f,
32.824359894f, 43.447853088f, 17.892261505f, 1.0f,
75.830871582f, 39.872489929f, 90.752044678f, 1.0f,
45.823120117f, 34.652057648f, 348.832092285f, 1.0f,
43.597236633f, 23.079071045f, 218.454376221f, 1.0f,
};

OCIO::FixedFunctionOpData::Params params = {1000.f};
Expand All @@ -836,7 +836,7 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_tonescale_compress_20)
__LINE__);

#if DUMP_RESULTS
std::cout << "Results: \n" << std::setprecision(9) << std::fixed;
std::cout << "aces_tonescale_compress_20 results: \n" << std::setprecision(9) << std::fixed;
for (unsigned i = 0; i < num_samples; ++i)
{
std::cout << input2_32f[i * 4 + 0] << "f, "
Expand Down
Loading