diff --git a/lib/astc-encoder/.gitrepo b/lib/astc-encoder/.gitrepo index 370aac4a1d..568c3dd6bc 100644 --- a/lib/astc-encoder/.gitrepo +++ b/lib/astc-encoder/.gitrepo @@ -6,7 +6,7 @@ [subrepo] remote = https://github.com/ARM-software/astc-encoder.git branch = main - commit = e7cb1e453968b0e16e48ef6d68fc9d1227d8a378 - parent = b87158d371723ba9670f5dc13e50a3e2a2799eba + commit = 42a8f6ee01715f45edffb6773e34b8bb914a47df + parent = 2b1c072a13e9b69ada8457665b51ed5b59ddb408 method = merge cmdver = 0.4.3 diff --git a/lib/astc-encoder/CMakeLists.txt b/lib/astc-encoder/CMakeLists.txt index ad608f1a38..761098054f 100644 --- a/lib/astc-encoder/CMakeLists.txt +++ b/lib/astc-encoder/CMakeLists.txt @@ -24,7 +24,7 @@ if(MSVC) add_compile_options("/wd4324") # Disable structure was padded due to alignment specifier endif() -project(astcencoder VERSION 3.2.0) +project(astcencoder VERSION 3.3.0) set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -215,7 +215,7 @@ if(PACKAGE) set(PKG_VER ${CMAKE_PROJECT_VERSION_MAJOR}.${CMAKE_PROJECT_VERSION_MINOR}) - set(CPACK_PACKAGE_FILE_NAME "astcenc-${PKG_VER}-${PKG_OS}${PACKAGE}") + set(CPACK_PACKAGE_FILE_NAME "astcenc-${PKG_VER}-${PKG_OS}-${PACKAGE}") set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY FALSE) set(CPACK_PACKAGE_CHECKSUM SHA256) set(CPACK_GENERATOR ZIP) diff --git a/lib/astc-encoder/README.md b/lib/astc-encoder/README.md index b1044f4618..f4f58db2c1 100644 --- a/lib/astc-encoder/README.md +++ b/lib/astc-encoder/README.md @@ -58,16 +58,12 @@ from 0.89 bits/pixel up to 8 bits/pixel. Release build binaries for the `astcenc` stable releases are provided in the [GitHub Releases page][3]. -**Latest 3.x stable release:** 3.2 +**Latest 3.x stable release:** 3.3 * Change log: [3.x series](./Docs/ChangeLog-3x.md) **Latest 2.x stable release:** 2.5 * Change log: [2.x series](./Docs/ChangeLog-2x.md) -**Latest development release:** 3.3-develop -* Change log: [3.x series](./Docs/ChangeLog-3x.md) -* Roadmap: [Ideas ...](./Docs/Roadmap.md) - Binaries are provided for 64-bit builds on Windows, macOS, and Linux. The builds of the astcenc are provided as multiple binaries, each tuned for a specific SIMD instruction set. diff --git a/lib/astc-encoder/Source/astcenc.h b/lib/astc-encoder/Source/astcenc.h index 3da8929930..f98fa7c68f 100644 --- a/lib/astc-encoder/Source/astcenc.h +++ b/lib/astc-encoder/Source/astcenc.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2020-2021 Arm Limited +// Copyright 2020-2022 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -405,36 +405,6 @@ struct astcenc_config /** @brief The ASTC block size Z dimension. */ unsigned int block_z; - /** @brief The size of the texel kernel for error weighting (-v). */ - unsigned int v_rgba_radius; - - /** @brief The mean and stdev component mix for error weighting (-v). */ - float v_rgba_mean_stdev_mix; - - /** @brief The texel RGB power for error weighting (-v). */ - float v_rgb_power; - - /** @brief The texel RGB base weight for error weighting (-v). */ - float v_rgb_base; - - /** @brief The texel RGB mean weight for error weighting (-v). */ - float v_rgb_mean; - - /** @brief The texel RGB stdev for error weighting (-v). */ - float v_rgb_stdev; - - /** @brief The texel A power for error weighting (-va). */ - float v_a_power; - - /** @brief The texel A base weight for error weighting (-va). */ - float v_a_base; - - /** @brief The texel A mean weight for error weighting (-va). */ - float v_a_mean; - - /** @brief The texel A stdev for error weighting (-va). */ - float v_a_stdev; - /** @brief The red component weight scale for error weighting (-cw). */ float cw_r_weight; @@ -456,13 +426,6 @@ struct astcenc_config */ unsigned int a_scale_radius; - /** - * @brief The additional weight for block edge texels (-b). - * - * This is generic tool for reducing artefacts visible on block changes. - */ - float b_deblock_weight; - /** @brief The RGBM scale factor for the shared multiplier (-rgbm). */ float rgbm_m_scale; diff --git a/lib/astc-encoder/Source/astcenc_averages_and_directions.cpp b/lib/astc-encoder/Source/astcenc_averages_and_directions.cpp index 3dac01e831..3002928d1c 100644 --- a/lib/astc-encoder/Source/astcenc_averages_and_directions.cpp +++ b/lib/astc-encoder/Source/astcenc_averages_and_directions.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2021 Arm Limited +// Copyright 2011-2022 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -28,45 +28,30 @@ void compute_avgs_and_dirs_4_comp( const partition_info& pi, const image_block& blk, - const error_weight_block& ewb, partition_metrics pm[BLOCK_MAX_PARTITIONS] ) { - // TODO: Candidate for 4-group counting + float texel_weight = hadd_s(blk.channel_weight) / 4.0f; + int partition_count = pi.partition_count; promise(partition_count > 0); for (int partition = 0; partition < partition_count; partition++) { const uint8_t *texel_indexes = pi.texels_of_partition[partition]; - - vfloat4 error_sum = vfloat4::zero(); - vfloat4 base_sum = vfloat4::zero(); - float partition_weight = 0.0f; - unsigned int texel_count = pi.partition_texel_count[partition]; promise(texel_count > 0); + // TODO: Try gathers? + vfloat4 base_sum = vfloat4::zero(); + for (unsigned int i = 0; i < texel_count; i++) { int iwt = texel_indexes[i]; - float weight = ewb.texel_weight[iwt]; - vfloat4 texel_datum = blk.texel(iwt); - vfloat4 error_weight = ewb.error_weights[iwt]; - - partition_weight += weight; - base_sum += texel_datum * weight; - error_sum += error_weight; + base_sum += blk.texel(iwt); } - error_sum = error_sum / static_cast(texel_count); - vfloat4 csf = normalize(sqrt(error_sum)) * 2.0f; - - vfloat4 average = base_sum * (1.0f / astc::max(partition_weight, 1e-7f)); - - pm[partition].error_weight = error_sum; - pm[partition].avg = average * csf; - pm[partition].color_scale = csf; - pm[partition].icolor_scale = 1.0f / max(csf, 1e-7f); + vfloat4 average = base_sum / static_cast(texel_count); + pm[partition].avg = average; vfloat4 sum_xp = vfloat4::zero(); vfloat4 sum_yp = vfloat4::zero(); @@ -76,9 +61,8 @@ void compute_avgs_and_dirs_4_comp( for (unsigned int i = 0; i < texel_count; i++) { unsigned int iwt = texel_indexes[i]; - float weight = ewb.texel_weight[iwt]; vfloat4 texel_datum = blk.texel(iwt); - texel_datum = (texel_datum - average) * weight; + texel_datum = (texel_datum - average) * texel_weight; vfloat4 zero = vfloat4::zero(); @@ -128,50 +112,35 @@ void compute_avgs_and_dirs_4_comp( void compute_avgs_and_dirs_3_comp( const partition_info& pi, const image_block& blk, - const error_weight_block& ewb, unsigned int omitted_component, partition_metrics pm[BLOCK_MAX_PARTITIONS] ) { - // TODO: Candidate for 4-group counting - const float *texel_weights = ewb.texel_weight_rgb; + float texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()) / 3.0f; const float* data_vr = blk.data_r; const float* data_vg = blk.data_g; const float* data_vb = blk.data_b; - const float* error_vr = ewb.texel_weight_r; - const float* error_vg = ewb.texel_weight_g; - const float* error_vb = ewb.texel_weight_b; - if (omitted_component == 0) { - texel_weights = ewb.texel_weight_gba; + texel_weight = hadd_s(blk.channel_weight.swz<1, 2, 3>()) / 3.0f; data_vr = blk.data_g; data_vg = blk.data_b; data_vb = blk.data_a; - - error_vr = ewb.texel_weight_g; - error_vg = ewb.texel_weight_b; - error_vb = ewb.texel_weight_a; } else if (omitted_component == 1) { - texel_weights = ewb.texel_weight_rba; + texel_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>()) / 3.0f; data_vg = blk.data_b; data_vb = blk.data_a; - - error_vg = ewb.texel_weight_b; - error_vb = ewb.texel_weight_a; } else if (omitted_component == 2) { - texel_weights = ewb.texel_weight_rga; + texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>()) / 3.0f; data_vb = blk.data_a; - - error_vb = ewb.texel_weight_a; } unsigned int partition_count = pi.partition_count; @@ -180,43 +149,18 @@ void compute_avgs_and_dirs_3_comp( for (unsigned int partition = 0; partition < partition_count; partition++) { const uint8_t *texel_indexes = pi.texels_of_partition[partition]; - - vfloat4 error_sum = vfloat4::zero(); - vfloat4 base_sum = vfloat4::zero(); - float partition_weight = 0.0f; - unsigned int texel_count = pi.partition_texel_count[partition]; promise(texel_count > 0); + vfloat4 base_sum = vfloat4::zero(); for (unsigned int i = 0; i < texel_count; i++) { unsigned int iwt = texel_indexes[i]; - float weight = texel_weights[iwt]; - - vfloat4 texel_datum(data_vr[iwt], - data_vg[iwt], - data_vb[iwt], - 0.0f); - - vfloat4 error_weight(error_vr[iwt], - error_vg[iwt], - error_vb[iwt], - 0.0f); - - partition_weight += weight; - base_sum += texel_datum * weight; - error_sum += error_weight; + base_sum += vfloat3(data_vr[iwt], data_vg[iwt], data_vb[iwt]); } - error_sum = error_sum / static_cast(texel_count); - vfloat4 csf = normalize(sqrt(error_sum)) * 1.73205080f; - - vfloat4 average = base_sum * (1.0f / astc::max(partition_weight, 1e-7f)); - - pm[partition].error_weight = error_sum; - pm[partition].avg = average * csf; - pm[partition].color_scale = csf; - pm[partition].icolor_scale = 1.0f / max(csf, 1e-7f); + vfloat4 average = base_sum / static_cast(texel_count); + pm[partition].avg = average; vfloat4 sum_xp = vfloat4::zero(); vfloat4 sum_yp = vfloat4::zero(); @@ -225,13 +169,12 @@ void compute_avgs_and_dirs_3_comp( for (unsigned int i = 0; i < texel_count; i++) { unsigned int iwt = texel_indexes[i]; - float weight = texel_weights[iwt]; vfloat4 texel_datum = vfloat3(data_vr[iwt], data_vg[iwt], data_vb[iwt]); - texel_datum = (texel_datum - average) * weight; + texel_datum = (texel_datum - average) * texel_weight; vfloat4 zero = vfloat4::zero(); @@ -271,50 +214,28 @@ void compute_avgs_and_dirs_3_comp( void compute_avgs_and_dirs_3_comp_rgb( const partition_info& pi, const image_block& blk, - const error_weight_block& ewb, partition_metrics pm[BLOCK_MAX_PARTITIONS] ) { - // TODO: Candidate for 4-group counting + float texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()) / 3; + unsigned int partition_count = pi.partition_count; promise(partition_count > 0); for (unsigned int partition = 0; partition < partition_count; partition++) { const uint8_t *texel_indexes = pi.texels_of_partition[partition]; - - vfloat4 error_sum = vfloat4::zero(); - vfloat4 base_sum = vfloat4::zero(); - float partition_weight = 0.0f; - unsigned int texel_count = pi.partition_texel_count[partition]; promise(texel_count > 0); + vfloat4 base_sum = vfloat4::zero(); for (unsigned int i = 0; i < texel_count; i++) { unsigned int iwt = texel_indexes[i]; - float weight = ewb.texel_weight_rgb[iwt]; - - vfloat4 texel_datum = blk.texel3(iwt); - - vfloat4 error_weight(ewb.texel_weight_r[iwt], - ewb.texel_weight_g[iwt], - ewb.texel_weight_b[iwt], - 0.0f); - - partition_weight += weight; - base_sum += texel_datum * weight; - error_sum += error_weight; + base_sum += blk.texel3(iwt); } - error_sum = error_sum / static_cast(texel_count); - vfloat4 csf = normalize(sqrt(error_sum)) * 1.73205080f; - - vfloat4 average = base_sum * (1.0f / astc::max(partition_weight, 1e-7f)); - - pm[partition].error_weight = error_sum; - pm[partition].avg = average * csf; - pm[partition].color_scale = csf; - pm[partition].icolor_scale = 1.0f / max(csf, 1e-7f); + vfloat4 average = base_sum / static_cast(texel_count); + pm[partition].avg = average; vfloat4 sum_xp = vfloat4::zero(); vfloat4 sum_yp = vfloat4::zero(); @@ -323,11 +244,10 @@ void compute_avgs_and_dirs_3_comp_rgb( for (unsigned int i = 0; i < texel_count; i++) { unsigned int iwt = texel_indexes[i]; - float weight = ewb.texel_weight_rgb[iwt]; vfloat4 texel_datum = blk.texel3(iwt); - texel_datum = (texel_datum - average) * weight; + texel_datum = (texel_datum - average) * texel_weight; vfloat4 zero = vfloat4::zero(); @@ -367,49 +287,37 @@ void compute_avgs_and_dirs_3_comp_rgb( void compute_avgs_and_dirs_2_comp( const partition_info& pt, const image_block& blk, - const error_weight_block& ewb, unsigned int component1, unsigned int component2, partition_metrics pm[BLOCK_MAX_PARTITIONS] ) { - const float *texel_weights; + float texel_weight; const float* data_vr = nullptr; const float* data_vg = nullptr; - const float* error_vr = nullptr; - const float* error_vg = nullptr; - if (component1 == 0 && component2 == 1) { - texel_weights = ewb.texel_weight_rg; + texel_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f; data_vr = blk.data_r; data_vg = blk.data_g; - - error_vr = ewb.texel_weight_r; - error_vg = ewb.texel_weight_g; } else if (component1 == 0 && component2 == 2) { - texel_weights = ewb.texel_weight_rb; + texel_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f; data_vr = blk.data_r; data_vg = blk.data_b; - - error_vr = ewb.texel_weight_r; - error_vg = ewb.texel_weight_b; } else // (component1 == 1 && component2 == 2) { assert(component1 == 1 && component2 == 2); - texel_weights = ewb.texel_weight_gb; + + texel_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f; data_vr = blk.data_g; data_vg = blk.data_b; - - error_vr = ewb.texel_weight_g; - error_vg = ewb.texel_weight_b; } unsigned int partition_count = pt.partition_count; @@ -418,36 +326,18 @@ void compute_avgs_and_dirs_2_comp( for (unsigned int partition = 0; partition < partition_count; partition++) { const uint8_t *texel_indexes = pt.texels_of_partition[partition]; - - vfloat4 error_sum = vfloat4::zero(); - vfloat4 base_sum = vfloat4::zero(); - float partition_weight = 0.0f; - unsigned int texel_count = pt.partition_texel_count[partition]; promise(texel_count > 0); + vfloat4 base_sum = vfloat4::zero(); for (unsigned int i = 0; i < texel_count; i++) { unsigned int iwt = texel_indexes[i]; - float weight = texel_weights[iwt]; - vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]) * weight; - - vfloat4 error_weight = vfloat2(error_vr[iwt], error_vg[iwt]); - - partition_weight += weight; - base_sum += texel_datum; - error_sum += error_weight; + base_sum += vfloat2(data_vr[iwt], data_vg[iwt]); } - error_sum = error_sum / static_cast(texel_count); - vfloat4 csf = normalize(sqrt(error_sum)) * 1.41421356f; - vfloat4 average = base_sum * (1.0f / astc::max(partition_weight, 1e-7f)); - - - pm[partition].error_weight = error_sum; - pm[partition].avg = average * csf; - pm[partition].color_scale = csf; - pm[partition].icolor_scale = 1.0f / max(csf, 1e-7f); + vfloat4 average = base_sum / static_cast(texel_count); + pm[partition].avg = average; vfloat4 sum_xp = vfloat4::zero(); vfloat4 sum_yp = vfloat4::zero(); @@ -455,9 +345,8 @@ void compute_avgs_and_dirs_2_comp( for (unsigned int i = 0; i < texel_count; i++) { unsigned int iwt = texel_indexes[i]; - float weight = texel_weights[iwt]; vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]); - texel_datum = (texel_datum - average) * weight; + texel_datum = (texel_datum - average) * texel_weight; vfloat4 zero = vfloat4::zero(); @@ -487,7 +376,6 @@ void compute_avgs_and_dirs_2_comp( void compute_error_squared_rgba( const partition_info& pi, const image_block& blk, - const error_weight_block& ewb, const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS], const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS], float uncor_lengths[BLOCK_MAX_PARTITIONS], @@ -528,11 +416,6 @@ void compute_error_squared_rgba( vfloat l_uncor_amod2(l_uncor.amod.lane<2>()); vfloat l_uncor_amod3(l_uncor.amod.lane<3>()); - vfloat l_uncor_bis0(l_uncor.bis.lane<0>()); - vfloat l_uncor_bis1(l_uncor.bis.lane<1>()); - vfloat l_uncor_bis2(l_uncor.bis.lane<2>()); - vfloat l_uncor_bis3(l_uncor.bis.lane<3>()); - vfloat l_samec_bs0(l_samec.bs.lane<0>()); vfloat l_samec_bs1(l_samec.bs.lane<1>()); vfloat l_samec_bs2(l_samec.bs.lane<2>()); @@ -540,11 +423,6 @@ void compute_error_squared_rgba( assert(all(l_samec.amod == vfloat4(0.0f))); - vfloat l_samec_bis0(l_samec.bis.lane<0>()); - vfloat l_samec_bis1(l_samec.bis.lane<1>()); - vfloat l_samec_bis2(l_samec.bis.lane<2>()); - vfloat l_samec_bis3(l_samec.bis.lane<3>()); - vfloat uncor_loparamv(1e10f); vfloat uncor_hiparamv(-1e10f); vfloat4 uncor_errorsumv = vfloat4::zero(); @@ -553,6 +431,11 @@ void compute_error_squared_rgba( vfloat samec_hiparamv(-1e10f); vfloat4 samec_errorsumv = vfloat4::zero(); + vfloat ew_r(blk.channel_weight.lane<0>()); + vfloat ew_g(blk.channel_weight.lane<1>()); + vfloat ew_b(blk.channel_weight.lane<2>()); + vfloat ew_a(blk.channel_weight.lane<3>()); + // This implementation over-shoots, but this is safe as we initialize the texel_indexes // array to extend the last value. This means min/max are not impacted, but we need to mask // out the dummy values when we compute the line weighting. @@ -567,11 +450,6 @@ void compute_error_squared_rgba( vfloat data_b = gatherf(blk.data_b, texel_idxs); vfloat data_a = gatherf(blk.data_a, texel_idxs); - vfloat ew_r = gatherf(ewb.texel_weight_r, texel_idxs); - vfloat ew_g = gatherf(ewb.texel_weight_g, texel_idxs); - vfloat ew_b = gatherf(ewb.texel_weight_b, texel_idxs); - vfloat ew_a = gatherf(ewb.texel_weight_a, texel_idxs); - vfloat uncor_param = (data_r * l_uncor_bs0) + (data_g * l_uncor_bs1) + (data_b * l_uncor_bs2) @@ -581,13 +459,13 @@ void compute_error_squared_rgba( uncor_hiparamv = max(uncor_param, uncor_hiparamv); vfloat uncor_dist0 = (l_uncor_amod0 - data_r) - + (uncor_param * l_uncor_bis0); + + (uncor_param * l_uncor_bs0); vfloat uncor_dist1 = (l_uncor_amod1 - data_g) - + (uncor_param * l_uncor_bis1); + + (uncor_param * l_uncor_bs1); vfloat uncor_dist2 = (l_uncor_amod2 - data_b) - + (uncor_param * l_uncor_bis2); + + (uncor_param * l_uncor_bs2); vfloat uncor_dist3 = (l_uncor_amod3 - data_a) - + (uncor_param * l_uncor_bis3); + + (uncor_param * l_uncor_bs3); vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0) + (ew_g * uncor_dist1 * uncor_dist1) @@ -606,10 +484,10 @@ void compute_error_squared_rgba( samec_loparamv = min(samec_param, samec_loparamv); samec_hiparamv = max(samec_param, samec_hiparamv); - vfloat samec_dist0 = samec_param * l_samec_bis0 - data_r; - vfloat samec_dist1 = samec_param * l_samec_bis1 - data_g; - vfloat samec_dist2 = samec_param * l_samec_bis2 - data_b; - vfloat samec_dist3 = samec_param * l_samec_bis3 - data_a; + vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r; + vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g; + vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b; + vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a; vfloat samec_err = (ew_r * samec_dist0 * samec_dist0) + (ew_g * samec_dist1 * samec_dist1) @@ -619,7 +497,7 @@ void compute_error_squared_rgba( samec_err = select(vfloat::zero(), samec_err, mask); haccumulate(samec_errorsumv, samec_err); - lane_ids = lane_ids + vint(ASTCENC_SIMD_WIDTH); + lane_ids += vint(ASTCENC_SIMD_WIDTH); } uncor_loparam = hmin_s(uncor_loparamv); @@ -645,7 +523,6 @@ void compute_error_squared_rgba( void compute_error_squared_rgb( const partition_info& pi, const image_block& blk, - const error_weight_block& ewb, partition_lines3 plines[BLOCK_MAX_PARTITIONS], float& uncor_error, float& samec_error @@ -685,20 +562,12 @@ void compute_error_squared_rgb( vfloat l_uncor_amod1(l_uncor.amod.lane<1>()); vfloat l_uncor_amod2(l_uncor.amod.lane<2>()); - vfloat l_uncor_bis0(l_uncor.bis.lane<0>()); - vfloat l_uncor_bis1(l_uncor.bis.lane<1>()); - vfloat l_uncor_bis2(l_uncor.bis.lane<2>()); - vfloat l_samec_bs0(l_samec.bs.lane<0>()); vfloat l_samec_bs1(l_samec.bs.lane<1>()); vfloat l_samec_bs2(l_samec.bs.lane<2>()); assert(all(l_samec.amod == vfloat4(0.0f))); - vfloat l_samec_bis0(l_samec.bis.lane<0>()); - vfloat l_samec_bis1(l_samec.bis.lane<1>()); - vfloat l_samec_bis2(l_samec.bis.lane<2>()); - vfloat uncor_loparamv(1e10f); vfloat uncor_hiparamv(-1e10f); vfloat4 uncor_errorsumv = vfloat4::zero(); @@ -707,6 +576,10 @@ void compute_error_squared_rgb( vfloat samec_hiparamv(-1e10f); vfloat4 samec_errorsumv = vfloat4::zero(); + vfloat ew_r(blk.channel_weight.lane<0>()); + vfloat ew_g(blk.channel_weight.lane<1>()); + vfloat ew_b(blk.channel_weight.lane<2>()); + // This implementation over-shoots, but this is safe as we initialize the weights array // to extend the last value. This means min/max are not impacted, but we need to mask // out the dummy values when we compute the line weighting. @@ -720,10 +593,6 @@ void compute_error_squared_rgb( vfloat data_g = gatherf(blk.data_g, texel_idxs); vfloat data_b = gatherf(blk.data_b, texel_idxs); - vfloat ew_r = gatherf(ewb.texel_weight_r, texel_idxs); - vfloat ew_g = gatherf(ewb.texel_weight_g, texel_idxs); - vfloat ew_b = gatherf(ewb.texel_weight_b, texel_idxs); - vfloat uncor_param = (data_r * l_uncor_bs0) + (data_g * l_uncor_bs1) + (data_b * l_uncor_bs2); @@ -732,11 +601,11 @@ void compute_error_squared_rgb( uncor_hiparamv = max(uncor_param, uncor_hiparamv); vfloat uncor_dist0 = (l_uncor_amod0 - data_r) - + (uncor_param * l_uncor_bis0); + + (uncor_param * l_uncor_bs0); vfloat uncor_dist1 = (l_uncor_amod1 - data_g) - + (uncor_param * l_uncor_bis1); + + (uncor_param * l_uncor_bs1); vfloat uncor_dist2 = (l_uncor_amod2 - data_b) - + (uncor_param * l_uncor_bis2); + + (uncor_param * l_uncor_bs2); vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0) + (ew_g * uncor_dist1 * uncor_dist1) @@ -754,9 +623,9 @@ void compute_error_squared_rgb( samec_hiparamv = max(samec_param, samec_hiparamv); - vfloat samec_dist0 = samec_param * l_samec_bis0 - data_r; - vfloat samec_dist1 = samec_param * l_samec_bis1 - data_g; - vfloat samec_dist2 = samec_param * l_samec_bis2 - data_b; + vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r; + vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g; + vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b; vfloat samec_err = (ew_r * samec_dist0 * samec_dist0) + (ew_g * samec_dist1 * samec_dist1) @@ -765,7 +634,7 @@ void compute_error_squared_rgb( samec_err = select(vfloat::zero(), samec_err, mask); haccumulate(samec_errorsumv, samec_err); - lane_ids = lane_ids + vint(ASTCENC_SIMD_WIDTH); + lane_ids += vint(ASTCENC_SIMD_WIDTH); } uncor_loparam = hmin_s(uncor_loparamv); diff --git a/lib/astc-encoder/Source/astcenc_block_sizes.cpp b/lib/astc-encoder/Source/astcenc_block_sizes.cpp index 9200cab3de..4a9dc09058 100644 --- a/lib/astc-encoder/Source/astcenc_block_sizes.cpp +++ b/lib/astc-encoder/Source/astcenc_block_sizes.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2021 Arm Limited +// Copyright 2011-2022 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -30,7 +30,7 @@ * @param[out] is_dual_plane True if this block mode has two weight planes. * @param[out] quant_mode The quantization level for the weights. * - * @return Returns true of valid mode, false otherwise. + * @return Returns true if a valid mode, false otherwise. */ static bool decode_block_mode_2d( unsigned int block_mode, @@ -144,7 +144,7 @@ static bool decode_block_mode_2d( * @param[out] is_dual_plane True if this block mode has two weight planes. * @param[out] quant_mode The quantization level for the weights. * - * @return Returns true of valid mode, false otherwise. + * @return Returns true if a valid mode, false otherwise. */ static bool decode_block_mode_3d( unsigned int block_mode, @@ -854,6 +854,8 @@ static void construct_block_size_descriptor_2d( unsigned int always_block_mode_count = 0; unsigned int always_decimation_mode_count = 0; + float always_threshold = 0.0f; + // Iterate twice; first time keep the "always" blocks, second time keep the "non-always" blocks. // This ensures that the always block modes and decimation modes are at the start of the list. for (unsigned int j = 0; j < 2; j ++) @@ -869,12 +871,12 @@ static void construct_block_size_descriptor_2d( float percentile = percentiles[i]; bool selected = (percentile <= mode_cutoff) || !can_omit_modes; - if (j == 0 && percentile != 0.0f) + if (j == 0 && percentile > always_threshold) { continue; } - if (j == 1 && percentile == 0.0f) + if (j == 1 && percentile <= always_threshold) { continue; } @@ -905,13 +907,13 @@ static void construct_block_size_descriptor_2d( // Allocate and initialize the decimation table entry if we've not used it yet int decimation_mode = decimation_mode_index[y_weights * 16 + x_weights]; - if (decimation_mode == -1) + if (decimation_mode < 0) { decimation_mode = construct_dt_entry_2d(x_texels, y_texels, x_weights, y_weights, bsd); decimation_mode_index[y_weights * 16 + x_weights] = decimation_mode; #if !defined(ASTCENC_DECOMPRESS_ONLY) - if (percentile == 0.0f) + if (percentile <= always_threshold) { always_decimation_mode_count++; } @@ -920,7 +922,7 @@ static void construct_block_size_descriptor_2d( #if !defined(ASTCENC_DECOMPRESS_ONLY) // Flatten the block mode heuristic into some precomputed flags - if (percentile == 0.0f) + if (percentile <= always_threshold) { always_block_mode_count++; bsd.block_modes[packed_idx].percentile_hit = true; diff --git a/lib/astc-encoder/Source/astcenc_compress_symbolic.cpp b/lib/astc-encoder/Source/astcenc_compress_symbolic.cpp index c64c65aadf..01b2a8b697 100644 --- a/lib/astc-encoder/Source/astcenc_compress_symbolic.cpp +++ b/lib/astc-encoder/Source/astcenc_compress_symbolic.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2021 Arm Limited +// Copyright 2011-2022 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -60,7 +60,6 @@ static void merge_endpoints( * @param decode_mode The decode mode (LDR, HDR). * @param bsd The block size information. * @param blk The image block color data to compress. - * @param ewb The image block weighted error data. * @param[out] scb The symbolic compressed block output. * @param[out] dec_weights_quant_pvalue_plane1 The weights for plane 1. * @param[out] dec_weights_quant_pvalue_plane2 The weights for plane 2, or @c nullptr if 1 plane. @@ -69,7 +68,6 @@ static bool realign_weights( astcenc_profile decode_mode, const block_size_descriptor& bsd, const image_block& blk, - const error_weight_block& ewb, symbolic_compressed_block& scb, uint8_t* dec_weights_quant_pvalue_plane1, uint8_t* dec_weights_quant_pvalue_plane2 @@ -187,7 +185,7 @@ static bool realign_weights( vfloat4 color = color_base + color_offset * plane_weight; vfloat4 origcolor = blk.texel(texel); - vfloat4 error_weight = ewb.error_weights[texel]; + vfloat4 error_weight = blk.channel_weight; vfloat4 colordiff = color - origcolor; vfloat4 color_up_diff = colordiff + color_offset * plane_up_weight; @@ -226,7 +224,6 @@ static bool realign_weights( * @param config The compressor configuration. * @param bsd The block size information. * @param blk The image block color data to compress. - * @param ewb The image block weighted error data. * @param only_always True if we only use "always" percentile block modes. * @param tune_errorval_threshold The error value threshold. * @param partition_count The partition count. @@ -238,7 +235,6 @@ static float compress_symbolic_block_for_partition_1plane( const astcenc_config& config, const block_size_descriptor& bsd, const image_block& blk, - const error_weight_block& ewb, bool only_always, float tune_errorval_threshold, unsigned int partition_count, @@ -260,7 +256,7 @@ static float compress_symbolic_block_for_partition_1plane( // Compute ideal weights and endpoint colors, with no quantization or decimation endpoints_and_weights& ei = tmpbuf.ei1; endpoints_and_weights *eix = tmpbuf.eix1; - compute_ideal_colors_and_weights_1plane(bsd, blk, ewb, pi, ei); + compute_ideal_colors_and_weights_1plane(bsd, blk, pi, ei); // Compute ideal weights and endpoint colors for every decimation float *dec_weights_ideal_value = tmpbuf.dec_weights_ideal_value; @@ -382,7 +378,7 @@ static float compress_symbolic_block_for_partition_1plane( quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES]; unsigned int candidate_count = compute_ideal_endpoint_formats( - bsd, pi, blk, ewb, ei.ep, qwt_bitcounts, qwt_errors, + bsd, pi, blk, ei.ep, qwt_bitcounts, qwt_errors, config.tune_candidate_limit, partition_format_specifiers, block_mode_index, color_quant_level, color_quant_level_mod); @@ -424,7 +420,7 @@ static float compress_symbolic_block_for_partition_1plane( for (unsigned int l = 0; l < config.tune_refinement_limit; l++) { recompute_ideal_colors_1plane( - blk, ewb, pi, di, + blk, pi, di, weight_quant_mode, workscb.weights, eix[decimation_mode].ep, rgbs_colors, rgbo_colors); @@ -498,7 +494,7 @@ static float compress_symbolic_block_for_partition_1plane( // Pre-realign test if (l == 0) { - float errorval = compute_symbolic_block_difference(config, bsd, workscb, blk, ewb); + float errorval = compute_symbolic_block_difference(config, bsd, workscb, blk); if (errorval == -ERROR_CALC_DEFAULT) { errorval = -errorval; @@ -536,11 +532,11 @@ static float compress_symbolic_block_for_partition_1plane( // Perform a final pass over the weights to try to improve them. bool adjustments = realign_weights( - config.profile, bsd, blk, ewb, workscb, + config.profile, bsd, blk, workscb, workscb.weights, nullptr); // Post-realign test - float errorval = compute_symbolic_block_difference(config, bsd, workscb, blk, ewb); + float errorval = compute_symbolic_block_difference(config, bsd, workscb, blk); if (errorval == -ERROR_CALC_DEFAULT) { errorval = -errorval; @@ -590,7 +586,6 @@ static float compress_symbolic_block_for_partition_1plane( * @param config The compressor configuration. * @param bsd The block size information. * @param blk The image block color data to compress. - * @param ewb The image block weighted error data. * @param tune_errorval_threshold The error value threshold. * @param plane2_component The component index for the second plane of weights. * @param[out] scb The symbolic compressed block output. @@ -600,7 +595,6 @@ static float compress_symbolic_block_for_partition_2planes( const astcenc_config& config, const block_size_descriptor& bsd, const image_block& blk, - const error_weight_block& ewb, float tune_errorval_threshold, unsigned int plane2_component, symbolic_compressed_block& scb, @@ -615,7 +609,7 @@ static float compress_symbolic_block_for_partition_2planes( endpoints_and_weights& ei2 = tmpbuf.ei2; endpoints_and_weights* eix1 = tmpbuf.eix1; endpoints_and_weights* eix2 = tmpbuf.eix2; - compute_ideal_colors_and_weights_2planes(bsd, blk, ewb, plane2_component, ei1, ei2); + compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2); // Compute ideal weights and endpoint colors for every decimation float *dec_weights_ideal_value = tmpbuf.dec_weights_ideal_value; @@ -766,7 +760,7 @@ static float compress_symbolic_block_for_partition_2planes( const auto& pi = bsd.get_partition_info(1, 0); unsigned int candidate_count = compute_ideal_endpoint_formats( - bsd, pi, blk, ewb, epm, qwt_bitcounts, qwt_errors, + bsd, pi, blk, epm, qwt_bitcounts, qwt_errors, config.tune_candidate_limit, partition_format_specifiers, block_mode_index, color_quant_level, color_quant_level_mod); @@ -812,8 +806,8 @@ static float compress_symbolic_block_for_partition_2planes( for (unsigned int l = 0; l < config.tune_refinement_limit; l++) { recompute_ideal_colors_2planes( - blk, ewb, bsd, di, - weight_quant_mode, workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET, + blk, bsd, di, weight_quant_mode, + workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET, epm, rgbs_color, rgbo_color, plane2_component); // Quantize the chosen color @@ -842,7 +836,7 @@ static float compress_symbolic_block_for_partition_2planes( // Pre-realign test if (l == 0) { - float errorval = compute_symbolic_block_difference(config, bsd, workscb, blk, ewb); + float errorval = compute_symbolic_block_difference(config, bsd, workscb, blk); if (errorval == -ERROR_CALC_DEFAULT) { errorval = -errorval; @@ -880,11 +874,11 @@ static float compress_symbolic_block_for_partition_2planes( // Perform a final pass over the weights to try to improve them bool adjustments = realign_weights( - config.profile, bsd, blk, ewb, workscb, + config.profile, bsd, blk, workscb, workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET); // Post-realign test - float errorval = compute_symbolic_block_difference(config, bsd, workscb, blk, ewb); + float errorval = compute_symbolic_block_difference(config, bsd, workscb, blk); if (errorval == -ERROR_CALC_DEFAULT) { errorval = -errorval; @@ -928,260 +922,17 @@ static float compress_symbolic_block_for_partition_2planes( return best_errorval_in_mode; } -/** - * @brief Create a per-texel expansion of the error weights for deblocking. - * - * Deblockign works by assigning a higher error weight to blocks the closer they are the edge of the - * block. The encourages the compressor to keep the periphery colors more accurate, which can help - * reduce block artifacts when compressing gradients. - * - * @param[in,out] ctx The context containing both deblog memory and config. - */ -void expand_deblock_weights( - astcenc_context& ctx -) { - unsigned int xdim = ctx.config.block_x; - unsigned int ydim = ctx.config.block_y; - unsigned int zdim = ctx.config.block_z; - - float centerpos_x = static_cast(xdim - 1) * 0.5f; - float centerpos_y = static_cast(ydim - 1) * 0.5f; - float centerpos_z = static_cast(zdim - 1) * 0.5f; - float *bef = ctx.deblock_weights; - - for (unsigned int z = 0; z < zdim; z++) - { - for (unsigned int y = 0; y < ydim; y++) - { - for (unsigned int x = 0; x < xdim; x++) - { - float xdif = (static_cast(x) - centerpos_x) / static_cast(xdim); - float ydif = (static_cast(y) - centerpos_y) / static_cast(ydim); - float zdif = (static_cast(z) - centerpos_z) / static_cast(zdim); - - float wdif = 0.36f; - float dist = astc::sqrt(xdif * xdif + ydif * ydif + zdif * zdif + wdif * wdif); - *bef = astc::pow(dist, ctx.config.b_deblock_weight); - bef++; - } - } - } -} - -/** - * @brief Create a per-texel and per-channel expansion of the error weights. - * - * This approach creates relatively large error block tables, but it allows a very flexible level of - * control over how specific texels and channels are prioritized by the compressor. - * - * @param ctx The compressor context and configuration. - * @param image The input image information. - * @param bsd The block size information. - * @param blk The image block color data to compress. - * @param[out] ewb The image block weighted error data. - * - * @return Return the total error weight sum for all texels and channels. - */ -static float prepare_error_weight_block( - const astcenc_context& ctx, - const astcenc_image& image, - const block_size_descriptor& bsd, - const image_block& blk, - error_weight_block& ewb -) { - unsigned int idx = 0; - bool any_mean_stdev_weight = - ctx.config.v_rgb_mean != 0.0f || ctx.config.v_rgb_stdev != 0.0f || \ - ctx.config.v_a_mean != 0.0f || ctx.config.v_a_stdev != 0.0f; - - vfloat4 color_weights(ctx.config.cw_r_weight, - ctx.config.cw_g_weight, - ctx.config.cw_b_weight, - ctx.config.cw_a_weight); - - // This works because HDR is imposed globally at compression time - unsigned int rgb_lns = blk.rgb_lns[0]; - unsigned int a_lns = blk.alpha_lns[0]; - vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns); - vmask4 lns_mask = use_lns != vint4::zero(); - - promise(bsd.xdim > 0); - promise(bsd.ydim > 0); - promise(bsd.zdim > 0); - - for (unsigned int z = 0; z < bsd.zdim; z++) - { - for (unsigned int y = 0; y < bsd.ydim; y++) - { - for (unsigned int x = 0; x < bsd.xdim; x++) - { - unsigned int xpos = x + blk.xpos; - unsigned int ypos = y + blk.ypos; - unsigned int zpos = z + blk.zpos; - - if (xpos >= image.dim_x || ypos >= image.dim_y || zpos >= image.dim_z) - { - ewb.error_weights[idx] = vfloat4(1e-11f); - } - else - { - vfloat4 derv(65535.0f); - - // Compute derivative if we have any use of LNS - if (any(lns_mask)) - { - vfloat4 data = blk.texel(idx); - vint4 datai = lns_to_sf16(float_to_int(data)); - - vfloat4 dataf = float16_to_float(datai); - dataf = max(dataf, 6e-5f); - - vfloat4 data_lns1 = dataf * 1.05f; - data_lns1 = float_to_lns(data_lns1); - - vfloat4 data_lns2 = dataf; - data_lns2 = float_to_lns(data_lns2); - - vfloat4 divisor_lns = dataf * 0.05f; - - // Clamp derivatives between 1/32 and 2^25 - float lo = 1.0f / 32.0f; - float hi = 33554432.0f; - vfloat4 derv_lns = clamp(lo, hi, (data_lns1 - data_lns2) / divisor_lns); - derv = select(derv, derv_lns, lns_mask); - } - - // Compute error weight - vfloat4 error_weight(ctx.config.v_rgb_base, - ctx.config.v_rgb_base, - ctx.config.v_rgb_base, - ctx.config.v_a_base); - - unsigned int ydt = image.dim_x; - unsigned int zdt = image.dim_x * image.dim_y; - - if (any_mean_stdev_weight) - { - vfloat4 avg = ctx.input_averages[zpos * zdt + ypos * ydt + xpos]; - avg = max(avg, 6e-5f); - avg = avg * avg; - - vfloat4 variance = ctx.input_variances[zpos * zdt + ypos * ydt + xpos]; - variance = variance * variance; - - float favg = hadd_rgb_s(avg) * (1.0f / 3.0f); - float fvar = hadd_rgb_s(variance) * (1.0f / 3.0f); - - float mixing = ctx.config.v_rgba_mean_stdev_mix; - avg.set_lane<0>(favg * mixing + avg.lane<0>() * (1.0f - mixing)); - avg.set_lane<1>(favg * mixing + avg.lane<1>() * (1.0f - mixing)); - avg.set_lane<2>(favg * mixing + avg.lane<2>() * (1.0f - mixing)); - - variance.set_lane<0>(fvar * mixing + variance.lane<0>() * (1.0f - mixing)); - variance.set_lane<1>(fvar * mixing + variance.lane<1>() * (1.0f - mixing)); - variance.set_lane<2>(fvar * mixing + variance.lane<2>() * (1.0f - mixing)); - - vfloat4 stdev = sqrt(max(variance, 0.0f)); - - vfloat4 scalea(ctx.config.v_rgb_mean, ctx.config.v_rgb_mean, ctx.config.v_rgb_mean, ctx.config.v_a_mean); - avg = avg * scalea; - - vfloat4 scales(ctx.config.v_rgb_stdev, ctx.config.v_rgb_stdev, ctx.config.v_rgb_stdev, ctx.config.v_a_stdev); - stdev = stdev * scales; - - error_weight = error_weight + avg + stdev; - error_weight = 1.0f / error_weight; - } - - if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT) - { - float alpha_scale; - if (ctx.config.a_scale_radius != 0) - { - alpha_scale = ctx.input_alpha_averages[zpos * zdt + ypos * ydt + xpos]; - } - else - { - alpha_scale = blk.data_a[idx] * (1.0f / 65535.0f); - } - - alpha_scale = astc::max(alpha_scale, 0.0001f); - - alpha_scale *= alpha_scale; - error_weight.set_lane<0>(error_weight.lane<0>() * alpha_scale); - error_weight.set_lane<1>(error_weight.lane<1>() * alpha_scale); - error_weight.set_lane<2>(error_weight.lane<2>() * alpha_scale); - } - - error_weight = error_weight * color_weights; - error_weight = error_weight * ctx.deblock_weights[idx]; - - // When we loaded the block to begin with, we applied a transfer function and - // computed the derivative of the transfer function. However, the error-weight - // computation so far is based on the original color values, not the - // transfer-function values. As such, we must multiply the error weights by the - // derivative of the inverse of the transfer function, which is equivalent to - // dividing by the derivative of the transfer function. - - error_weight = error_weight / (derv * derv * 1e-10f); - ewb.error_weights[idx] = error_weight; - } - idx++; - } - } - } - - // Small bias to avoid divide by zeros and NaN propagation later - vfloat4 texel_weight_sum(1e-17f); - vfloat4 error_weight_sum(1e-17f); - - int texels_per_block = bsd.texel_count; - for (int i = 0; i < texels_per_block; i++) - { - texel_weight_sum += ewb.error_weights[i] * blk.texel(i); - error_weight_sum += ewb.error_weights[i]; - - float wr = ewb.error_weights[i].lane<0>(); - float wg = ewb.error_weights[i].lane<1>(); - float wb = ewb.error_weights[i].lane<2>(); - float wa = ewb.error_weights[i].lane<3>(); - - ewb.texel_weight_r[i] = wr; - ewb.texel_weight_g[i] = wg; - ewb.texel_weight_b[i] = wb; - ewb.texel_weight_a[i] = wa; - - ewb.texel_weight_rg[i] = (wr + wg) * 0.5f; - ewb.texel_weight_rb[i] = (wr + wb) * 0.5f; - ewb.texel_weight_gb[i] = (wg + wb) * 0.5f; - - ewb.texel_weight_gba[i] = (wg + wb + wa) * 0.333333f; - ewb.texel_weight_rba[i] = (wr + wb + wa) * 0.333333f; - ewb.texel_weight_rga[i] = (wr + wg + wa) * 0.333333f; - ewb.texel_weight_rgb[i] = (wr + wg + wb) * 0.333333f; - - ewb.texel_weight[i] = (wr + wg + wb + wa) * 0.25f; - } - - ewb.block_error_weighted_rgba_sum = texel_weight_sum; - ewb.block_error_weight_sum = error_weight_sum; - - return hadd_s(error_weight_sum); -} - /** * @brief Determine the lowest cross-channel correlation factor. * * @param texels_per_block The number of texels in a block. * @param blk The image block color data to compress. - * @param ewb The image block weighted error data. * * @return Return the lowest correlation factor. */ static float prepare_block_statistics( int texels_per_block, - const image_block& blk, - const error_weight_block& ewb + const image_block& blk ) { // Compute covariance matrix, as a collection of 10 scalars that form the upper-triangular row // of the matrix. The matrix is symmetric, so this is all we need for this use case. @@ -1205,7 +956,7 @@ static float prepare_block_statistics( promise(texels_per_block > 0); for (int i = 0; i < texels_per_block; i++) { - float weight = ewb.texel_weight[i]; + float weight = hadd_s(blk.channel_weight) / 4.0f; assert(weight >= 0.0f); weight_sum += weight; @@ -1295,14 +1046,12 @@ static float prepare_block_statistics( /* See header for documentation. */ void compress_block( const astcenc_context& ctx, - const astcenc_image& input_image, const image_block& blk, physical_compressed_block& pcb, compression_working_buffers& tmpbuf) { astcenc_profile decode_mode = ctx.config.profile; symbolic_compressed_block scb; - error_weight_block& ewb = tmpbuf.ewb; const block_size_descriptor* bsd = ctx.bsd; float lowest_correl; @@ -1332,13 +1081,13 @@ void compress_block( #if defined(ASTCENC_DIAGNOSTICS) // Do this early in diagnostic builds so we can dump uniform metrics // for every block. Do it later in release builds to avoid redundant work! - float error_weight_sum = prepare_error_weight_block(ctx, input_image, *bsd, blk, ewb); + float error_weight_sum = hadd_s(blk.channel_weight) * bsd->texel_count; float error_threshold = ctx.config.tune_db_limit * error_weight_sum * block_is_l_scale * block_is_la_scale; - lowest_correl = prepare_block_statistics(bsd->texel_count, blk, ewb); + lowest_correl = prepare_block_statistics(bsd->texel_count, blk); trace_add_data("lowest_correl", lowest_correl); trace_add_data("tune_error_threshold", error_threshold); #endif @@ -1376,7 +1125,7 @@ void compress_block( } #if !defined(ASTCENC_DIAGNOSTICS) - float error_weight_sum = prepare_error_weight_block(ctx, input_image, *bsd, blk, ewb); + float error_weight_sum = hadd_s(blk.channel_weight) * bsd->texel_count; float error_threshold = ctx.config.tune_db_limit * error_weight_sum * block_is_l_scale @@ -1427,7 +1176,7 @@ void compress_block( trace_add_data("search_mode", i); float errorval = compress_symbolic_block_for_partition_1plane( - ctx.config, *bsd, blk, ewb, i == 0, + ctx.config, *bsd, blk, i == 0, error_threshold * errorval_mult[i] * errorval_overshoot, 1, 0, scb, tmpbuf); @@ -1440,7 +1189,7 @@ void compress_block( } #if !defined(ASTCENC_DIAGNOSTICS) - lowest_correl = prepare_block_statistics(bsd->texel_count, blk, ewb); + lowest_correl = prepare_block_statistics(bsd->texel_count, blk); #endif block_skip_two_plane = lowest_correl > ctx.config.tune_2_plane_early_out_limit_correlation; @@ -1473,8 +1222,7 @@ void compress_block( } float errorval = compress_symbolic_block_for_partition_2planes( - ctx.config, *bsd, blk, ewb, - error_threshold * errorval_overshoot, + ctx.config, *bsd, blk, error_threshold * errorval_overshoot, i, scb, tmpbuf); // If attempting two planes is much worse than the best one plane result @@ -1494,25 +1242,24 @@ void compress_block( // Find best blocks for 2, 3 and 4 partitions for (int partition_count = 2; partition_count <= max_partitions; partition_count++) { - unsigned int partition_indices_1plane[2] { 0, 0 }; + unsigned int partition_indices[2] { 0 }; - find_best_partition_candidates(*bsd, blk, ewb, partition_count, + find_best_partition_candidates(*bsd, blk, partition_count, ctx.config.tune_partition_index_limit, - partition_indices_1plane[0], - partition_indices_1plane[1]); + partition_indices); - for (int i = 0; i < 2; i++) + for (unsigned int i = 0; i < 2; i++) { TRACE_NODE(node1, "pass"); trace_add_data("partition_count", partition_count); - trace_add_data("partition_index", partition_indices_1plane[i]); + trace_add_data("partition_index", partition_indices[i]); trace_add_data("plane_count", 1); trace_add_data("search_mode", i); float errorval = compress_symbolic_block_for_partition_1plane( - ctx.config, *bsd, blk, ewb, false, + ctx.config, *bsd, blk, false, error_threshold * errorval_overshoot, - partition_count, partition_indices_1plane[i], + partition_count, partition_indices[i], scb, tmpbuf); best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval); @@ -1541,7 +1288,7 @@ void compress_block( // TODO: Do something more sensible here, such as average color block if (scb.block_type == SYM_BTYPE_ERROR) { -#if !defined(NDEBUG) +#if defined(ASTCENC_DIAGNOSTICS) static bool printed_once = false; if (!printed_once) { diff --git a/lib/astc-encoder/Source/astcenc_compute_variance.cpp b/lib/astc-encoder/Source/astcenc_compute_variance.cpp index 61c1481073..41757fc5f1 100644 --- a/lib/astc-encoder/Source/astcenc_compute_variance.cpp +++ b/lib/astc-encoder/Source/astcenc_compute_variance.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2021 Arm Limited +// Copyright 2011-2022 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -100,7 +100,7 @@ static void brent_kung_prefix_sum( } /** - * @brief Compute averages and variances for a pixel region. + * @brief Compute averages for a pixel region. * * The routine computes both in a single pass, using a summed-area table to decouple the running * time from the averaging/variance kernel size. @@ -110,12 +110,10 @@ static void brent_kung_prefix_sum( */ static void compute_pixel_region_variance( astcenc_context& ctx, - const pixel_region_variance_args& arg + const pixel_region_args& arg ) { // Unpack the memory structure into local variables const astcenc_image* img = arg.img; - float rgb_power = arg.rgb_power; - float alpha_power = arg.alpha_power; astcenc_swizzle swz = arg.swz; bool have_z = arg.have_z; @@ -127,16 +125,13 @@ static void compute_pixel_region_variance( int offset_y = arg.offset_y; int offset_z = arg.offset_z; - int avg_var_kernel_radius = arg.avg_var_kernel_radius; int alpha_kernel_radius = arg.alpha_kernel_radius; float* input_alpha_averages = ctx.input_alpha_averages; - vfloat4* input_averages = ctx.input_averages; - vfloat4* input_variances = ctx.input_variances; vfloat4* work_memory = arg.work_memory; // Compute memory sizes and dimensions that we need - int kernel_radius = astc::max(avg_var_kernel_radius, alpha_kernel_radius); + int kernel_radius = alpha_kernel_radius; int kerneldim = 2 * kernel_radius + 1; int kernel_radius_xy = kernel_radius; int kernel_radius_z = have_z ? kernel_radius : 0; @@ -147,7 +142,6 @@ static void compute_pixel_region_variance( int sizeprod = padsize_x * padsize_y * padsize_z; int zd_start = have_z ? 1 : 0; - int are_powers_1 = (rgb_power == 1.0f) && (alpha_power == 1.0f); vfloat4 *varbuf1 = work_memory; vfloat4 *varbuf2 = work_memory + sizeprod; @@ -203,12 +197,6 @@ static void compute_pixel_region_variance( b * (1.0f / 255.0f), a * (1.0f / 255.0f)); - if (!are_powers_1) - { - vfloat4 exp(rgb_power, rgb_power, rgb_power, alpha_power); - d = pow(max(d, 1e-6f), exp); - } - VARBUF1(z, y, x) = d; VARBUF2(z, y, x) = d * d; } @@ -246,12 +234,6 @@ static void compute_pixel_region_variance( vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]); vfloat4 d = float16_to_float(di); - if (!are_powers_1) - { - vfloat4 exp(rgb_power, rgb_power, rgb_power, alpha_power); - d = pow(max(d, 1e-6f), exp); - } - VARBUF1(z, y, x) = d; VARBUF2(z, y, x) = d * d; } @@ -295,12 +277,6 @@ static void compute_pixel_region_variance( vfloat4 d(r, g, b, a); - if (!are_powers_1) - { - vfloat4 exp(rgb_power, rgb_power, rgb_power, alpha_power); - d = pow(max(d, 1e-6f), exp); - } - VARBUF1(z, y, x) = d; VARBUF2(z, y, x) = d * d; } @@ -369,37 +345,20 @@ static void compute_pixel_region_variance( } } - int avg_var_kdim = 2 * avg_var_kernel_radius + 1; int alpha_kdim = 2 * alpha_kernel_radius + 1; // Compute a few constants used in the variance-calculation. - float avg_var_samples; float alpha_rsamples; - float mul1; if (have_z) { - avg_var_samples = (float)(avg_var_kdim * avg_var_kdim * avg_var_kdim); alpha_rsamples = 1.0f / (float)(alpha_kdim * alpha_kdim * alpha_kdim); } else { - avg_var_samples = (float)(avg_var_kdim * avg_var_kdim); alpha_rsamples = 1.0f / (float)(alpha_kdim * alpha_kdim); } - float avg_var_rsamples = 1.0f / avg_var_samples; - if (avg_var_samples == 1) - { - mul1 = 1.0f; - } - else - { - mul1 = 1.0f / (float)(avg_var_samples * (avg_var_samples - 1)); - } - - float mul2 = avg_var_samples * mul1; - // Use the summed-area tables to compute variance for each neighborhood if (have_z) { @@ -436,33 +395,6 @@ static void compute_pixel_region_variance( int out_index = z_dst * zdt + y_dst * ydt + x_dst; input_alpha_averages[out_index] = (vasum * alpha_rsamples); - - // Summed-area table lookups for RGBA average and variance - vfloat4 v1sum = ( VARBUF1(z_high, y_low, x_low) - - VARBUF1(z_high, y_low, x_high) - - VARBUF1(z_high, y_high, x_low) - + VARBUF1(z_high, y_high, x_high)) - - ( VARBUF1(z_low, y_low, x_low) - - VARBUF1(z_low, y_low, x_high) - - VARBUF1(z_low, y_high, x_low) - + VARBUF1(z_low, y_high, x_high)); - - vfloat4 v2sum = ( VARBUF2(z_high, y_low, x_low) - - VARBUF2(z_high, y_low, x_high) - - VARBUF2(z_high, y_high, x_low) - + VARBUF2(z_high, y_high, x_high)) - - ( VARBUF2(z_low, y_low, x_low) - - VARBUF2(z_low, y_low, x_high) - - VARBUF2(z_low, y_high, x_low) - + VARBUF2(z_low, y_high, x_high)); - - // Compute and emit the average - vfloat4 avg = v1sum * avg_var_rsamples; - input_averages[out_index] = avg; - - // Compute and emit the actual variance - vfloat4 variance = mul2 * v2sum - mul1 * (v1sum * v1sum); - input_variances[out_index] = variance; } } } @@ -491,35 +423,16 @@ static void compute_pixel_region_variance( int out_index = y_dst * ydt + x_dst; input_alpha_averages[out_index] = (vasum * alpha_rsamples); - - // summed-area table lookups for RGBA average and variance - vfloat4 v1sum = VARBUF1(0, y_low, x_low) - - VARBUF1(0, y_low, x_high) - - VARBUF1(0, y_high, x_low) - + VARBUF1(0, y_high, x_high); - - vfloat4 v2sum = VARBUF2(0, y_low, x_low) - - VARBUF2(0, y_low, x_high) - - VARBUF2(0, y_high, x_low) - + VARBUF2(0, y_high, x_high); - - // Compute and emit the average - vfloat4 avg = v1sum * avg_var_rsamples; - input_averages[out_index] = avg; - - // Compute and emit the actual variance - vfloat4 variance = mul2 * v2sum - mul1 * (v1sum * v1sum); - input_variances[out_index] = variance; } } } } -void compute_averages_and_variances( +void compute_averages( astcenc_context& ctx, - const avg_var_args &ag + const avg_args &ag ) { - pixel_region_variance_args arg = ag.arg; + pixel_region_args arg = ag.arg; arg.work_memory = new vfloat4[ag.work_memory_size]; int size_x = ag.img_size_x; @@ -535,7 +448,7 @@ void compute_averages_and_variances( while (true) { unsigned int count; - unsigned int base = ctx.manage_avg_var.get_task_assignment(16, count); + unsigned int base = ctx.manage_avg.get_task_assignment(16, count); if (!count) { break; @@ -560,28 +473,25 @@ void compute_averages_and_variances( } } - ctx.manage_avg_var.complete_task_assignment(count); + ctx.manage_avg.complete_task_assignment(count); } delete[] arg.work_memory; } /* See header for documentation. */ -unsigned int init_compute_averages_and_variances( +unsigned int init_compute_averages( const astcenc_image& img, - float rgb_power, - float alpha_power, - unsigned int avg_var_kernel_radius, unsigned int alpha_kernel_radius, const astcenc_swizzle& swz, - avg_var_args& ag + avg_args& ag ) { unsigned int size_x = img.dim_x; unsigned int size_y = img.dim_y; unsigned int size_z = img.dim_z; // Compute maximum block size and from that the working memory buffer size - unsigned int kernel_radius = astc::max(avg_var_kernel_radius, alpha_kernel_radius); + unsigned int kernel_radius = alpha_kernel_radius; unsigned int kerneldim = 2 * kernel_radius + 1; bool have_z = (size_z > 1); @@ -591,7 +501,7 @@ unsigned int init_compute_averages_and_variances( unsigned int max_padsize_xy = max_blk_size_xy + kerneldim; unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0); - // Perform block-wise averages-and-variances calculations across the image + // Perform block-wise averages calculations across the image // Initialize fields which are not populated until later ag.arg.size_x = 0; ag.arg.size_y = 0; @@ -602,11 +512,8 @@ unsigned int init_compute_averages_and_variances( ag.arg.work_memory = nullptr; ag.arg.img = &img; - ag.arg.rgb_power = rgb_power; - ag.arg.alpha_power = alpha_power; ag.arg.swz = swz; ag.arg.have_z = have_z; - ag.arg.avg_var_kernel_radius = avg_var_kernel_radius; ag.arg.alpha_kernel_radius = alpha_kernel_radius; ag.img_size_x = size_x; diff --git a/lib/astc-encoder/Source/astcenc_decompress_symbolic.cpp b/lib/astc-encoder/Source/astcenc_decompress_symbolic.cpp index 3649a66dd2..478c1cf1c8 100644 --- a/lib/astc-encoder/Source/astcenc_decompress_symbolic.cpp +++ b/lib/astc-encoder/Source/astcenc_decompress_symbolic.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2021 Arm Limited +// Copyright 2011-2022 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -186,6 +186,7 @@ void decompress_symbolic_block( blk.zpos = zpos; blk.data_min = vfloat4::zero(); + blk.data_mean = vfloat4::zero(); blk.data_max = vfloat4::zero(); blk.grayscale = false; @@ -321,8 +322,7 @@ float compute_symbolic_block_difference( const astcenc_config& config, const block_size_descriptor& bsd, const symbolic_compressed_block& scb, - const image_block& blk, - const error_weight_block& ewb + const image_block& blk ) { // If we detected an error-block, blow up immediately. if (scb.block_type == SYM_BTYPE_ERROR) @@ -415,7 +415,7 @@ float compute_symbolic_block_difference( error = min(abs(error), 1e15f); error = error * error; - float metric = dot_s(error, ewb.error_weights[tix]); + float metric = dot_s(error, blk.channel_weight); summa += astc::min(metric, ERROR_CALC_DEFAULT); } } diff --git a/lib/astc-encoder/Source/astcenc_entry.cpp b/lib/astc-encoder/Source/astcenc_entry.cpp index b1ef6b8341..b77857c0bd 100644 --- a/lib/astc-encoder/Source/astcenc_entry.cpp +++ b/lib/astc-encoder/Source/astcenc_entry.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2021 Arm Limited +// Copyright 2011-2022 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -411,18 +411,6 @@ static astcenc_error validate_config( } #endif - config.v_rgba_mean_stdev_mix = astc::max(config.v_rgba_mean_stdev_mix, 0.0f); - config.v_rgb_power = astc::max(config.v_rgb_power, 0.0f); - config.v_rgb_base = astc::max(config.v_rgb_base, 0.0f); - config.v_rgb_mean = astc::max(config.v_rgb_mean, 0.0f); - config.v_rgb_stdev = astc::max(config.v_rgb_stdev, 0.0f); - config.v_a_power = astc::max(config.v_a_power, 0.0f); - config.v_a_base = astc::max(config.v_a_base, 0.0f); - config.v_a_mean = astc::max(config.v_a_mean, 0.0f); - config.v_a_stdev = astc::max(config.v_a_stdev, 0.0f); - - config.b_deblock_weight = astc::max(config.b_deblock_weight, 0.0f); - config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f); config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u); @@ -586,9 +574,6 @@ astcenc_error astcenc_config_init( } // Set heuristics to the defaults for each color profile - config.v_rgba_radius = 0; - config.v_rgba_mean_stdev_mix = 0.0f; - config.cw_r_weight = 1.0f; config.cw_g_weight = 1.0f; config.cw_b_weight = 1.0f; @@ -606,40 +591,9 @@ astcenc_error astcenc_config_init( { case ASTCENC_PRF_LDR: case ASTCENC_PRF_LDR_SRGB: - config.v_rgb_power = 1.0f; - config.v_rgb_base = 1.0f; - config.v_rgb_mean = 0.0f; - config.v_rgb_stdev = 0.0f; - - config.v_a_power = 1.0f; - config.v_a_base = 1.0f; - config.v_a_mean = 0.0f; - config.v_a_stdev = 0.0f; break; case ASTCENC_PRF_HDR_RGB_LDR_A: - config.v_rgb_power = 0.75f; - config.v_rgb_base = 0.0f; - config.v_rgb_mean = 1.0f; - config.v_rgb_stdev = 0.0f; - - config.v_a_power = 1.0f; - config.v_a_base = 0.05f; - config.v_a_mean = 0.0f; - config.v_a_stdev = 0.0f; - - config.tune_db_limit = 999.0f; - break; case ASTCENC_PRF_HDR: - config.v_rgb_power = 0.75f; - config.v_rgb_base = 0.0f; - config.v_rgb_mean = 1.0f; - config.v_rgb_stdev = 0.0f; - - config.v_a_power = 0.75f; - config.v_a_base = 0.0f; - config.v_a_mean = 1.0f; - config.v_a_stdev = 0.0f; - config.tune_db_limit = 999.0f; break; default: @@ -663,27 +617,13 @@ astcenc_error astcenc_config_init( // Normals are prone to blocking artifacts on smooth curves // so force compressor to try harder here ... - config.b_deblock_weight = 1.8f; config.tune_db_limit *= 1.03f; - - if (flags & ASTCENC_FLG_USE_PERCEPTUAL) - { - config.v_rgba_radius = 3; - config.v_rgba_mean_stdev_mix = 0.0f; - config.v_rgb_mean = 0.0f; - config.v_rgb_stdev = 50.0f; - config.v_a_mean = 0.0f; - config.v_a_stdev = 50.0f; - } } else if (flags & ASTCENC_FLG_MAP_MASK) { - config.v_rgba_radius = 3; - config.v_rgba_mean_stdev_mix = 0.03f; - config.v_rgb_mean = 0.0f; - config.v_rgb_stdev = 25.0f; - config.v_a_mean = 0.0f; - config.v_a_stdev = 25.0f; + // Masks are prone to blocking artifacts on mask edges + // so force compressor to try harder here ... + config.tune_db_limit *= 1.03f; } else if (flags & ASTCENC_FLG_MAP_RGBM) { @@ -756,8 +696,6 @@ astcenc_error astcenc_context_alloc( ctx->working_buffers = nullptr; // These are allocated per-compress, as they depend on image size - ctx->input_averages = nullptr; - ctx->input_variances = nullptr; ctx->input_alpha_averages = nullptr; // Copy the config first and validate the copy (we may modify it) @@ -778,9 +716,6 @@ astcenc_error astcenc_context_alloc( // Do setup only needed by compression if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY)) { - // Expand deblock supression into a weight scale per texel in the block - expand_deblock_weights(*ctx); - // Turn a dB limit into a per-texel error for faster use later if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB)) { @@ -951,14 +886,21 @@ static void compress_image( { blk.origin_texel = vfloat4::zero(); blk.data_min = vfloat4::zero(); - blk.data_max = blk.data_min; - blk.grayscale = false; + blk.data_mean = vfloat4::zero(); + blk.data_max = vfloat4::zero(); + blk.grayscale = true; } + // Populate the block channel weights + blk.channel_weight = vfloat4(ctx.config.cw_r_weight, + ctx.config.cw_g_weight, + ctx.config.cw_b_weight, + ctx.config.cw_a_weight); + int offset = ((z * yblocks + y) * xblocks + x) * 16; uint8_t *bp = buffer + offset; physical_compressed_block* pcb = reinterpret_cast(bp); - compress_block(ctx, image, blk, *pcb, temp_buffers); + compress_block(ctx, blk, *pcb, temp_buffers); } ctx.manage_compress.complete_task_assignment(count); @@ -1025,34 +967,29 @@ astcenc_error astcenc_compress_image( astcenc_compress_reset(ctx); } - if (ctx->config.v_rgb_mean != 0.0f || ctx->config.v_rgb_stdev != 0.0f || - ctx->config.v_a_mean != 0.0f || ctx->config.v_a_stdev != 0.0f || - ctx->config.a_scale_radius != 0) + if (ctx->config.a_scale_radius != 0) { // First thread to enter will do setup, other threads will subsequently // enter the critical section but simply skip over the initialization - auto init_avg_var = [ctx, &image, swizzle]() { + auto init_avg = [ctx, &image, swizzle]() { // Perform memory allocations for the destination buffers size_t texel_count = image.dim_x * image.dim_y * image.dim_z; - ctx->input_averages = new vfloat4[texel_count]; - ctx->input_variances = new vfloat4[texel_count]; ctx->input_alpha_averages = new float[texel_count]; - return init_compute_averages_and_variances( - image, ctx->config.v_rgb_power, ctx->config.v_a_power, - ctx->config.v_rgba_radius, ctx->config.a_scale_radius, *swizzle, - ctx->avg_var_preprocess_args); + return init_compute_averages( + image, ctx->config.a_scale_radius, *swizzle, + ctx->avg_preprocess_args); }; // Only the first thread actually runs the initializer - ctx->manage_avg_var.init(init_avg_var); + ctx->manage_avg.init(init_avg); // All threads will enter this function and dynamically grab work - compute_averages_and_variances(*ctx, ctx->avg_var_preprocess_args); + compute_averages(*ctx, ctx->avg_preprocess_args); } - // Wait for compute_averages_and_variances to complete before compressing - ctx->manage_avg_var.wait(); + // Wait for compute_averages to complete before compressing + ctx->manage_avg.wait(); compress_image(*ctx, thread_index, image, *swizzle, data_out); @@ -1060,12 +997,6 @@ astcenc_error astcenc_compress_image( ctx->manage_compress.wait(); auto term_compress = [ctx]() { - delete[] ctx->input_averages; - ctx->input_averages = nullptr; - - delete[] ctx->input_variances; - ctx->input_variances = nullptr; - delete[] ctx->input_alpha_averages; ctx->input_alpha_averages = nullptr; }; @@ -1090,7 +1021,7 @@ astcenc_error astcenc_compress_reset( return ASTCENC_ERR_BAD_CONTEXT; } - ctx->manage_avg_var.reset(); + ctx->manage_avg.reset(); ctx->manage_compress.reset(); return ASTCENC_SUCCESS; #endif diff --git a/lib/astc-encoder/Source/astcenc_find_best_partitioning.cpp b/lib/astc-encoder/Source/astcenc_find_best_partitioning.cpp index 355a18e804..0b648b9d66 100644 --- a/lib/astc-encoder/Source/astcenc_find_best_partitioning.cpp +++ b/lib/astc-encoder/Source/astcenc_find_best_partitioning.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2021 Arm Limited +// Copyright 2011-2022 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -52,14 +52,12 @@ * @brief Pick some initital kmeans cluster centers. * * @param blk The image block color data to compress. - * @param ewb The image error weight block. * @param texel_count The number of texels in the block. * @param partition_count The number of partitions in the block. * @param[out] cluster_centers The initital partition cluster center colors. */ static void kmeans_init( const image_block& blk, - const error_weight_block& ewb, unsigned int texel_count, unsigned int partition_count, vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS] @@ -82,8 +80,7 @@ static void kmeans_init( { vfloat4 color = blk.texel(i); vfloat4 diff = color - center_color; - diff = diff * ewb.error_weights[i]; - float distance = dot_s(diff, diff); + float distance = dot_s(diff * diff, blk.channel_weight); distance_sum += distance; distances[i] = distance; } @@ -128,8 +125,7 @@ static void kmeans_init( { vfloat4 color = blk.texel(i); vfloat4 diff = color - center_color; - diff = diff * ewb.error_weights[i]; - float distance = dot_s(diff, diff); + float distance = dot_s(diff * diff, blk.channel_weight); distance = astc::min(distance, distances[i]); distance_sum += distance; distances[i] = distance; @@ -141,7 +137,6 @@ static void kmeans_init( * @brief Assign texels to clusters, based on a set of chosen center points. * * @param blk The image block color data to compress. - * @param ewb The image error weight block. * @param texel_count The number of texels in the block. * @param partition_count The number of partitions in the block. * @param cluster_centers The partition cluster center colors. @@ -149,7 +144,6 @@ static void kmeans_init( */ static void kmeans_assign( const image_block& blk, - const error_weight_block& ewb, unsigned int texel_count, unsigned int partition_count, const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS], @@ -170,8 +164,7 @@ static void kmeans_assign( for (unsigned int j = 0; j < partition_count; j++) { vfloat4 diff = color - cluster_centers[j]; - diff = diff * ewb.error_weights[i]; - float distance = dot_s(diff, diff); + float distance = dot_s(diff * diff, blk.channel_weight); if (distance < best_distance) { best_distance = distance; @@ -431,14 +424,12 @@ static void get_partition_ordering_by_mismatch_bits( * * @param bsd The block size information. * @param blk The image block color data to compress. - * @param ewb The image error weight block. * @param partition_count The desired number of partitions in the block. * @param[out] partition_ordering The list of recommended partition indices, in priority order. - */ + */ static void compute_kmeans_partition_ordering( const block_size_descriptor& bsd, const image_block& blk, - const error_weight_block& ewb, unsigned int partition_count, unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS] ) { @@ -450,14 +441,14 @@ static void compute_kmeans_partition_ordering( { if (i == 0) { - kmeans_init(blk, ewb, bsd.texel_count, partition_count, cluster_centers); + kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers); } else { kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions); } - kmeans_assign(blk, ewb, bsd.texel_count, partition_count, cluster_centers, texel_partitions); + kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions); } // Construct the block bitmaps of texel assignments to each partition @@ -482,11 +473,9 @@ static void compute_kmeans_partition_ordering( void find_best_partition_candidates( const block_size_descriptor& bsd, const image_block& blk, - const error_weight_block& ewb, unsigned int partition_count, unsigned int partition_search_limit, - unsigned int& best_partition_uncor, - unsigned int& best_partition_samec + unsigned int best_partitions[2] ) { // Constant used to estimate quantization error for a given partitioning; the optimal value for // this depends on bitrate. These values have been determined empirically. @@ -511,7 +500,7 @@ void find_best_partition_candidates( weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim; unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS]; - compute_kmeans_partition_ordering(bsd, blk, ewb, partition_count, partition_sequence); + compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence); bool uses_alpha = !blk.is_constant_channel(3); @@ -540,7 +529,7 @@ void find_best_partition_candidates( // Compute weighting to give to each component in each partition partition_metrics pms[BLOCK_MAX_PARTITIONS]; - compute_avgs_and_dirs_4_comp(pi, blk, ewb, pms); + compute_avgs_and_dirs_4_comp(pi, blk, pms); line4 uncor_lines[BLOCK_MAX_PARTITIONS]; line4 samec_lines[BLOCK_MAX_PARTITIONS]; @@ -558,16 +547,14 @@ void find_best_partition_candidates( uncor_lines[j].a = pm.avg; uncor_lines[j].b = normalize_safe(pm.dir, unit4()); - uncor_plines[j].amod = (uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b)) * pm.icolor_scale; - uncor_plines[j].bs = uncor_lines[j].b * pm.color_scale; - uncor_plines[j].bis = uncor_lines[j].b * pm.icolor_scale; + uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b); + uncor_plines[j].bs = uncor_lines[j].b; samec_lines[j].a = vfloat4::zero(); samec_lines[j].b = normalize_safe(pm.avg, unit4()); samec_plines[j].amod = vfloat4::zero(); - samec_plines[j].bs = samec_lines[j].b * pm.color_scale; - samec_plines[j].bis = samec_lines[j].b * pm.icolor_scale; + samec_plines[j].bs = samec_lines[j].b; } float uncor_error = 0.0f; @@ -575,7 +562,6 @@ void find_best_partition_candidates( compute_error_squared_rgba(pi, blk, - ewb, uncor_plines, samec_plines, uncor_line_lens, @@ -595,20 +581,14 @@ void find_best_partition_candidates( for (unsigned int j = 0; j < partition_count; j++) { - partition_metrics& pm = pms[j]; - float tpp = (float)(pi.partition_texel_count[j]); - - vfloat4 ics = pm.icolor_scale; - vfloat4 error_weights = pm.error_weight * (tpp * weight_imprecision_estim); - - vfloat4 uncor_vector = uncor_lines[j].b * uncor_line_lens[j] * ics; - vfloat4 samec_vector = samec_lines[j].b * samec_line_lens[j] * ics; + float tpp = static_cast(pi.partition_texel_count[j]); + vfloat4 error_weights(tpp * weight_imprecision_estim); - uncor_vector = uncor_vector * uncor_vector; - samec_vector = samec_vector * samec_vector; + vfloat4 uncor_vector = uncor_lines[j].b * uncor_line_lens[j]; + vfloat4 samec_vector = samec_lines[j].b * samec_line_lens[j]; - uncor_error += dot_s(uncor_vector, error_weights); - samec_error += dot_s(samec_vector, error_weights); + uncor_error += dot_s(uncor_vector * uncor_vector, error_weights); + samec_error += dot_s(samec_vector * samec_vector, error_weights); } if (uncor_error < uncor_best_error) @@ -647,7 +627,7 @@ void find_best_partition_candidates( // Compute weighting to give to each component in each partition partition_metrics pms[BLOCK_MAX_PARTITIONS]; - compute_avgs_and_dirs_3_comp_rgb(pi, blk, ewb, pms); + compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); partition_lines3 plines[BLOCK_MAX_PARTITIONS]; @@ -662,13 +642,11 @@ void find_best_partition_candidates( pl.samec_line.a = vfloat4::zero(); pl.samec_line.b = normalize_safe(pm.avg.swz<0, 1, 2>(), unit3()); - pl.uncor_pline.amod = (pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b)) * pm.icolor_scale.swz<0, 1, 2, 3>(); - pl.uncor_pline.bs = (pl.uncor_line.b * pm.color_scale.swz<0, 1, 2, 3>()); - pl.uncor_pline.bis = (pl.uncor_line.b * pm.icolor_scale.swz<0, 1, 2, 3>()); + pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b); + pl.uncor_pline.bs = pl.uncor_line.b; pl.samec_pline.amod = vfloat4::zero(); - pl.samec_pline.bs = (pl.samec_line.b * pm.color_scale.swz<0, 1, 2, 3>()); - pl.samec_pline.bis = (pl.samec_line.b * pm.icolor_scale.swz<0, 1, 2, 3>()); + pl.samec_pline.bs = pl.samec_line.b; } float uncor_error = 0.0f; @@ -676,7 +654,6 @@ void find_best_partition_candidates( compute_error_squared_rgb(pi, blk, - ewb, plines, uncor_error, samec_error); @@ -693,25 +670,16 @@ void find_best_partition_candidates( for (unsigned int j = 0; j < partition_count; j++) { - partition_metrics& pm = pms[j]; partition_lines3& pl = plines[j]; - float tpp = (float)(pi.partition_texel_count[j]); - - vfloat4 ics = pm.icolor_scale; - ics.set_lane<3>(0.0f); - - vfloat4 error_weights = pm.error_weight * (tpp * weight_imprecision_estim); - error_weights.set_lane<3>(0.0f); - - vfloat4 uncor_vector = (pl.uncor_line.b * pl.uncor_line_len) * ics; - vfloat4 samec_vector = (pl.samec_line.b * pl.samec_line_len) * ics; + float tpp = static_cast(pi.partition_texel_count[j]); + vfloat4 error_weights(tpp * weight_imprecision_estim); - uncor_vector = uncor_vector * uncor_vector; - samec_vector = samec_vector * samec_vector; + vfloat4 uncor_vector = pl.uncor_line.b * pl.uncor_line_len; + vfloat4 samec_vector = pl.samec_line.b * pl.samec_line_len; - uncor_error += dot3_s(uncor_vector, error_weights); - samec_error += dot3_s(samec_vector, error_weights); + uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights); + samec_error += dot3_s(samec_vector * samec_vector, error_weights); } if (uncor_error < uncor_best_error) @@ -736,10 +704,24 @@ void find_best_partition_candidates( } } - best_partition_uncor = uncor_best_partition; - - unsigned int index = samec_best_partitions[0] != uncor_best_partition ? 0 : 1; - best_partition_samec = samec_best_partitions[index]; + // Same parition is best for both, so use this first unconditionally + if (uncor_best_partition == samec_best_partitions[0]) + { + best_partitions[0] = samec_best_partitions[0]; + best_partitions[1] = samec_best_partitions[1]; + } + // Uncor is best + else if (uncor_best_error <= samec_best_errors[0]) + { + best_partitions[0] = uncor_best_partition; + best_partitions[1] = samec_best_partitions[0]; + } + // Samec is best + else + { + best_partitions[0] = samec_best_partitions[0]; + best_partitions[1] = uncor_best_partition; + } } #endif diff --git a/lib/astc-encoder/Source/astcenc_ideal_endpoints_and_weights.cpp b/lib/astc-encoder/Source/astcenc_ideal_endpoints_and_weights.cpp index 46783c8e6f..ce2dd8ba1d 100644 --- a/lib/astc-encoder/Source/astcenc_ideal_endpoints_and_weights.cpp +++ b/lib/astc-encoder/Source/astcenc_ideal_endpoints_and_weights.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2021 Arm Limited +// Copyright 2011-2022 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -31,7 +31,6 @@ * * @param bsd The block size information. * @param blk The image block color data to compress. - * @param ewb The image block weighted error data. * @param pi The partition info for the current trial. * @param[out] ei The computed ideal endpoints and weights. * @param component The color component to compute. @@ -39,7 +38,6 @@ static void compute_ideal_colors_and_weights_1_comp( const block_size_descriptor& bsd, const image_block& blk, - const error_weight_block& ewb, const partition_info& pi, endpoints_and_weights& ei, unsigned int component @@ -54,78 +52,73 @@ static void compute_ideal_colors_and_weights_1_comp( float lowvalues[BLOCK_MAX_PARTITIONS] { 1e10f, 1e10f, 1e10f, 1e10f }; float highvalues[BLOCK_MAX_PARTITIONS] { -1e10f, -1e10f, -1e10f, -1e10f }; - float partition_error_scale[BLOCK_MAX_PARTITIONS]; - float linelengths_rcp[BLOCK_MAX_PARTITIONS]; + float length_squared[BLOCK_MAX_PARTITIONS]; + float scale[BLOCK_MAX_PARTITIONS]; - const float *error_weights = nullptr; + float error_weight; const float* data_vr = nullptr; assert(component < BLOCK_MAX_COMPONENTS); switch (component) { case 0: - error_weights = ewb.texel_weight_r; + error_weight = blk.channel_weight.lane<0>(); data_vr = blk.data_r; break; case 1: - error_weights = ewb.texel_weight_g; + error_weight = blk.channel_weight.lane<1>(); data_vr = blk.data_g; break; case 2: - error_weights = ewb.texel_weight_b; + error_weight = blk.channel_weight.lane<2>(); data_vr = blk.data_b; break; default: - error_weights = ewb.texel_weight_a; + error_weight = blk.channel_weight.lane<3>(); data_vr = blk.data_a; break; } for (int i = 0; i < texel_count; i++) { - if (error_weights[i] > 1e-10f) - { - float value = data_vr[i]; - int partition = pi.partition_of_texel[i]; + float value = data_vr[i]; + int partition = pi.partition_of_texel[i]; - lowvalues[partition] = astc::min(value, lowvalues[partition]); - highvalues[partition] = astc::max(value, highvalues[partition]); - } + lowvalues[partition] = astc::min(value, lowvalues[partition]); + highvalues[partition] = astc::max(value, highvalues[partition]); } vmask4 sep_mask = vint4::lane_id() == vint4(component); for (int i = 0; i < partition_count; i++) { - float diff = highvalues[i] - lowvalues[i]; - - if (diff < 0) + float length = highvalues[i] - lowvalues[i]; + if (length < 0.0f) { lowvalues[i] = 0.0f; highvalues[i] = 0.0f; } - diff = astc::max(diff, 1e-7f); - - partition_error_scale[i] = diff * diff; - linelengths_rcp[i] = 1.0f / diff; + length = astc::max(length, 1e-7f); + length_squared[i] = length * length; + scale[i] = 1.0f / length; ei.ep.endpt0[i] = select(blk.data_min, vfloat4(lowvalues[i]), sep_mask); ei.ep.endpt1[i] = select(blk.data_max, vfloat4(highvalues[i]), sep_mask); } bool is_constant_wes = true; - float constant_wes = partition_error_scale[pi.partition_of_texel[0]] * error_weights[0]; + float constant_wes = length_squared[pi.partition_of_texel[0]] * error_weight; for (int i = 0; i < texel_count; i++) { float value = data_vr[i]; int partition = pi.partition_of_texel[i]; value -= lowvalues[partition]; - value *= linelengths_rcp[partition]; + value *= scale[partition]; value = astc::clamp1f(value); ei.weights[i] = value; - ei.weight_error_scale[i] = partition_error_scale[partition] * error_weights[i]; + ei.weight_error_scale[i] = length_squared[partition] * error_weight; assert(!astc::isnan(ei.weight_error_scale[i])); is_constant_wes = is_constant_wes && ei.weight_error_scale[i] == constant_wes; @@ -147,7 +140,6 @@ static void compute_ideal_colors_and_weights_1_comp( * * @param bsd The block size information. * @param blk The image block color data to compress. - * @param ewb The image block weighted error data. * @param pi The partition info for the current trial. * @param[out] ei The computed ideal endpoints and weights. * @param component1 The first color component to compute. @@ -156,7 +148,6 @@ static void compute_ideal_colors_and_weights_1_comp( static void compute_ideal_colors_and_weights_2_comp( const block_size_descriptor& bsd, const image_block& blk, - const error_weight_block& ewb, const partition_info& pi, endpoints_and_weights& ei, int component1, @@ -171,24 +162,29 @@ static void compute_ideal_colors_and_weights_2_comp( partition_metrics pms[BLOCK_MAX_PARTITIONS]; - const float *error_weights; + float error_weight; const float* data_vr = nullptr; const float* data_vg = nullptr; if (component1 == 0 && component2 == 1) { - error_weights = ewb.texel_weight_rg; + error_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f; + data_vr = blk.data_r; data_vg = blk.data_g; } else if (component1 == 0 && component2 == 2) { - error_weights = ewb.texel_weight_rb; + error_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f; + data_vr = blk.data_r; data_vg = blk.data_b; } else // (component1 == 1 && component2 == 2) { - error_weights = ewb.texel_weight_gb; + assert(component1 == 1 && component2 == 2); + + error_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f; + data_vr = blk.data_g; data_vg = blk.data_b; } @@ -200,7 +196,7 @@ static void compute_ideal_colors_and_weights_2_comp( float scale[BLOCK_MAX_PARTITIONS]; float length_squared[BLOCK_MAX_PARTITIONS]; - compute_avgs_and_dirs_2_comp(pi, blk, ewb, component1, component2, pms); + compute_avgs_and_dirs_2_comp(pi, blk, component1, component2, pms); for (int i = 0; i < partition_count; i++) { @@ -216,21 +212,14 @@ static void compute_ideal_colors_and_weights_2_comp( for (int i = 0; i < texel_count; i++) { - if (error_weights[i] > 1e-10f) - { - int partition = pi.partition_of_texel[i]; - vfloat4 point = vfloat2(data_vr[i], data_vg[i]) * pms[partition].color_scale.swz<0, 1>(); - line2 l = lines[partition]; - float param = dot_s(point - l.a, l.b); - ei.weights[i] = param; - - lowparam[partition] = astc::min(param, lowparam[partition]); - highparam[partition] = astc::max(param, highparam[partition]); - } - else - { - ei.weights[i] = -1e38f; - } + int partition = pi.partition_of_texel[i]; + vfloat4 point = vfloat2(data_vr[i], data_vg[i]); + line2 l = lines[partition]; + float param = dot_s(point - l.a, l.b); + ei.weights[i] = param; + + lowparam[partition] = astc::min(param, lowparam[partition]); + highparam[partition] = astc::max(param, highparam[partition]); } vfloat4 lowvalues[BLOCK_MAX_PARTITIONS]; @@ -242,7 +231,7 @@ static void compute_ideal_colors_and_weights_2_comp( if (length < 0.0f) // Case for when none of the texels had any weight { lowparam[i] = 0.0f; - highparam[i] = 1e-7f; + highparam[i] = 0.0f; } // It is possible for a uniform-color partition to produce length=0; this causes NaN issues @@ -251,17 +240,11 @@ static void compute_ideal_colors_and_weights_2_comp( length_squared[i] = length * length; scale[i] = 1.0f / length; - vfloat4 ep0 = lines[i].a + lines[i].b * lowparam[i]; - vfloat4 ep1 = lines[i].a + lines[i].b * highparam[i]; - - ep0 = ep0.swz<0, 1>() / pms[i].color_scale; - - ep1 = ep1.swz<0, 1>() / pms[i].color_scale; - - lowvalues[i] = ep0; - highvalues[i] = ep1; + lowvalues[i] = lines[i].a + lines[i].b * lowparam[i]; + highvalues[i] = lines[i].a + lines[i].b * highparam[i]; } + // TODO: Merge this into loop above? vmask4 comp1_mask = vint4::lane_id() == vint4(component1); vmask4 comp2_mask = vint4::lane_id() == vint4(component2); for (int i = 0; i < partition_count; i++) @@ -274,7 +257,7 @@ static void compute_ideal_colors_and_weights_2_comp( } bool is_constant_wes = true; - float constant_wes = length_squared[pi.partition_of_texel[0]] * error_weights[0]; + float constant_wes = length_squared[pi.partition_of_texel[0]] * error_weight; for (int i = 0; i < texel_count; i++) { @@ -283,7 +266,7 @@ static void compute_ideal_colors_and_weights_2_comp( idx = astc::clamp1f(idx); ei.weights[i] = idx; - ei.weight_error_scale[i] = length_squared[partition] * error_weights[i]; + ei.weight_error_scale[i] = length_squared[partition] * error_weight; assert(!astc::isnan(ei.weight_error_scale[i])); is_constant_wes = is_constant_wes && ei.weight_error_scale[i] == constant_wes; @@ -305,7 +288,6 @@ static void compute_ideal_colors_and_weights_2_comp( * * @param bsd The block size information. * @param blk The image block color data to compress. - * @param ewb The image block weighted error data. * @param pi The partition info for the current trial. * @param[out] ei The computed ideal endpoints and weights. * @param omitted_component The color component excluded from the calculation. @@ -313,7 +295,6 @@ static void compute_ideal_colors_and_weights_2_comp( static void compute_ideal_colors_and_weights_3_comp( const block_size_descriptor& bsd, const image_block& blk, - const error_weight_block& ewb, const partition_info& pi, endpoints_and_weights& ei, unsigned int omitted_component @@ -327,34 +308,34 @@ static void compute_ideal_colors_and_weights_3_comp( partition_metrics pms[BLOCK_MAX_PARTITIONS]; - const float *error_weights; + float error_weight; const float* data_vr = nullptr; const float* data_vg = nullptr; const float* data_vb = nullptr; if (omitted_component == 0) { - error_weights = ewb.texel_weight_gba; + error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()) / 3.0f; data_vr = blk.data_g; data_vg = blk.data_b; data_vb = blk.data_a; } else if (omitted_component == 1) { - error_weights = ewb.texel_weight_rba; + error_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>()) / 3.0f; data_vr = blk.data_r; data_vg = blk.data_b; data_vb = blk.data_a; } else if (omitted_component == 2) { - error_weights = ewb.texel_weight_rga; + error_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>()) / 3.0f; data_vr = blk.data_r; data_vg = blk.data_g; data_vb = blk.data_a; } else { - error_weights = ewb.texel_weight_rgb; + error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()) / 3.0f; data_vr = blk.data_r; data_vg = blk.data_g; data_vb = blk.data_b; @@ -367,7 +348,7 @@ static void compute_ideal_colors_and_weights_3_comp( float scale[BLOCK_MAX_PARTITIONS]; float length_squared[BLOCK_MAX_PARTITIONS]; - compute_avgs_and_dirs_3_comp(pi, blk, ewb, omitted_component, pms); + compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms); for (unsigned int i = 0; i < partition_count; i++) { @@ -383,27 +364,20 @@ static void compute_ideal_colors_and_weights_3_comp( for (unsigned int i = 0; i < texel_count; i++) { - if (error_weights[i] > 1e-10f) - { - int partition = pi.partition_of_texel[i]; - vfloat4 point = vfloat3(data_vr[i], data_vg[i], data_vb[i]) * pms[partition].color_scale; - line3 l = lines[partition]; - float param = dot3_s(point - l.a, l.b); - ei.weights[i] = param; - - lowparam[partition] = astc::min(param, lowparam[partition]); - highparam[partition] = astc::max(param, highparam[partition]); - } - else - { - ei.weights[i] = -1e38f; - } + int partition = pi.partition_of_texel[i]; + vfloat4 point = vfloat3(data_vr[i], data_vg[i], data_vb[i]); + line3 l = lines[partition]; + float param = dot3_s(point - l.a, l.b); + ei.weights[i] = param; + + lowparam[partition] = astc::min(param, lowparam[partition]); + highparam[partition] = astc::max(param, highparam[partition]); } for (unsigned int i = 0; i < partition_count; i++) { float length = highparam[i] - lowparam[i]; - if (length < 0) // Case for when none of the texels had any weight + if (length < 0.0f) // Case for when none of the texels had any weight { lowparam[i] = 0.0f; highparam[i] = 1e-7f; @@ -412,16 +386,12 @@ static void compute_ideal_colors_and_weights_3_comp( // It is possible for a uniform-color partition to produce length=0; this causes NaN issues // so set to a small value to avoid this problem. length = astc::max(length, 1e-7f); - length_squared[i] = length * length; scale[i] = 1.0f / length; vfloat4 ep0 = lines[i].a + lines[i].b * lowparam[i]; vfloat4 ep1 = lines[i].a + lines[i].b * highparam[i]; - ep0 = ep0 * pms[i].icolor_scale; - ep1 = ep1 * pms[i].icolor_scale; - vfloat4 bmin = blk.data_min; vfloat4 bmax = blk.data_max; @@ -449,7 +419,7 @@ static void compute_ideal_colors_and_weights_3_comp( bool is_constant_wes = true; - float constant_wes = length_squared[pi.partition_of_texel[0]] * error_weights[0]; + float constant_wes = length_squared[pi.partition_of_texel[0]] * error_weight; for (unsigned int i = 0; i < texel_count; i++) { @@ -458,7 +428,7 @@ static void compute_ideal_colors_and_weights_3_comp( idx = astc::clamp1f(idx); ei.weights[i] = idx; - ei.weight_error_scale[i] = length_squared[partition] * error_weights[i]; + ei.weight_error_scale[i] = length_squared[partition] * error_weight; assert(!astc::isnan(ei.weight_error_scale[i])); is_constant_wes = is_constant_wes && ei.weight_error_scale[i] == constant_wes; @@ -480,18 +450,16 @@ static void compute_ideal_colors_and_weights_3_comp( * * @param bsd The block size information. * @param blk The image block color data to compress. - * @param ewb The image block weighted error data. * @param pi The partition info for the current trial. * @param[out] ei The computed ideal endpoints and weights. */ static void compute_ideal_colors_and_weights_4_comp( const block_size_descriptor& bsd, const image_block& blk, - const error_weight_block& ewb, const partition_info& pi, endpoints_and_weights& ei ) { - const float *error_weights = ewb.texel_weight; + const float error_weight = hadd_s(blk.channel_weight) / 4.0f; int partition_count = pi.partition_count; @@ -509,7 +477,7 @@ static void compute_ideal_colors_and_weights_4_comp( partition_metrics pms[BLOCK_MAX_PARTITIONS]; - compute_avgs_and_dirs_4_comp(pi, blk, ewb, pms); + compute_avgs_and_dirs_4_comp(pi, blk, pms); // If the direction points from light to dark then flip so ep0 is darkest for (int i = 0; i < partition_count; i++) @@ -526,50 +494,39 @@ static void compute_ideal_colors_and_weights_4_comp( for (int i = 0; i < texel_count; i++) { - if (error_weights[i] > 1e-10f) - { - int partition = pi.partition_of_texel[i]; + int partition = pi.partition_of_texel[i]; - vfloat4 point = blk.texel(i) * pms[partition].color_scale; - line4 l = lines[partition]; + vfloat4 point = blk.texel(i); + line4 l = lines[partition]; - float param = dot_s(point - l.a, l.b); - ei.weights[i] = param; + float param = dot_s(point - l.a, l.b); + ei.weights[i] = param; - lowparam[partition] = astc::min(param, lowparam[partition]); - highparam[partition] = astc::max(param, highparam[partition]); - } - else - { - ei.weights[i] = -1e38f; - } + lowparam[partition] = astc::min(param, lowparam[partition]); + highparam[partition] = astc::max(param, highparam[partition]); } for (int i = 0; i < partition_count; i++) { float length = highparam[i] - lowparam[i]; - if (length < 0) + if (length < 0.0f) // Case for when none of the texels had any weight { lowparam[i] = 0.0f; - highparam[i] = 1e-7f; + highparam[i] = 0.0f; } // It is possible for a uniform-color partition to produce length=0; this causes NaN issues // so set to a small value to avoid this problem. length = astc::max(length, 1e-7f); - length_squared[i] = length * length; scale[i] = 1.0f / length; - vfloat4 ep0 = lines[i].a + lines[i].b * lowparam[i]; - vfloat4 ep1 = lines[i].a + lines[i].b * highparam[i]; - - ei.ep.endpt0[i] = ep0 * pms[i].icolor_scale; - ei.ep.endpt1[i] = ep1 * pms[i].icolor_scale; + ei.ep.endpt0[i] = lines[i].a + lines[i].b * lowparam[i]; + ei.ep.endpt1[i] = lines[i].a + lines[i].b * highparam[i]; } bool is_constant_wes = true; - float constant_wes = length_squared[pi.partition_of_texel[0]] * error_weights[0]; + float constant_wes = length_squared[pi.partition_of_texel[0]] * error_weight; for (int i = 0; i < texel_count; i++) { @@ -578,7 +535,7 @@ static void compute_ideal_colors_and_weights_4_comp( idx = astc::clamp1f(idx); ei.weights[i] = idx; - ei.weight_error_scale[i] = error_weights[i] * length_squared[partition]; + ei.weight_error_scale[i] = length_squared[partition] * error_weight; assert(!astc::isnan(ei.weight_error_scale[i])); is_constant_wes = is_constant_wes && ei.weight_error_scale[i] == constant_wes; @@ -599,7 +556,6 @@ static void compute_ideal_colors_and_weights_4_comp( void compute_ideal_colors_and_weights_1plane( const block_size_descriptor& bsd, const image_block& blk, - const error_weight_block& ewb, const partition_info& pi, endpoints_and_weights& ei ) { @@ -607,11 +563,11 @@ void compute_ideal_colors_and_weights_1plane( if (uses_alpha) { - compute_ideal_colors_and_weights_4_comp(bsd, blk, ewb, pi, ei); + compute_ideal_colors_and_weights_4_comp(bsd, blk, pi, ei); } else { - compute_ideal_colors_and_weights_3_comp(bsd, blk, ewb, pi, ei, 3); + compute_ideal_colors_and_weights_3_comp(bsd, blk, pi, ei, 3); } } @@ -619,7 +575,6 @@ void compute_ideal_colors_and_weights_1plane( void compute_ideal_colors_and_weights_2planes( const block_size_descriptor& bsd, const image_block& blk, - const error_weight_block& ewb, unsigned int plane2_component, endpoints_and_weights& ei1, endpoints_and_weights& ei2 @@ -633,43 +588,43 @@ void compute_ideal_colors_and_weights_2planes( case 0: // Separate weights for red if (uses_alpha) { - compute_ideal_colors_and_weights_3_comp(bsd, blk, ewb, pi, ei1, 0); + compute_ideal_colors_and_weights_3_comp(bsd, blk, pi, ei1, 0); } else { - compute_ideal_colors_and_weights_2_comp(bsd, blk, ewb, pi, ei1, 1, 2); + compute_ideal_colors_and_weights_2_comp(bsd, blk, pi, ei1, 1, 2); } - compute_ideal_colors_and_weights_1_comp(bsd, blk, ewb, pi, ei2, 0); + compute_ideal_colors_and_weights_1_comp(bsd, blk, pi, ei2, 0); break; case 1: // Separate weights for green if (uses_alpha) { - compute_ideal_colors_and_weights_3_comp(bsd,blk, ewb, pi, ei1, 1); + compute_ideal_colors_and_weights_3_comp(bsd,blk, pi, ei1, 1); } else { - compute_ideal_colors_and_weights_2_comp(bsd, blk, ewb, pi, ei1, 0, 2); + compute_ideal_colors_and_weights_2_comp(bsd, blk, pi, ei1, 0, 2); } - compute_ideal_colors_and_weights_1_comp(bsd, blk, ewb, pi, ei2, 1); + compute_ideal_colors_and_weights_1_comp(bsd, blk, pi, ei2, 1); break; case 2: // Separate weights for blue if (uses_alpha) { - compute_ideal_colors_and_weights_3_comp(bsd, blk, ewb, pi, ei1, 2); + compute_ideal_colors_and_weights_3_comp(bsd, blk, pi, ei1, 2); } else { - compute_ideal_colors_and_weights_2_comp(bsd, blk, ewb, pi, ei1, 0, 1); + compute_ideal_colors_and_weights_2_comp(bsd, blk, pi, ei1, 0, 1); } - compute_ideal_colors_and_weights_1_comp(bsd, blk, ewb, pi, ei2, 2); + compute_ideal_colors_and_weights_1_comp(bsd, blk, pi, ei2, 2); break; default: // Separate weights for alpha assert(uses_alpha); - compute_ideal_colors_and_weights_3_comp(bsd, blk, ewb, pi, ei1, 3); - compute_ideal_colors_and_weights_1_comp(bsd, blk, ewb, pi, ei2, 3); + compute_ideal_colors_and_weights_3_comp(bsd, blk, pi, ei1, 3); + compute_ideal_colors_and_weights_1_comp(bsd, blk, pi, ei2, 3); break; } } @@ -1098,9 +1053,9 @@ static inline vfloat4 compute_rgbo_vector( } /* See header for documentation. */ +// TODO: Specialize for 1 partition? void recompute_ideal_colors_1plane( const image_block& blk, - const error_weight_block& ewb, const partition_info& pi, const decimation_info& di, int weight_quant_mode, @@ -1127,24 +1082,21 @@ void recompute_ideal_colors_1plane( for (int i = 0; i < partition_count; i++) { vfloat4 rgba_sum(1e-17f); - vfloat4 rgba_weight_sum(1e-17f); unsigned int texel_count = pi.partition_texel_count[i]; const uint8_t *texel_indexes = pi.texels_of_partition[i]; + // TODO: Use gathers? promise(texel_count > 0); for (unsigned int j = 0; j < texel_count; j++) { unsigned int tix = texel_indexes[j]; - - vfloat4 rgba = blk.texel(tix); - vfloat4 error_weight = ewb.error_weights[tix]; - - rgba_sum += rgba * error_weight; - rgba_weight_sum += error_weight; + rgba_sum += blk.texel(tix); } - vfloat4 scale_direction = normalize((rgba_sum * (1.0f / rgba_weight_sum)).swz<0, 1, 2>()); + rgba_sum = rgba_sum * blk.channel_weight; + vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast(texel_count), 1e-17f); + vfloat4 scale_dir = normalize((rgba_sum / rgba_weight_sum).swz<0, 1, 2>()); float scale_max = 0.0f; float scale_min = 1e10f; @@ -1152,28 +1104,25 @@ void recompute_ideal_colors_1plane( float wmin1 = 1.0f; float wmax1 = 0.0f; - vfloat4 left_sum = vfloat4::zero(); - vfloat4 middle_sum = vfloat4::zero(); - vfloat4 right_sum = vfloat4::zero(); - vfloat4 lmrs_sum = vfloat4::zero(); + float left_sum_s = 0.0f; + float middle_sum_s = 0.0f; + float right_sum_s = 0.0f; vfloat4 color_vec_x = vfloat4::zero(); vfloat4 color_vec_y = vfloat4::zero(); vfloat4 scale_vec = vfloat4::zero(); - vfloat4 weight_weight_sum = vfloat4(1e-17f); - float psum = 1e-17f; + float weight_weight_sum_s = 1e-17f; + + vfloat4 color_weight = blk.channel_weight; + float ls_weight = hadd_rgb_s(color_weight); for (unsigned int j = 0; j < texel_count; j++) { unsigned int tix = texel_indexes[j]; vfloat4 rgba = blk.texel(tix); - vfloat4 color_weight = ewb.error_weights[tix]; - - // TODO: Move this calculation out to the color block? - float ls_weight = hadd_rgb_s(color_weight); float idx0; if (!is_decimated) @@ -1190,54 +1139,41 @@ void recompute_ideal_colors_1plane( wmin1 = astc::min(idx0, wmin1); wmax1 = astc::max(idx0, wmax1); - float scale = dot3_s(scale_direction, rgba); + float scale = dot3_s(scale_dir, rgba); scale_min = astc::min(scale, scale_min); scale_max = astc::max(scale, scale_max); - vfloat4 left = color_weight * (om_idx0 * om_idx0); - vfloat4 middle = color_weight * (om_idx0 * idx0); - vfloat4 right = color_weight * (idx0 * idx0); - - vfloat4 lmrs = vfloat3(om_idx0 * om_idx0, - om_idx0 * idx0, - idx0 * idx0) * ls_weight; - - left_sum += left; - middle_sum += middle; - right_sum += right; - lmrs_sum += lmrs; + left_sum_s += om_idx0 * om_idx0; + middle_sum_s += om_idx0 * idx0; + right_sum_s += idx0 * idx0; + weight_weight_sum_s += idx0; vfloat4 color_idx(idx0); - vfloat4 cwprod = color_weight * rgba; + vfloat4 cwprod = rgba; vfloat4 cwiprod = cwprod * color_idx; color_vec_y += cwiprod; color_vec_x += cwprod - cwiprod; - scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale); - weight_weight_sum += color_weight * color_idx; - psum += dot3_s(color_weight * color_idx, color_idx); + scale_vec += vfloat2(om_idx0, idx0) * (scale * ls_weight); } - // Calculations specific to mode #7, the HDR RGB-scale mode - vfloat4 rgbq_sum = color_vec_x + color_vec_y; - rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y)); + vfloat4 left_sum = vfloat4(left_sum_s) * color_weight; + vfloat4 middle_sum = vfloat4(middle_sum_s) * color_weight; + vfloat4 right_sum = vfloat4(right_sum_s) * color_weight; + vfloat4 lmrs_sum = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight; - vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, - rgbq_sum, psum); - rgbo_vectors[i] = rgbovec; + vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight; + float psum = right_sum_s * hadd_rgb_s(color_weight); - // We will occasionally get a failure due to the use of a singular (non-invertible) matrix. - // Record whether such a failure has taken place; if it did, compute rgbo_vectors[] with a - // different method later - float chkval = dot_s(rgbovec, rgbovec); - int rgbo_fail = chkval != chkval; + color_vec_x = color_vec_x * color_weight; + color_vec_y = color_vec_y * color_weight; // Initialize the luminance and scale vectors with a reasonable default float scalediv = scale_min * (1.0f / astc::max(scale_max, 1e-10f)); scalediv = astc::clamp1f(scalediv); - vfloat4 sds = scale_direction * scale_max; + vfloat4 sds = scale_dir * scale_max; rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv); @@ -1245,7 +1181,7 @@ void recompute_ideal_colors_1plane( { // If all weights in the partition were equal, then just take average of all colors in // the partition and use that as both endpoint colors - vfloat4 avg = (color_vec_x + color_vec_y) * (1.0f / rgba_weight_sum); + vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum; vmask4 notnan_mask = avg == avg; ep.endpt0[i] = select(ep.endpt0[i], avg, notnan_mask); @@ -1287,13 +1223,21 @@ void recompute_ideal_colors_1plane( if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1) { float scalediv2 = scale_ep0 * (1.0f / scale_ep1); - vfloat4 sdsm = scale_direction * scale_ep1; + vfloat4 sdsm = scale_dir * scale_ep1; rgbs_vectors[i] = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2); } } - // If the calculation of an RGB-offset vector failed, try to compute a value another way - if (rgbo_fail) + // Calculations specific to mode #7, the HDR RGB-scale mode + vfloat4 rgbq_sum = color_vec_x + color_vec_y; + rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y)); + + vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum); + rgbo_vectors[i] = rgbovec; + + // We can get a failure due to the use of a singular (non-invertible) matrix + // If it failed, compute rgbo_vectors[] with a different method ... + if (astc::isnan(dot_s(rgbovec, rgbovec))) { vfloat4 v0 = ep.endpt0[i]; vfloat4 v1 = ep.endpt1[i]; @@ -1303,7 +1247,6 @@ void recompute_ideal_colors_1plane( vfloat4 avg = (v0 + v1) * 0.5f; vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f; - rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif); } } @@ -1312,7 +1255,6 @@ void recompute_ideal_colors_1plane( /* See header for documentation. */ void recompute_ideal_colors_2planes( const image_block& blk, - const error_weight_block& ewb, const block_size_descriptor& bsd, const decimation_info& di, int weight_quant_mode, @@ -1340,28 +1282,26 @@ void recompute_ideal_colors_2planes( dec_weights_quant_uvalue_plane2[i] = qat->unquantized_value[dec_weights_quant_pvalue_plane2[i]] * (1.0f / 64.0f); } - vfloat4 rgba_sum = ewb.block_error_weighted_rgba_sum; - vfloat4 rgba_weight_sum = ewb.block_error_weight_sum; - unsigned int texel_count = bsd.texel_count; - vfloat4 scale_direction = normalize((rgba_sum * (1.0f / rgba_weight_sum)).swz<0, 1, 2>()); + vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast(texel_count), 1e-17f); + vfloat4 scale_dir = normalize(blk.data_mean.swz<0, 1, 2>()); float scale_max = 0.0f; float scale_min = 1e10f; float wmin1 = 1.0f; float wmax1 = 0.0f; + float wmin2 = 1.0f; float wmax2 = 0.0f; - vfloat4 left_sum = vfloat4::zero(); - vfloat4 middle_sum = vfloat4::zero(); - vfloat4 right_sum = vfloat4::zero(); + float left1_sum_s = 0.0f; + float middle1_sum_s = 0.0f; + float right1_sum_s = 0.0f; - vfloat4 left2_sum = vfloat4::zero(); - vfloat4 middle2_sum = vfloat4::zero(); - vfloat4 right2_sum = vfloat4::zero(); - vfloat4 lmrs_sum = vfloat4::zero(); + float left2_sum_s = 0.0f; + float middle2_sum_s = 0.0f; + float right2_sum_s = 0.0f; vfloat4 color_vec_x = vfloat4::zero(); vfloat4 color_vec_y = vfloat4::zero(); @@ -1369,15 +1309,14 @@ void recompute_ideal_colors_2planes( vfloat4 scale_vec = vfloat4::zero(); vfloat4 weight_weight_sum = vfloat4(1e-17f); - float psum = 1e-17f; + + vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component); + vfloat4 color_weight = blk.channel_weight; + float ls_weight = hadd_rgb_s(color_weight); for (unsigned int j = 0; j < texel_count; j++) { vfloat4 rgba = blk.texel(j); - vfloat4 color_weight = ewb.error_weights[j]; - - // TODO: Move this calculation out to the color block? - float ls_weight = hadd_rgb_s(color_weight); float idx0; if (!is_decimated) @@ -1394,22 +1333,13 @@ void recompute_ideal_colors_2planes( wmin1 = astc::min(idx0, wmin1); wmax1 = astc::max(idx0, wmax1); - float scale = dot3_s(scale_direction, rgba); + float scale = dot3_s(scale_dir, rgba); scale_min = astc::min(scale, scale_min); scale_max = astc::max(scale, scale_max); - vfloat4 left = color_weight * (om_idx0 * om_idx0); - vfloat4 middle = color_weight * (om_idx0 * idx0); - vfloat4 right = color_weight * (idx0 * idx0); - - vfloat4 lmrs = vfloat3(om_idx0 * om_idx0, - om_idx0 * idx0, - idx0 * idx0) * ls_weight; - - left_sum += left; - middle_sum += middle; - right_sum += right; - lmrs_sum += lmrs; + left1_sum_s += om_idx0 * om_idx0; + middle1_sum_s += om_idx0 * idx0; + right1_sum_s += idx0 * idx0; float idx1; if (!is_decimated) @@ -1426,18 +1356,13 @@ void recompute_ideal_colors_2planes( wmin2 = astc::min(idx1, wmin2); wmax2 = astc::max(idx1, wmax2); - vfloat4 left2 = color_weight * (om_idx1 * om_idx1); - vfloat4 middle2 = color_weight * (om_idx1 * idx1); - vfloat4 right2 = color_weight * (idx1 * idx1); - - left2_sum += left2; - middle2_sum += middle2; - right2_sum += right2; + left2_sum_s += om_idx1 * om_idx1; + middle2_sum_s += om_idx1 * idx1; + right2_sum_s += idx1 * idx1; - vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component); vfloat4 color_idx = select(vfloat4(idx0), vfloat4(idx1), p2_mask); - vfloat4 cwprod = color_weight * rgba; + vfloat4 cwprod = rgba; vfloat4 cwiprod = cwprod * color_idx; color_vec_y += cwiprod; @@ -1445,26 +1370,27 @@ void recompute_ideal_colors_2planes( scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale); weight_weight_sum += (color_weight * color_idx); - psum += dot3_s(color_weight * color_idx, color_idx); } - // Calculations specific to mode #7, the HDR RGB-scale mode - vfloat4 rgbq_sum = color_vec_x + color_vec_y; - rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y)); + vfloat4 left1_sum = vfloat4(left1_sum_s) * color_weight; + vfloat4 middle1_sum = vfloat4(middle1_sum_s) * color_weight; + vfloat4 right1_sum = vfloat4(right1_sum_s) * color_weight; + vfloat4 lmrs_sum = vfloat3(left1_sum_s, middle1_sum_s, right1_sum_s) * ls_weight; - rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum); + vfloat4 left2_sum = vfloat4(left2_sum_s) * color_weight; + vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight; + vfloat4 right2_sum = vfloat4(right2_sum_s) * color_weight; + + float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight); - // We will occasionally get a failure due to the use of a singular (non-invertible) matrix. - // Record whether such a failure has taken place; if it did, compute rgbo_vectors[] with a - // different method later - float chkval = dot_s(rgbo_vector, rgbo_vector); - int rgbo_fail = chkval != chkval; + color_vec_x = color_vec_x * color_weight; + color_vec_y = color_vec_y * color_weight; // Initialize the luminance and scale vectors with a reasonable default float scalediv = scale_min * (1.0f / astc::max(scale_max, 1e-10f)); scalediv = astc::clamp1f(scalediv); - vfloat4 sds = scale_direction * scale_max; + vfloat4 sds = scale_dir * scale_max; rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv); @@ -1472,7 +1398,7 @@ void recompute_ideal_colors_2planes( { // If all weights in the partition were equal, then just take average of all colors in // the partition and use that as both endpoint colors - vfloat4 avg = (color_vec_x + color_vec_y) * (1.0f / rgba_weight_sum); + vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum; vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component); vmask4 notnan_mask = avg == avg; @@ -1487,22 +1413,22 @@ void recompute_ideal_colors_2planes( { // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given // set of texel weights and pixel colors - vfloat4 color_det1 = (left_sum * right_sum) - (middle_sum * middle_sum); + vfloat4 color_det1 = (left1_sum * right1_sum) - (middle1_sum * middle1_sum); vfloat4 color_rdet1 = 1.0f / color_det1; float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>()); float ls_rdet1 = 1.0f / ls_det1; - vfloat4 color_mss1 = (left_sum * left_sum) - + (2.0f * middle_sum * middle_sum) - + (right_sum * right_sum); + vfloat4 color_mss1 = (left1_sum * left1_sum) + + (2.0f * middle1_sum * middle1_sum) + + (right1_sum * right1_sum); float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>()) + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>()) + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>()); - vfloat4 ep0 = (right_sum * color_vec_x - middle_sum * color_vec_y) * color_rdet1; - vfloat4 ep1 = (left_sum * color_vec_y - middle_sum * color_vec_x) * color_rdet1; + vfloat4 ep0 = (right1_sum * color_vec_x - middle1_sum * color_vec_y) * color_rdet1; + vfloat4 ep1 = (left1_sum * color_vec_y - middle1_sum * color_vec_x) * color_rdet1; float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1; float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1; @@ -1518,7 +1444,7 @@ void recompute_ideal_colors_2planes( if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1) { float scalediv2 = scale_ep0 * (1.0f / scale_ep1); - vfloat4 sdsm = scale_direction * scale_ep1; + vfloat4 sdsm = scale_dir * scale_ep1; rgbs_vector = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2); } } @@ -1527,9 +1453,8 @@ void recompute_ideal_colors_2planes( { // If all weights in the partition were equal, then just take average of all colors in // the partition and use that as both endpoint colors - vfloat4 avg = (color_vec_x + color_vec_y) * (1.0f / rgba_weight_sum); + vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum; - vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component); vmask4 notnan_mask = avg == avg; vmask4 full_mask = p2_mask & notnan_mask; @@ -1550,7 +1475,6 @@ void recompute_ideal_colors_2planes( vfloat4 ep0 = (right2_sum * color_vec_x - middle2_sum * color_vec_y) * color_rdet2; vfloat4 ep1 = (left2_sum * color_vec_y - middle2_sum * color_vec_x) * color_rdet2; - vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component); vmask4 det_mask = abs(color_det2) > (color_mss2 * 1e-4f); vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1); vmask4 full_mask = p2_mask & det_mask & notnan_mask; @@ -1559,8 +1483,15 @@ void recompute_ideal_colors_2planes( ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask); } - // If the calculation of an RGB-offset vector failed, try to compute a value another way - if (rgbo_fail) + // Calculations specific to mode #7, the HDR RGB-scale mode + vfloat4 rgbq_sum = color_vec_x + color_vec_y; + rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y)); + + rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum); + + // We can get a failure due to the use of a singular (non-invertible) matrix + // If it failed, compute rgbo_vectors[] with a different method ... + if (astc::isnan(dot_s(rgbo_vector, rgbo_vector))) { vfloat4 v0 = ep.endpt0[0]; vfloat4 v1 = ep.endpt1[0]; diff --git a/lib/astc-encoder/Source/astcenc_image.cpp b/lib/astc-encoder/Source/astcenc_image.cpp index f4c8e00f96..47af5714a7 100644 --- a/lib/astc-encoder/Source/astcenc_image.cpp +++ b/lib/astc-encoder/Source/astcenc_image.cpp @@ -173,6 +173,8 @@ void fetch_image_block( int idx = 0; vfloat4 data_min(1e38f); + vfloat4 data_mean(0.0f); + vfloat4 data_mean_scale(1.0f / static_cast(bsd.texel_count)); vfloat4 data_max(-1e38f); bool grayscale = true; @@ -225,6 +227,7 @@ void fetch_image_block( // Compute block metadata data_min = min(data_min, datav); + data_mean += datav * data_mean_scale; data_max = max(data_max, datav); if (grayscale && (datav.lane<0>() != datav.lane<1>() || datav.lane<0>() != datav.lane<2>())) @@ -259,6 +262,7 @@ void fetch_image_block( // Store block metadata blk.data_min = data_min; + blk.data_mean = data_mean; blk.data_max = data_max; blk.grayscale = grayscale; } diff --git a/lib/astc-encoder/Source/astcenc_internal.h b/lib/astc-encoder/Source/astcenc_internal.h index cf31cce6dd..5981fd1d02 100644 --- a/lib/astc-encoder/Source/astcenc_internal.h +++ b/lib/astc-encoder/Source/astcenc_internal.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2021 Arm Limited +// Copyright 2011-2022 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -60,7 +60,7 @@ #define promise(cond) if(!(cond)) { __builtin_unreachable(); } #endif #else - #define promise(cond) assert(cond); + #define promise(cond) assert(cond) #endif /* ============================================================================ @@ -447,9 +447,10 @@ static inline unsigned int get_quant_level(quant_method method) case QUANT_160: return 160; case QUANT_192: return 192; case QUANT_256: return 256; - // Unreachable - the enum is fully described - default: return 0; } + + // Unreachable - the enum is fully described + return 0; } /** @@ -457,15 +458,6 @@ static inline unsigned int get_quant_level(quant_method method) */ struct partition_metrics { - /** @brief The sum of the error weights for texels in this partition. */ - vfloat4 error_weight; - - /** @brief The color scale factor used to weight color channels. */ - vfloat4 color_scale; - - /** @brief The 1 / color_scale used to avoid divisions. */ - vfloat4 icolor_scale; - /** @brief The error-weighted average color in the partition. */ vfloat4 avg; @@ -818,10 +810,16 @@ struct image_block /** @brief The min component value of all texels in the block. */ vfloat4 data_min; + /** @brief The mean component value of all texels in the block. */ + vfloat4 data_mean; + /** @brief The max component value of all texels in the block. */ vfloat4 data_max; - /** @brief Is this greyscale block where R == G == B for all texels? */ + /** @brief The relative error significance of the color channels. */ + vfloat4 channel_weight; + + /** @brief Is this grayscale block where R == G == B for all texels? */ bool grayscale; /** @brief Set to 1 if a texel is using HDR RGB endpoints (decompression only). */ @@ -923,85 +921,6 @@ struct image_block } }; -/** - * @brief Data structure representing per-texel and per-component error weights for a block. - * - * This structure stores a multiplier for the error weight to apply to each component when computing - * block errors. This can be used as a general purpose technique to to amplify or diminish the - * significance of texels and individual color components, based on what is being stored and the - * compressor heuristics. It can be applied in many different ways, some of which are outlined in - * the description below (this is not exhaustive). - * - * For blocks that span the edge of the texture, the weighting for texels outside of the texture - * bounds can zeroed to maximize the quality of the texels inside the texture. - * - * For textures storing fewer than 4 components the weighting for color components that are unused - * can be zeroed to maximize the quality of the components that are used. This is particularly - * important for two component textures, which must be imported in LLLA format to match the two - * component endpoint encoding. Without manual component weighting to correct significance the "L" - * would be treated as three times more important than A because of the replication. - * - * For HDR textures we can use perceptual weighting which os approximately inverse to the luminance - * of a texel. - * - * For normal maps we can use perceptual weighting which assigns higher weight to low-variability - * regions than to high-variability regions, ensuring smooth surfaces don't pick up artifacts. - * - * For transparent texels we can multiply the RGB weights by the alpha value, ensuring that - * the least transprent texels maintain the highest accuracy. - */ -struct error_weight_block -{ - /** @brief Block error weighted RGBA sum for whole block / 1 partition. */ - vfloat4 block_error_weighted_rgba_sum; - - /** @brief Block error sum for whole block / 1 partition. */ - vfloat4 block_error_weight_sum; - - /** @brief The full per texel per component error weights. */ - vfloat4 error_weights[BLOCK_MAX_TEXELS]; - - - /** @brief The full per texel per component error weights. */ - float texel_weight[BLOCK_MAX_TEXELS]; - - - /** @brief The average of the GBA error weights per texel. */ - float texel_weight_gba[BLOCK_MAX_TEXELS]; - - /** @brief The average of the RBA error weights per texel. */ - float texel_weight_rba[BLOCK_MAX_TEXELS]; - - /** @brief The average of the RGA error weights per texel. */ - float texel_weight_rga[BLOCK_MAX_TEXELS]; - - /** @brief The average of the RGB error weights per texel. */ - float texel_weight_rgb[BLOCK_MAX_TEXELS]; - - - /** @brief The average of the RG error weights per texel. */ - float texel_weight_rg[BLOCK_MAX_TEXELS]; - - /** @brief The average of the RB error weights per texel. */ - float texel_weight_rb[BLOCK_MAX_TEXELS]; - - /** @brief The average of the GB error weights per texel. */ - float texel_weight_gb[BLOCK_MAX_TEXELS]; - - - /** @brief The individual R component error weights per texel. */ - float texel_weight_r[BLOCK_MAX_TEXELS]; - - /** @brief The individual G component error weights per texel. */ - float texel_weight_g[BLOCK_MAX_TEXELS]; - - /** @brief The individual B component error weights per texel. */ - float texel_weight_b[BLOCK_MAX_TEXELS]; - - /** @brief The individual A component error weights per texel. */ - float texel_weight_a[BLOCK_MAX_TEXELS]; -}; - /** * @brief Data structure storing the color endpoints for a block. */ @@ -1076,9 +995,6 @@ struct alignas(ASTCENC_VECALIGN) compression_working_buffers /** @brief Ideal decimated endpoints and weights for plane 2. */ endpoints_and_weights eix2[WEIGHTS_MAX_DECIMATION_MODES]; - /** @brief The error weight block for the current thread. */ - error_weight_block ewb; - /** * @brief Decimated ideal weight values. * @@ -1240,26 +1156,17 @@ struct physical_compressed_block * This function takes a structure to avoid spilling arguments to the stack on every function * invocation, as there are a lot of parameters. */ -struct pixel_region_variance_args +struct pixel_region_args { /** @brief The image to analyze. */ const astcenc_image* img; - /** @brief The RGB component power adjustment. */ - float rgb_power; - - /** @brief The alpha component power adjustment. */ - float alpha_power; - /** @brief The component swizzle pattern. */ astcenc_swizzle swz; /** @brief Should the algorithm bother with Z axis processing? */ bool have_z; - /** @brief The kernel radius for average and variance. */ - unsigned int avg_var_kernel_radius; - /** @brief The kernel radius for alpha processing. */ unsigned int alpha_kernel_radius; @@ -1286,12 +1193,12 @@ struct pixel_region_variance_args }; /** - * @brief Parameter structure for @c compute_averages_and_variances_proc(). + * @brief Parameter structure for @c compute_averages_proc(). */ -struct avg_var_args +struct avg_args { /** @brief The arguments for the nested variance computation. */ - pixel_region_variance_args arg; + pixel_region_args arg; // The above has a reference to the image altread? /** @brief The image X dimensions. */ @@ -1338,28 +1245,21 @@ struct astcenc_context * large structure size are omitted. */ - /** @brief The input images averages table, may be @c nullptr if not needed. */ - vfloat4 *input_averages; - - /** @brief The input image RGBA channel variances table, may be @c nullptr if not needed. */ - vfloat4 *input_variances; - - /** @brief The input image alpha channel variances table, may be @c nullptr if not needed. */ + /** @brief The input image alpha channel averages table, may be @c nullptr if not needed. */ float *input_alpha_averages; - /** @brief The scratch workign buffers, one per thread (see @c thread_count). */ compression_working_buffers* working_buffers; #if !defined(ASTCENC_DECOMPRESS_ONLY) /** @brief The pixel region and variance worker arguments. */ - avg_var_args avg_var_preprocess_args; + avg_args avg_preprocess_args; /** @brief The per-texel deblocking weights for the current block size. */ float deblock_weights[BLOCK_MAX_TEXELS]; - /** @brief The parallel manager for averages and variances computation. */ - ParallelManager manage_avg_var; + /** @brief The parallel manager for averages computation. */ + ParallelManager manage_avg; /** @brief The parallel manager for compression. */ ParallelManager manage_compress; @@ -1549,7 +1449,6 @@ unsigned int get_ise_sequence_bitcount( * * @param pi The partition info for the current trial. * @param blk The image block color data to be compressed. - * @param ewb The image block weighted error data. * @param component1 The first component included in the analysis. * @param component2 The second component included in the analysis. * @param[out] pm The output partition metrics. @@ -1559,7 +1458,6 @@ unsigned int get_ise_sequence_bitcount( void compute_avgs_and_dirs_2_comp( const partition_info& pi, const image_block& blk, - const error_weight_block& ewb, unsigned int component1, unsigned int component2, partition_metrics pm[BLOCK_MAX_PARTITIONS]); @@ -1569,7 +1467,6 @@ void compute_avgs_and_dirs_2_comp( * * @param pi The partition info for the current trial. * @param blk The image block color data to be compressed. - * @param ewb The image block weighted error data. * @param omitted_component The component excluded from the analysis. * @param[out] pm The output partition metrics. * - Only pi.partition_count array entries actually get initialized. @@ -1578,7 +1475,6 @@ void compute_avgs_and_dirs_2_comp( void compute_avgs_and_dirs_3_comp( const partition_info& pi, const image_block& blk, - const error_weight_block& ewb, unsigned int omitted_component, partition_metrics pm[BLOCK_MAX_PARTITIONS]); @@ -1590,7 +1486,6 @@ void compute_avgs_and_dirs_3_comp( * * @param pi The partition info for the current trial. * @param blk The image block color data to be compressed. - * @param ewb The image block weighted error data. * @param[out] pm The output partition metrics. * - Only pi.partition_count array entries actually get initialized. * - Direction vectors @c pm.dir are not normalized. @@ -1598,7 +1493,6 @@ void compute_avgs_and_dirs_3_comp( void compute_avgs_and_dirs_3_comp_rgb( const partition_info& pi, const image_block& blk, - const error_weight_block& ewb, partition_metrics pm[BLOCK_MAX_PARTITIONS]); /** @@ -1606,7 +1500,6 @@ void compute_avgs_and_dirs_3_comp_rgb( * * @param pi The partition info for the current trial. * @param blk The image block color data to be compressed. - * @param ewb The image block weighted error data. * @param[out] pm The output partition metrics. * - Only pi.partition_count array entries actually get initialized. * - Direction vectors @c pm.dir are not normalized. @@ -1614,7 +1507,6 @@ void compute_avgs_and_dirs_3_comp_rgb( void compute_avgs_and_dirs_4_comp( const partition_info& pi, const image_block& blk, - const error_weight_block& ewb, partition_metrics pm[BLOCK_MAX_PARTITIONS]); /** @@ -1629,7 +1521,6 @@ void compute_avgs_and_dirs_4_comp( * * @param pi The partition info for the current trial. * @param blk The image block color data to be compressed. - * @param ewb The image block weighted error data. * @param[in,out] plines Processed line inputs, and line length outputs. * @param[out] uncor_error The cumulative error for using the uncorrelated line. * @param[out] samec_error The cumulative error for using the same chroma line. @@ -1637,7 +1528,6 @@ void compute_avgs_and_dirs_4_comp( void compute_error_squared_rgb( const partition_info& pi, const image_block& blk, - const error_weight_block& ewb, partition_lines3 plines[BLOCK_MAX_PARTITIONS], float& uncor_error, float& samec_error); @@ -1654,7 +1544,6 @@ void compute_error_squared_rgb( * * @param pi The partition info for the current trial. * @param blk The image block color data to be compressed. - * @param ewb The image block weighted error data. * @param uncor_plines Processed uncorrelated partition lines for each partition. * @param samec_plines Processed same chroma partition lines for each partition. * @param[out] uncor_lengths The length of each components deviation from the line. @@ -1665,7 +1554,6 @@ void compute_error_squared_rgb( void compute_error_squared_rgba( const partition_info& pi, const image_block& blk, - const error_weight_block& ewb, const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS], const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS], float uncor_lengths[BLOCK_MAX_PARTITIONS], @@ -1676,73 +1564,62 @@ void compute_error_squared_rgba( /** * @brief Find the best set of partitions to trial for a given block. * - * On return @c best_partition_uncor contains the best partition assuming data has uncorrelated - * chroma, @c best_partition_samec contains the best partition assuming data has corelated chroma. + * On return the @c best_partitions list will contain the two best partition + * candidates; one assuming data has uncorrelated chroma and one assuming the + * data has corelated chroma. The best candidate is returned first in the list. * * @param bsd The block size information. * @param blk The image block color data to compress. - * @param ewb The image block weighted error data. * @param partition_count The number of partitions in the block. * @param partition_search_limit The number of candidate partition encodings to trial. - * @param[out] best_partition_uncor The best partition for uncorrelated chroma. - * @param[out] best_partition_samec The best partition for correlated chroma. + * @param[out] best_partitions The best partition candidates. */ void find_best_partition_candidates( const block_size_descriptor& bsd, const image_block& blk, - const error_weight_block& ewb, unsigned int partition_count, unsigned int partition_search_limit, - unsigned int& best_partition_uncor, - unsigned int& best_partition_samec); + unsigned int best_partitions[2]); /* ============================================================================ Functionality for managing images and image related data. ============================================================================ */ /** - * @brief Setup computation of regional averages and variances in an image. + * @brief Setup computation of regional averages in an image. * * This must be done by only a single thread per image, before any thread calls - * @c compute_averages_and_variances(). + * @c compute_averages(). * - * Results are written back into @c img->input_averages, @c img->input_variances, - * and @c img->input_alpha_averages. + * Results are written back into @c img->input_alpha_averages. * * @param img The input image data, also holds output data. - * @param rgb_power The RGB component power. - * @param alpha_power The A component power. - * @param avg_var_kernel_radius The kernel radius (in pixels) for avg and var. * @param alpha_kernel_radius The kernel radius (in pixels) for alpha mods. * @param swz Input data component swizzle. * @param[out] ag The average variance arguments to init. * * @return The number of tasks in the processing stage. */ -unsigned int init_compute_averages_and_variances( +unsigned int init_compute_averages( const astcenc_image& img, - float rgb_power, - float alpha_power, - unsigned int avg_var_kernel_radius, unsigned int alpha_kernel_radius, const astcenc_swizzle& swz, - avg_var_args& ag); + avg_args& ag); /** - * @brief Compute regional averages and variances. + * @brief Compute regional averages in an image. * - * This function can be called by multiple threads, but only after a single thread calls the setup - * function @c init_compute_averages_and_variances(). + * This function can be called by multiple threads, but only after a single + * thread calls the setup function @c init_compute_averages(). * - * Results are written back into @c img->input_averages, @c img->input_variances, - * and @c img->input_alpha_averages. + * Results are written back into @c img->input_alpha_averages. * * @param[out] ctx The context. * @param ag The average and variance arguments created during setup. */ -void compute_averages_and_variances( +void compute_averages( astcenc_context& ctx, - const avg_var_args& ag); + const avg_args& ag); /** * @brief Fetch a single image block from the input image @@ -1799,14 +1676,12 @@ void write_image_block( * * @param bsd The block size information. * @param blk The image block color data to compress. - * @param ewb The image block weighted error data. * @param pi The partition info for the current trial. * @param[out] ei The endpoint and weight values. */ void compute_ideal_colors_and_weights_1plane( const block_size_descriptor& bsd, const image_block& blk, - const error_weight_block& ewb, const partition_info& pi, endpoints_and_weights& ei); @@ -1819,7 +1694,6 @@ void compute_ideal_colors_and_weights_1plane( * * @param bsd The block size information. * @param blk The image block color data to compress. - * @param ewb The image block weighted error data. * @param plane2_component The component assigned to plane 2. * @param[out] ei1 The endpoint and weight values for plane 1. * @param[out] ei2 The endpoint and weight values for plane 2. @@ -1827,7 +1701,6 @@ void compute_ideal_colors_and_weights_1plane( void compute_ideal_colors_and_weights_2planes( const block_size_descriptor& bsd, const image_block& blk, - const error_weight_block& ewb, unsigned int plane2_component, endpoints_and_weights& ei1, endpoints_and_weights& ei2); @@ -2054,7 +1927,6 @@ void unpack_weights( * @param bsd The block size information. * @param pi The partition info for the current trial. * @param blk The image block color data to compress. - * @param ewb The image block weighted error data. * @param ep The ideal endpoints. * @param qwt_bitcounts Bit counts for different quantization methods. * @param qwt_errors Errors for different quantization methods. @@ -2070,7 +1942,6 @@ unsigned int compute_ideal_endpoint_formats( const block_size_descriptor& bsd, const partition_info& pi, const image_block& blk, - const error_weight_block& ewb, const endpoints& ep, const int* qwt_bitcounts, const float* qwt_errors, @@ -2087,7 +1958,6 @@ unsigned int compute_ideal_endpoint_formats( * recompute the ideal colors for a specific weight set. * * @param blk The image block color data to compress. - * @param ewb The image block weighted error data. * @param pi The partition info for the current trial. * @param di The weight grid decimation table. * @param weight_quant_mode The weight grid quantization level. @@ -2098,7 +1968,6 @@ unsigned int compute_ideal_endpoint_formats( */ void recompute_ideal_colors_1plane( const image_block& blk, - const error_weight_block& ewb, const partition_info& pi, const decimation_info& di, int weight_quant_mode, @@ -2114,7 +1983,6 @@ void recompute_ideal_colors_1plane( * recompute the ideal colors for a specific weight set. * * @param blk The image block color data to compress. - * @param ewb The image block weighted error data. * @param bsd The block_size descriptor. * @param di The weight grid decimation table. * @param weight_quant_mode The weight grid quantization level. @@ -2127,7 +1995,6 @@ void recompute_ideal_colors_1plane( */ void recompute_ideal_colors_2planes( const image_block& blk, - const error_weight_block& ewb, const block_size_descriptor& bsd, const decimation_info& di, int weight_quant_mode, @@ -2138,19 +2005,6 @@ void recompute_ideal_colors_2planes( vfloat4& rgbo_vector, int plane2_component); -/** - * @brief Expand the deblock weights based on the config deblocking parameter. - * - * The approach to deblocking is a general purpose approach which elevates the error weight - * significance of texels closest to the block periphery. This function computes the deblock weights - * for each texel, which can be mixed on a block-by-block basis with the other error weighting - * parameters to compute a specific per-texel weight for a trial. - * - * @param[in,out] ctx The context to expand. - */ -void expand_deblock_weights( - astcenc_context& ctx); - /** * @brief Expand the angular tables needed for the alternative to PCA that we use. */ @@ -2206,14 +2060,12 @@ void compute_angular_endpoints_2planes( * @brief Compress an image block into a physical block. * * @param ctx The compressor context and configuration. - * @param image The input image information. * @param blk The image block color data to compress. * @param[out] pcb The physical compressed block output. * @param[out] tmpbuf Preallocated scratch buffers for the compressor. */ void compress_block( const astcenc_context& ctx, - const astcenc_image& image, const image_block& blk, physical_compressed_block& pcb, compression_working_buffers& tmpbuf); @@ -2246,7 +2098,6 @@ void decompress_symbolic_block( * @param bsd The block size information. * @param scb The symbolic compressed encoding. * @param blk The original image block color data. - * @param ewb The error weight block data. * * @return Returns the computed error, or a negative value if the encoding * should be rejected for any reason. @@ -2255,8 +2106,7 @@ float compute_symbolic_block_difference( const astcenc_config& config, const block_size_descriptor& bsd, const symbolic_compressed_block& scb, - const image_block& blk, - const error_weight_block& ewb) ; + const image_block& blk); /** * @brief Convert a symbolic representation into a binary physical encoding. diff --git a/lib/astc-encoder/Source/astcenc_mathlib.h b/lib/astc-encoder/Source/astcenc_mathlib.h index 0dc17b42d0..4876749bfe 100644 --- a/lib/astc-encoder/Source/astcenc_mathlib.h +++ b/lib/astc-encoder/Source/astcenc_mathlib.h @@ -458,21 +458,18 @@ struct processed_line2 { vfloat4 amod; vfloat4 bs; - vfloat4 bis; }; struct processed_line3 { vfloat4 amod; vfloat4 bs; - vfloat4 bis; }; struct processed_line4 { vfloat4 amod; vfloat4 bs; - vfloat4 bis; }; #endif diff --git a/lib/astc-encoder/Source/astcenc_partition_tables.cpp b/lib/astc-encoder/Source/astcenc_partition_tables.cpp index fd840add16..52d76cfaf2 100644 --- a/lib/astc-encoder/Source/astcenc_partition_tables.cpp +++ b/lib/astc-encoder/Source/astcenc_partition_tables.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2021 Arm Limited +// Copyright 2011-2022 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -58,7 +58,7 @@ static void generate_canonical_partitioning( { int index = partition_of_texel[i]; - if (mapped_index[index] == -1) + if (mapped_index[index] < 0) { mapped_index[index] = map_weight_count++; } diff --git a/lib/astc-encoder/Source/astcenc_pick_best_endpoint_format.cpp b/lib/astc-encoder/Source/astcenc_pick_best_endpoint_format.cpp index cbeb285535..140edb1029 100644 --- a/lib/astc-encoder/Source/astcenc_pick_best_endpoint_format.cpp +++ b/lib/astc-encoder/Source/astcenc_pick_best_endpoint_format.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2021 Arm Limited +// Copyright 2011-2022 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -47,43 +47,6 @@ #include -/** - * @brief Compute cumulative error weight of each partition. - * - * The cumulative error weight is used to determine the relative importance of each partiton when - * deciding how to quantize colors, as not all partitions are equal. For example, some partitions - * will have far fewer texels than others in the same block. - * - * @param ewb The block error weights. - * @param pi The partiion info. - * @param[out] error_weights The output per-partition error_weight sum. - */ -static void compute_partition_error_color_weightings( - const error_weight_block& ewb, - const partition_info& pi, - vfloat4 error_weights[BLOCK_MAX_PARTITIONS] -) { - // TODO: Candidate for 4-group counting - int partition_count = pi.partition_count; - promise(partition_count > 0); - - for (int i = 0; i < partition_count; i++) - { - vfloat4 error_weight(1e-12f); - - int texel_count = pi.partition_texel_count[i]; - promise(texel_count > 0); - - for (int j = 0; j < texel_count; j++) - { - int tidx = pi.texels_of_partition[i][j]; - error_weight += ewb.error_weights[tidx]; - } - - error_weights[i] = error_weight / pi.partition_texel_count[i]; - } -} - /** * @brief Compute the errors of the endpoint line options for one partition. * @@ -96,7 +59,6 @@ static void compute_partition_error_color_weightings( * @param pi The partition info data. * @param partition_index The partition index to compule the error for. * @param blk The image block. - * @param ewb The error weight block. * @param uncor_pline The endpoint line assuming uncorrelated endpoints. * @param[out] uncor_err The computed error for the uncorrelated endpoint line. * @param samec_pline The endpoint line assuming the same chroma for both endpoints. @@ -111,7 +73,6 @@ static void compute_error_squared_rgb_single_partition( const partition_info& pi, int partition_index, const image_block& blk, - const error_weight_block& ewb, const processed_line3& uncor_pline, float& uncor_err, const processed_line3& samec_pline, @@ -134,14 +95,14 @@ static void compute_error_squared_rgb_single_partition( for (int i = 0; i < texels_in_partition; i++) { int tix = pi.texels_of_partition[partition_index][i]; - float texel_weight = ewb.texel_weight_rgb[tix]; + float texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()) / 3.0f; if (texel_weight < 1e-20f) { continue; } vfloat4 point = blk.texel(tix); - vfloat4 ews = ewb.error_weights[tix]; + vfloat4 ews = blk.channel_weight; // Compute the error that arises from just ditching alpha float default_alpha = blk.get_default_alpha(); @@ -149,24 +110,24 @@ static void compute_error_squared_rgb_single_partition( a_drop_err += omalpha * omalpha * ews.lane<3>(); float param1 = dot3_s(point, uncor_pline.bs); - vfloat4 rp1 = uncor_pline.amod + param1 * uncor_pline.bis; + vfloat4 rp1 = uncor_pline.amod + param1 * uncor_pline.bs; vfloat4 dist1 = rp1 - point; uncor_err += dot3_s(ews, dist1 * dist1); float param2 = dot3_s(point, samec_pline.bs); // No samec amod - we know it's always zero - vfloat4 rp2 = /* samec_pline.amod + */ param2 * samec_pline.bis; + vfloat4 rp2 = /* samec_pline.amod + */ param2 * samec_pline.bs; vfloat4 dist2 = rp2 - point; samec_err += dot3_s(ews, dist2 * dist2); float param3 = dot3_s(point, rgbl_pline.bs); - vfloat4 rp3 = rgbl_pline.amod + param3 * rgbl_pline.bis; + vfloat4 rp3 = rgbl_pline.amod + param3 * rgbl_pline.bs; vfloat4 dist3 = rp3 - point; rgbl_err += dot3_s(ews, dist3 * dist3); float param4 = dot3_s(point, l_pline.bs); // No luma amod - we know it's always zero - vfloat4 rp4 = /* l_pline.amod + */ param4 * l_pline.bis; + vfloat4 rp4 = /* l_pline.amod + */ param4 * l_pline.bs; vfloat4 dist4 = rp4 - point; l_err += dot3_s(ews, dist4 * dist4); } @@ -182,7 +143,6 @@ static void compute_error_squared_rgb_single_partition( * @param bsd The block size information. * @param blk The image block. * @param pi The partition info data. - * @param ewb The error weight block. * @param ep The idealized endpoints. * @param[out] eci The resulting encoding choice error metrics. */ @@ -190,7 +150,6 @@ static void compute_encoding_choice_errors( const block_size_descriptor& bsd, const image_block& blk, const partition_info& pi, - const error_weight_block& ewb, const endpoints& ep, encoding_choice_errors eci[BLOCK_MAX_PARTITIONS]) { @@ -202,19 +161,19 @@ static void compute_encoding_choice_errors( partition_metrics pms[BLOCK_MAX_PARTITIONS]; - compute_avgs_and_dirs_3_comp_rgb(pi, blk, ewb, pms); + compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); for (int i = 0; i < partition_count; i++) { partition_metrics& pm = pms[i]; line3 uncor_rgb_lines; - line3 samec_rgb_lines; // for LDR-RGB-scale - line3 rgb_luma_lines; // for HDR-RGB-scale + line3 samec_rgb_lines; // for LDR-RGB-scale + line3 rgb_luma_lines; // for HDR-RGB-scale processed_line3 uncor_rgb_plines; - processed_line3 samec_rgb_plines; // for LDR-RGB-scale - processed_line3 rgb_luma_plines; // for HDR-RGB-scale + processed_line3 samec_rgb_plines; + processed_line3 rgb_luma_plines; processed_line3 luminance_plines; float uncorr_rgb_error; @@ -223,41 +182,31 @@ static void compute_encoding_choice_errors( float luminance_rgb_error; float alpha_drop_error; - vfloat4 csf = pm.color_scale; - vfloat4 csfn = normalize(csf); - - vfloat4 icsf = pm.icolor_scale; - icsf.set_lane<3>(0.0f); - uncor_rgb_lines.a = pm.avg; - uncor_rgb_lines.b = normalize_safe(pm.dir, csfn); + uncor_rgb_lines.b = normalize_safe(pm.dir, unit3()); samec_rgb_lines.a = vfloat4::zero(); - samec_rgb_lines.b = normalize_safe(pm.avg, csfn); + samec_rgb_lines.b = normalize_safe(pm.avg, unit3()); rgb_luma_lines.a = pm.avg; - rgb_luma_lines.b = csfn; + rgb_luma_lines.b = unit3(); - uncor_rgb_plines.amod = (uncor_rgb_lines.a - uncor_rgb_lines.b * dot3(uncor_rgb_lines.a, uncor_rgb_lines.b)) * icsf; - uncor_rgb_plines.bs = uncor_rgb_lines.b * csf; - uncor_rgb_plines.bis = uncor_rgb_lines.b * icsf; + uncor_rgb_plines.amod = uncor_rgb_lines.a - uncor_rgb_lines.b * dot3(uncor_rgb_lines.a, uncor_rgb_lines.b); + uncor_rgb_plines.bs = uncor_rgb_lines.b; // Same chroma always goes though zero, so this is simpler than the others samec_rgb_plines.amod = vfloat4::zero(); - samec_rgb_plines.bs = samec_rgb_lines.b * csf; - samec_rgb_plines.bis = samec_rgb_lines.b * icsf; + samec_rgb_plines.bs = samec_rgb_lines.b; - rgb_luma_plines.amod = (rgb_luma_lines.a - rgb_luma_lines.b * dot3(rgb_luma_lines.a, rgb_luma_lines.b)) * icsf; - rgb_luma_plines.bs = rgb_luma_lines.b * csf; - rgb_luma_plines.bis = rgb_luma_lines.b * icsf; + rgb_luma_plines.amod = rgb_luma_lines.a - rgb_luma_lines.b * dot3(rgb_luma_lines.a, rgb_luma_lines.b); + rgb_luma_plines.bs = rgb_luma_lines.b; // Luminance always goes though zero, so this is simpler than the others luminance_plines.amod = vfloat4::zero(); - luminance_plines.bs = csfn * csf; - luminance_plines.bis = csfn * icsf; + luminance_plines.bs = unit3(); compute_error_squared_rgb_single_partition( - pi, i, blk, ewb, + pi, i, blk, uncor_rgb_plines, uncorr_rgb_error, samec_rgb_plines, samechroma_rgb_error, rgb_luma_plines, rgb_luma_error, @@ -284,9 +233,9 @@ static void compute_encoding_choice_errors( bool can_blue_contract = (mask(endpt_can_bc_lo & endpt_can_bc_hi) & 0x7) == 0x7; // Store out the settings - eci[i].rgb_scale_error = (samechroma_rgb_error - uncorr_rgb_error) * 0.7f; // empirical - eci[i].rgb_luma_error = (rgb_luma_error - uncorr_rgb_error) * 1.5f; // wild guess - eci[i].luminance_error = (luminance_rgb_error - uncorr_rgb_error) * 3.0f; // empirical + eci[i].rgb_scale_error = (samechroma_rgb_error - uncorr_rgb_error) * 0.7f; // empirical + eci[i].rgb_luma_error = (rgb_luma_error - uncorr_rgb_error) * 1.5f; // wild guess + eci[i].luminance_error = (luminance_rgb_error - uncorr_rgb_error) * 3.0f; // empirical eci[i].alpha_drop_error = alpha_drop_error * 3.0f; eci[i].can_offset_encode = can_offset_encode; eci[i].can_blue_contract = can_blue_contract; @@ -688,7 +637,7 @@ static float one_partition_find_best_combination_for_bitcount( int quant_level = quant_mode_table[integer_count][bits_available]; // Don't have enough bits to represent a given endpoint format at all! - if (quant_level == -1) + if (quant_level < 0) { continue; } @@ -791,7 +740,7 @@ static float two_partitions_find_best_combination_for_bitcount( int quant_level = quant_mode_table[integer_count][bits_available]; // Don't have enough bits to represent a given endpoint format at all! - if (quant_level == -1) + if (quant_level < 0) { break; } @@ -916,7 +865,7 @@ static float three_partitions_find_best_combination_for_bitcount( int quant_level = quant_mode_table[integer_count][bits_available]; // Don't have enough bits to represent a given endpoint format at all! - if (quant_level == -1) + if (quant_level < 0) { break; } @@ -1052,7 +1001,7 @@ static float four_partitions_find_best_combination_for_bitcount( int quant_level = quant_mode_table[integer_count][bits_available]; // Don't have enough bits to represent a given endpoint format at all! - if (quant_level == -1) + if (quant_level < 0) { break; } @@ -1094,7 +1043,6 @@ unsigned int compute_ideal_endpoint_formats( const block_size_descriptor& bsd, const partition_info& pi, const image_block& blk, - const error_weight_block& ewb, const endpoints& ep, // bitcounts and errors computed for the various quantization methods const int* qwt_bitcounts, @@ -1117,12 +1065,7 @@ unsigned int compute_ideal_endpoint_formats( // Compute the errors that result from various encoding choices (such as using luminance instead // of RGB, discarding Alpha, using RGB-scale in place of two separate RGB endpoints and so on) encoding_choice_errors eci[BLOCK_MAX_PARTITIONS]; - compute_encoding_choice_errors(bsd, blk, pi, ewb, ep, eci); - - // For each partition, compute the error weights to apply for that partition - vfloat4 error_weights[BLOCK_MAX_PARTITIONS]; - - compute_partition_error_color_weightings(ewb, pi, error_weights); + compute_encoding_choice_errors(bsd, blk, pi, ep, eci); float best_error[BLOCK_MAX_PARTITIONS][21][4]; int format_of_choice[BLOCK_MAX_PARTITIONS][21][4]; @@ -1130,7 +1073,7 @@ unsigned int compute_ideal_endpoint_formats( { compute_color_error_for_every_integer_count_and_quant_level( encode_hdr_rgb, encode_hdr_alpha, i, - pi, eci[i], ep, error_weights[i], best_error[i], + pi, eci[i], ep, blk.channel_weight, best_error[i], format_of_choice[i]); } @@ -1301,7 +1244,7 @@ unsigned int compute_ideal_endpoint_formats( vmask mask = mask1 & mask2; vbest_ep_error = select(vbest_ep_error, err, mask); vbest_error_index = select(vbest_error_index, lane_ids, mask); - lane_ids = lane_ids + vint(ASTCENC_SIMD_WIDTH); + lane_ids += vint(ASTCENC_SIMD_WIDTH); } // Pick best mode from the SIMD result, using lowest matching index to ensure invariance diff --git a/lib/astc-encoder/Source/astcenc_vecmathlib.h b/lib/astc-encoder/Source/astcenc_vecmathlib.h index ab86dc50c4..069c03c94f 100644 --- a/lib/astc-encoder/Source/astcenc_vecmathlib.h +++ b/lib/astc-encoder/Source/astcenc_vecmathlib.h @@ -243,7 +243,8 @@ static ASTCENC_SIMD_INLINE vfloat4 unit4() */ static ASTCENC_SIMD_INLINE vfloat4 unit3() { - return vfloat4(0.57735f, 0.57735f, 0.57735f, 0.0f); + float val = 0.577350258827209473f; + return vfloat4(val, val, val, 0.0f); } /** @@ -251,7 +252,8 @@ static ASTCENC_SIMD_INLINE vfloat4 unit3() */ static ASTCENC_SIMD_INLINE vfloat4 unit2() { - return vfloat4(0.70711f, 0.70711f, 0.0f, 0.0f); + float val = 0.707106769084930420f; + return vfloat4(val, val, 0.0f, 0.0f); } /** diff --git a/lib/astc-encoder/Source/astcenc_weight_align.cpp b/lib/astc-encoder/Source/astcenc_weight_align.cpp index 4c1e04e4a0..e29ff8861e 100644 --- a/lib/astc-encoder/Source/astcenc_weight_align.cpp +++ b/lib/astc-encoder/Source/astcenc_weight_align.cpp @@ -65,7 +65,7 @@ static const unsigned int quantization_steps_for_level[13] { alignas(ASTCENC_VECALIGN) static float sin_table[SINCOS_STEPS][ANGULAR_STEPS]; alignas(ASTCENC_VECALIGN) static float cos_table[SINCOS_STEPS][ANGULAR_STEPS]; -#if !defined(NDEBUG) +#if defined(ASTCENC_DIAGNOSTICS) static bool print_once { true }; #endif @@ -329,7 +329,7 @@ static void compute_angular_endpoints_for_quant_levels( int bsi = (int)best_results[q].lane<1>(); // Did we find anything? -#if !defined(NDEBUG) +#if defined(ASTCENC_DIAGNOSTICS) if ((bsi < 0) && print_once) { print_once = false; @@ -493,7 +493,7 @@ static void compute_angular_endpoints_for_quant_levels_lwc( int bsi = best_index[q]; // Did we find anything? -#if !defined(NDEBUG) +#if defined(ASTCENC_DIAGNOSTICS) if ((bsi < 0) && print_once) { print_once = false; diff --git a/lib/astc-encoder/Source/astcenccli_error_metrics.cpp b/lib/astc-encoder/Source/astcenccli_error_metrics.cpp index 4e28f95e13..9023d97f91 100644 --- a/lib/astc-encoder/Source/astcenccli_error_metrics.cpp +++ b/lib/astc-encoder/Source/astcenccli_error_metrics.cpp @@ -129,7 +129,7 @@ void compute_error_metrics( kahan_accum4 log_errorsum; kahan_accum4 mpsnr_errorsum; double mean_angular_errorsum = 0.0; - float worst_angular_errorsum = 0.0; + double worst_angular_errorsum = 0.0; unsigned int dim_x = astc::min(img1->dim_x, img2->dim_x); unsigned int dim_y = astc::min(img1->dim_y, img2->dim_y); @@ -282,9 +282,9 @@ void compute_error_metrics( // Float error can push this outside of valid range for acos, so clamp to avoid NaN issues float normal_cos = clamp(-1.0f, 1.0f, dot3(normal1, normal2)).lane<0>(); float rad_to_degrees = 180.0f / astc::PI; - float error_degrees = std::acos(static_cast(normal_cos)) * static_cast(rad_to_degrees); + double error_degrees = std::acos(static_cast(normal_cos)) * static_cast(rad_to_degrees); - mean_angular_errorsum += static_cast(error_degrees) / (dim_x * dim_y * dim_z); + mean_angular_errorsum += error_degrees / (dim_x * dim_y * dim_z); worst_angular_errorsum = astc::max(worst_angular_errorsum, error_degrees); } } @@ -396,7 +396,7 @@ void compute_error_metrics( if (compute_normal_metrics) { printf(" Mean Angular Error: %9.4f degrees\n", mean_angular_errorsum); - printf(" Worst Angular Error: %9.4f degrees\n", (double)worst_angular_errorsum); + printf(" Worst Angular Error: %9.4f degrees\n", worst_angular_errorsum); } printf("\n"); diff --git a/lib/astc-encoder/Source/astcenccli_image_load_store.cpp b/lib/astc-encoder/Source/astcenccli_image_load_store.cpp index 1adb82ee23..d6afd0a923 100644 --- a/lib/astc-encoder/Source/astcenccli_image_load_store.cpp +++ b/lib/astc-encoder/Source/astcenccli_image_load_store.cpp @@ -778,17 +778,17 @@ static unsigned int get_format( struct ktx_header { uint8_t magic[12]; - uint32_t endianness; // should be 0x04030201; if it is instead 0x01020304, then the endianness of everything must be switched. - uint32_t gl_type; // 0 for compressed textures, otherwise value from table 3.2 (page 162) of OpenGL 4.0 spec - uint32_t gl_type_size; // size of data elements to do endianness swap on (1=endian-neutral data) - uint32_t gl_format; // 0 for compressed textures, otherwise value from table 3.3 (page 163) of OpenGLl spec - uint32_t gl_internal_format; // sized-internal-format, corresponding to table 3.12 to 3.14 (pages 182-185) of OpenGL spec + uint32_t endianness; // should be 0x04030201; if it is instead 0x01020304, then the endianness of everything must be switched. + uint32_t gl_type; // 0 for compressed textures, otherwise value from table 3.2 (page 162) of OpenGL 4.0 spec + uint32_t gl_type_size; // size of data elements to do endianness swap on (1=endian-neutral data) + uint32_t gl_format; // 0 for compressed textures, otherwise value from table 3.3 (page 163) of OpenGL spec + uint32_t gl_internal_format; // sized-internal-format, corresponding to table 3.12 to 3.14 (pages 182-185) of OpenGL spec uint32_t gl_base_internal_format; // unsized-internal-format: corresponding to table 3.11 (page 179) of OpenGL spec - uint32_t pixel_width; // texture dimensions; not rounded up to block size for compressed. - uint32_t pixel_height; // must be 0 for 1D textures. - uint32_t pixel_depth; // must be 0 for 1D, 2D and cubemap textures. + uint32_t pixel_width; // texture dimensions; not rounded up to block size for compressed. + uint32_t pixel_height; // must be 0 for 1D textures. + uint32_t pixel_depth; // must be 0 for 1D, 2D and cubemap textures. uint32_t number_of_array_elements; // 0 if not a texture array - uint32_t number_of_faces; // 6 for cubemaps, 1 for non-cubemaps + uint32_t number_of_faces; // 6 for cubemaps, 1 for non-cubemaps uint32_t number_of_mipmap_levels; // 0 or 1 for non-mipmapped textures; 0 indicates that auto-mipmap-gen should be done at load time. uint32_t bytes_of_key_value_data; // size in bytes of the key-and-value area immediately following the header. }; diff --git a/lib/astc-encoder/Source/astcenccli_toplevel.cpp b/lib/astc-encoder/Source/astcenccli_toplevel.cpp index 273c421a9e..3f6a14dace 100644 --- a/lib/astc-encoder/Source/astcenccli_toplevel.cpp +++ b/lib/astc-encoder/Source/astcenccli_toplevel.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2021 Arm Limited +// Copyright 2011-2022 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -655,37 +655,6 @@ static int edit_astcenc_config( argidx++; cli_config.silentmode = 1; } - else - if (!strcmp(argv[argidx], "-v")) - { - argidx += 7; - if (argidx > argc) - { - printf("ERROR: -v switch with less than 6 arguments\n"); - return 1; - } - - config.v_rgba_radius = atoi(argv[argidx - 6]); - config.v_rgb_power = static_cast(atof(argv[argidx - 5])); - config.v_rgb_base = static_cast(atof(argv[argidx - 4])); - config.v_rgb_mean = static_cast(atof(argv[argidx - 3])); - config.v_rgb_stdev = static_cast(atof(argv[argidx - 2])); - config.v_rgba_mean_stdev_mix = static_cast(atof(argv[argidx - 1])); - } - else if (!strcmp(argv[argidx], "-va")) - { - argidx += 5; - if (argidx > argc) - { - printf("ERROR: -va switch with less than 4 arguments\n"); - return 1; - } - - config.v_a_power= static_cast(atof(argv[argidx - 4])); - config.v_a_base = static_cast(atof(argv[argidx - 3])); - config.v_a_mean = static_cast(atof(argv[argidx - 2])); - config.v_a_stdev = static_cast(atof(argv[argidx - 1])); - } else if (!strcmp(argv[argidx], "-cw")) { argidx += 5; @@ -711,17 +680,6 @@ static int edit_astcenc_config( config.a_scale_radius = atoi(argv[argidx - 1]); } - else if (!strcmp(argv[argidx], "-b")) - { - argidx += 2; - if (argidx > argc) - { - printf("ERROR: -b switch with no argument\n"); - return 1; - } - - config.b_deblock_weight = static_cast(atof(argv[argidx - 1])); - } else if (!strcmp(argv[argidx], "-esw")) { argidx += 2; @@ -1125,17 +1083,6 @@ static void print_astcenc_config( } printf(" Bitrate: %3.2f bpp\n", 128.0 / (config.block_x * config.block_y * config.block_z)); - - printf(" Radius mean/stdev: %u texels\n", config.v_rgba_radius); - printf(" RGB power: %g\n", (double)config.v_rgb_power ); - printf(" RGB base weight: %g\n", (double)config.v_rgb_base); - printf(" RGB mean weight: %g\n", (double)config.v_rgb_mean); - printf(" RGB stdev weight: %g\n", (double)config.v_rgb_stdev); - printf(" RGB mean/stdev mixing: %g\n", (double)config.v_rgba_mean_stdev_mix); - printf(" Alpha power: %g\n", (double)config.v_a_power); - printf(" Alpha base weight: %g\n", (double)config.v_a_base); - printf(" Alpha mean weight: %g\n", (double)config.v_a_mean); - printf(" Alpha stdev weight: %g\n", (double)config.v_a_stdev); printf(" RGB alpha scale weight: %d\n", (config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT)); if ((config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT)) { @@ -1146,7 +1093,6 @@ static void print_astcenc_config( printf(" G component weight: %g\n",(double)config.cw_g_weight); printf(" B component weight: %g\n",(double)config.cw_b_weight); printf(" A component weight: %g\n",(double)config.cw_a_weight); - printf(" Deblock artifact setting: %g\n", (double)config.b_deblock_weight); printf(" Partition cutoff: %u partitions\n", config.tune_partition_count_limit); printf(" Partition index cutoff: %u partition ids\n", config.tune_partition_index_limit); printf(" PSNR cutoff: %g dB\n", (double)config.tune_db_limit); @@ -1154,7 +1100,8 @@ static void print_astcenc_config( printf(" 3.2+ partition cutoff: %g\n", (double)config.tune_3_partition_early_out_limit_factor); printf(" 2 plane correlation cutoff: %g\n", (double)config.tune_2_plane_early_out_limit_correlation); printf(" Block mode centile cutoff: %g%%\n", (double)(config.tune_block_mode_limit)); - printf(" Max refinement cutoff: %u iterations\n", config.tune_refinement_limit); + printf(" Candidate cutoff: %u candidates\n", config.tune_candidate_limit); + printf(" Refinement cutoff: %u iterations\n", config.tune_refinement_limit); printf(" Compressor thread count: %d\n", cli_config.thread_count); printf("\n"); } @@ -1512,7 +1459,7 @@ int main( if (operation & ASTCENC_STAGE_ST_NCOMP) { int bitness = get_output_filename_enforced_bitness(output_filename.c_str()); - if (bitness == -1) + if (bitness < 0) { return 1; } diff --git a/lib/astc-encoder/Source/astcenccli_toplevel_help.cpp b/lib/astc-encoder/Source/astcenccli_toplevel_help.cpp index a32e741a23..e9da90ea9a 100644 --- a/lib/astc-encoder/Source/astcenccli_toplevel_help.cpp +++ b/lib/astc-encoder/Source/astcenccli_toplevel_help.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- -// Copyright 2011-2021 Arm Limited +// Copyright 2011-2022 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy @@ -161,14 +161,13 @@ COMPRESSION -mask The input texture is a mask texture with unrelated data stored in the various color components, so enable error heuristics that - aim to improve perceptual quality by minimizing the effect of - error cross-talk across the color components. + aim to improve quality by minimizing the effect of error + cross-talk across the color components. -normal The input texture is a three component linear LDR normal map storing unit length normals as (R=X, G=Y, B=Z). The output will - be a two component X+Y normal map stored as (RGB=X, A=Y), - optimized for angular error instead of simple PSNR. The Z + be a two component X+Y normal map stored as (RGB=X, A=Y). The Z component can be recovered programmatically in shader code by using the equation: @@ -218,17 +217,8 @@ R"( COMPRESSION TIPS & TRICKS ASTC is a block-based format that can be prone to block artifacts. If block artifacts are a problem when compressing a given texture, - adding some or all of following command-line options may help: - - -b 1.8 - -v 2 1 1 0 25 0.1 - -va 1 1 0 25 - -dblimit 60 - - The -b option is a general-purpose block-artifact reduction option. - The -v and -va option settings will concentrate effort where smooth - regions lie next to regions with high detail, which are particularly - prone to block artifacts. + increasing the compressor quality preset can help to alleviate the + problem. If a texture exhibits severe block artifacts in only some of the color components, which is a common problem for mask textures, then @@ -243,34 +233,6 @@ ADVANCED COMPRESSION These options provide low-level control of the codec error metric computation, used to determine what good compression looks like. - -v - Compute the per-texel relative error weighting for the RGB color - components as follows: - - weight = 1 / ( + * mean^2 + * stdev^2) - - The argument specifies the texel radius of the - neighborhood over which the average and standard deviation are - computed. - - The parameter is used to control the degree of mixing of - the average and stddev error values across the color components. - Setting this parameter to 0 causes the computation to be done - separately for each color component; setting it to 1 causes the - results from the RGB components to be combined and applied to - all three components. Intermediate values between these two - settings do a linear mix of the two. - - The argument is a power used to raise the values of the - input texels before computing average and standard deviation; - e.g. a power of 0.5 causes the codec to take the square root - of every input texel value. - - -va - Compute the per-texel relative error weighting for the alpha - component, when used in conjunction with -v. See documentation - of -v for individual parameter documentation. - -a For textures with alpha component, scale per-texel weights by the alpha value. The alpha value chosen for scaling of any @@ -290,12 +252,6 @@ ADVANCED COMPRESSION significance, and values below 1 to decrease it. Set to 0 to exclude a component from error computation. - -b - Assign an additional weight scaling for texels at compression - block edges and corners. Setting this to a value above 1 - increases the significance of texels closer to the edges of a - block, and can help to reduce block artifacts. - -mpsnr Set the low and high f-stop values for the mPSNR error metric. The mPSNR error metric only applies to HDR textures. diff --git a/lib/astc-encoder/Source/cmake_core.cmake b/lib/astc-encoder/Source/cmake_core.cmake index e3f9c5088b..8431fd8c84 100644 --- a/lib/astc-encoder/Source/cmake_core.cmake +++ b/lib/astc-encoder/Source/cmake_core.cmake @@ -118,6 +118,7 @@ macro(astcenc_set_properties NAME) $<$>:-Wno-c++98-c++11-compat-pedantic> $<$>:-Wno-float-equal> $<$>:-Wno-deprecated-declarations> + $<$>:-Wno-atomic-implicit-seq-cst> # Clang 10 also throws up warnings we need to investigate (ours) $<$>:-Wno-old-style-cast> @@ -127,8 +128,7 @@ macro(astcenc_set_properties NAME) $<$>:-Wno-shift-sign-overflow> $<$>:-Wno-format-nonliteral> - $<$:-Wdocumentation>) - + $<$:-Wdocumentation>) target_link_options(${NAME} PRIVATE diff --git a/lib/astc-encoder/Utils/Example/CMakeLists.txt b/lib/astc-encoder/Utils/Example/CMakeLists.txt index eec6ffcfaf..dbc104770d 100644 --- a/lib/astc-encoder/Utils/Example/CMakeLists.txt +++ b/lib/astc-encoder/Utils/Example/CMakeLists.txt @@ -35,7 +35,7 @@ project(astcencoder_example VERSION 1.0.0) ExternalProject_Add(astcencoder GIT_REPOSITORY https://github.com/ARM-software/astc-encoder GIT_TAG main - CMAKE_CACHE_ARGS -DCLI:String=OFF + CMAKE_CACHE_ARGS -DCLI:STRING=OFF -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} INSTALL_COMMAND "") ExternalProject_Get_property(astcencoder diff --git a/lib/astc-encoder/Utils/Example/astc_api_example.cpp b/lib/astc-encoder/Utils/Example/astc_api_example.cpp index b60fa8d8da..7e95f7c86b 100644 --- a/lib/astc-encoder/Utils/Example/astc_api_example.cpp +++ b/lib/astc-encoder/Utils/Example/astc_api_example.cpp @@ -68,7 +68,7 @@ int main(int argc, char **argv) uint8_t *image_data = (uint8_t*)stbi_load(argv[1], &image_x, &image_y, &image_c, 4); if (!image_data) { - printf("Failed to load image \"%s\"\n", image_data); + printf("Failed to load image \"%s\"\n", argv[1]); return 1; }