diff --git a/lib/astc-encoder/.gitrepo b/lib/astc-encoder/.gitrepo
index 370aac4a1d..568c3dd6bc 100644
--- a/lib/astc-encoder/.gitrepo
+++ b/lib/astc-encoder/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 	remote = https://github.com/ARM-software/astc-encoder.git
 	branch = main
-	commit = e7cb1e453968b0e16e48ef6d68fc9d1227d8a378
-	parent = b87158d371723ba9670f5dc13e50a3e2a2799eba
+	commit = 42a8f6ee01715f45edffb6773e34b8bb914a47df
+	parent = 2b1c072a13e9b69ada8457665b51ed5b59ddb408
 	method = merge
 	cmdver = 0.4.3
diff --git a/lib/astc-encoder/CMakeLists.txt b/lib/astc-encoder/CMakeLists.txt
index ad608f1a38..761098054f 100644
--- a/lib/astc-encoder/CMakeLists.txt
+++ b/lib/astc-encoder/CMakeLists.txt
@@ -24,7 +24,7 @@ if(MSVC)
     add_compile_options("/wd4324") # Disable structure was padded due to alignment specifier
 endif()
 
-project(astcencoder VERSION 3.2.0)
+project(astcencoder VERSION 3.3.0)
 
 set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -215,7 +215,7 @@ if(PACKAGE)
 
     set(PKG_VER ${CMAKE_PROJECT_VERSION_MAJOR}.${CMAKE_PROJECT_VERSION_MINOR})
 
-    set(CPACK_PACKAGE_FILE_NAME "astcenc-${PKG_VER}-${PKG_OS}${PACKAGE}")
+    set(CPACK_PACKAGE_FILE_NAME "astcenc-${PKG_VER}-${PKG_OS}-${PACKAGE}")
     set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY FALSE)
     set(CPACK_PACKAGE_CHECKSUM SHA256)
     set(CPACK_GENERATOR ZIP)
diff --git a/lib/astc-encoder/README.md b/lib/astc-encoder/README.md
index b1044f4618..f4f58db2c1 100644
--- a/lib/astc-encoder/README.md
+++ b/lib/astc-encoder/README.md
@@ -58,16 +58,12 @@ from 0.89 bits/pixel up to 8 bits/pixel.
 Release build binaries for the `astcenc` stable releases are provided in the
 [GitHub Releases page][3].
 
-**Latest 3.x stable release:** 3.2
+**Latest 3.x stable release:** 3.3
 * Change log: [3.x series](./Docs/ChangeLog-3x.md)
 
 **Latest 2.x stable release:** 2.5
 * Change log: [2.x series](./Docs/ChangeLog-2x.md)
 
-**Latest development release:** 3.3-develop
-* Change log: [3.x series](./Docs/ChangeLog-3x.md)
-* Roadmap: [Ideas ...](./Docs/Roadmap.md)
-
 Binaries are provided for 64-bit builds on Windows, macOS, and Linux. The
 builds of the astcenc are provided as multiple binaries, each tuned for a
 specific SIMD instruction set.
diff --git a/lib/astc-encoder/Source/astcenc.h b/lib/astc-encoder/Source/astcenc.h
index 3da8929930..f98fa7c68f 100644
--- a/lib/astc-encoder/Source/astcenc.h
+++ b/lib/astc-encoder/Source/astcenc.h
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2020-2021 Arm Limited
+// Copyright 2020-2022 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -405,36 +405,6 @@ struct astcenc_config
 	/** @brief The ASTC block size Z dimension. */
 	unsigned int block_z;
 
-	/** @brief The size of the texel kernel for error weighting (-v). */
-	unsigned int v_rgba_radius;
-
-	/** @brief The mean and stdev component mix for error weighting (-v). */
-	float v_rgba_mean_stdev_mix;
-
-	/** @brief The texel RGB power for error weighting (-v). */
-	float v_rgb_power;
-
-	/** @brief The texel RGB base weight for error weighting (-v). */
-	float v_rgb_base;
-
-	/** @brief The texel RGB mean weight for error weighting (-v). */
-	float v_rgb_mean;
-
-	/** @brief The texel RGB stdev for error weighting (-v). */
-	float v_rgb_stdev;
-
-	/** @brief The texel A power for error weighting (-va). */
-	float v_a_power;
-
-	/** @brief The texel A base weight for error weighting (-va). */
-	float v_a_base;
-
-	/** @brief The texel A mean weight for error weighting (-va). */
-	float v_a_mean;
-
-	/** @brief The texel A stdev for error weighting (-va). */
-	float v_a_stdev;
-
 	/** @brief The red component weight scale for error weighting (-cw). */
 	float cw_r_weight;
 
@@ -456,13 +426,6 @@ struct astcenc_config
 	 */
 	unsigned int a_scale_radius;
 
-	/**
-	 * @brief The additional weight for block edge texels (-b).
-	 *
-	 * This is generic tool for reducing artefacts visible on block changes.
-	 */
-	float b_deblock_weight;
-
 	/** @brief The RGBM scale factor for the shared multiplier (-rgbm). */
 	float rgbm_m_scale;
 
diff --git a/lib/astc-encoder/Source/astcenc_averages_and_directions.cpp b/lib/astc-encoder/Source/astcenc_averages_and_directions.cpp
index 3dac01e831..3002928d1c 100644
--- a/lib/astc-encoder/Source/astcenc_averages_and_directions.cpp
+++ b/lib/astc-encoder/Source/astcenc_averages_and_directions.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2022 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -28,45 +28,30 @@
 void compute_avgs_and_dirs_4_comp(
 	const partition_info& pi,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	partition_metrics pm[BLOCK_MAX_PARTITIONS]
 ) {
-	// TODO: Candidate for 4-group counting
+	float texel_weight = hadd_s(blk.channel_weight) / 4.0f;
+
 	int partition_count = pi.partition_count;
 	promise(partition_count > 0);
 
 	for (int partition = 0; partition < partition_count; partition++)
 	{
 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
-
-		vfloat4 error_sum = vfloat4::zero();
-		vfloat4 base_sum = vfloat4::zero();
-		float partition_weight = 0.0f;
-
 		unsigned int texel_count = pi.partition_texel_count[partition];
 		promise(texel_count > 0);
 
+		// TODO: Try gathers?
+		vfloat4 base_sum = vfloat4::zero();
+
 		for (unsigned int i = 0; i < texel_count; i++)
 		{
 			int iwt = texel_indexes[i];
-			float weight = ewb.texel_weight[iwt];
-			vfloat4 texel_datum = blk.texel(iwt);
-			vfloat4 error_weight = ewb.error_weights[iwt];
-
-			partition_weight += weight;
-			base_sum += texel_datum * weight;
-			error_sum += error_weight;
+			base_sum += blk.texel(iwt);
 		}
 
-		error_sum = error_sum / static_cast<float>(texel_count);
-		vfloat4 csf = normalize(sqrt(error_sum)) * 2.0f;
-
-		vfloat4 average = base_sum * (1.0f / astc::max(partition_weight, 1e-7f));
-
-		pm[partition].error_weight = error_sum;
-		pm[partition].avg = average * csf;
-		pm[partition].color_scale = csf;
-		pm[partition].icolor_scale = 1.0f / max(csf, 1e-7f);
+		vfloat4 average = base_sum / static_cast<float>(texel_count);
+		pm[partition].avg = average;
 
 		vfloat4 sum_xp = vfloat4::zero();
 		vfloat4 sum_yp = vfloat4::zero();
@@ -76,9 +61,8 @@ void compute_avgs_and_dirs_4_comp(
 		for (unsigned int i = 0; i < texel_count; i++)
 		{
 			unsigned int iwt = texel_indexes[i];
-			float weight = ewb.texel_weight[iwt];
 			vfloat4 texel_datum = blk.texel(iwt);
-			texel_datum = (texel_datum - average) * weight;
+			texel_datum = (texel_datum - average) * texel_weight;
 
 			vfloat4 zero = vfloat4::zero();
 
@@ -128,50 +112,35 @@ void compute_avgs_and_dirs_4_comp(
 void compute_avgs_and_dirs_3_comp(
 	const partition_info& pi,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	unsigned int omitted_component,
 	partition_metrics pm[BLOCK_MAX_PARTITIONS]
 ) {
-	// TODO: Candidate for 4-group counting
-	const float *texel_weights = ewb.texel_weight_rgb;
+	float texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()) / 3.0f;
 
 	const float* data_vr = blk.data_r;
 	const float* data_vg = blk.data_g;
 	const float* data_vb = blk.data_b;
 
-	const float* error_vr = ewb.texel_weight_r;
-	const float* error_vg = ewb.texel_weight_g;
-	const float* error_vb = ewb.texel_weight_b;
-
 	if (omitted_component == 0)
 	{
-		texel_weights = ewb.texel_weight_gba;
+		texel_weight = hadd_s(blk.channel_weight.swz<1, 2, 3>()) / 3.0f;
 
 		data_vr = blk.data_g;
 		data_vg = blk.data_b;
 		data_vb = blk.data_a;
-
-		error_vr = ewb.texel_weight_g;
-		error_vg = ewb.texel_weight_b;
-		error_vb = ewb.texel_weight_a;
 	}
 	else if (omitted_component == 1)
 	{
-		texel_weights = ewb.texel_weight_rba;
+		texel_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>()) / 3.0f;
 
 		data_vg = blk.data_b;
 		data_vb = blk.data_a;
-
-		error_vg = ewb.texel_weight_b;
-		error_vb = ewb.texel_weight_a;
 	}
 	else if (omitted_component == 2)
 	{
-		texel_weights = ewb.texel_weight_rga;
+		texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>()) / 3.0f;
 
 		data_vb = blk.data_a;
-
-		error_vb = ewb.texel_weight_a;
 	}
 
 	unsigned int partition_count = pi.partition_count;
@@ -180,43 +149,18 @@ void compute_avgs_and_dirs_3_comp(
 	for (unsigned int partition = 0; partition < partition_count; partition++)
 	{
 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
-
-		vfloat4 error_sum = vfloat4::zero();
-		vfloat4 base_sum = vfloat4::zero();
-		float partition_weight = 0.0f;
-
 		unsigned int texel_count = pi.partition_texel_count[partition];
 		promise(texel_count > 0);
 
+		vfloat4 base_sum = vfloat4::zero();
 		for (unsigned int i = 0; i < texel_count; i++)
 		{
 			unsigned int iwt = texel_indexes[i];
-			float weight = texel_weights[iwt];
-
-			vfloat4 texel_datum(data_vr[iwt],
-			                    data_vg[iwt],
-			                    data_vb[iwt],
-			                    0.0f);
-
-			vfloat4 error_weight(error_vr[iwt],
-			                     error_vg[iwt],
-			                     error_vb[iwt],
-			                     0.0f);
-
-			partition_weight += weight;
-			base_sum += texel_datum * weight;
-			error_sum += error_weight;
+			base_sum += vfloat3(data_vr[iwt], data_vg[iwt], data_vb[iwt]);
 		}
 
-		error_sum = error_sum / static_cast<float>(texel_count);
-		vfloat4 csf = normalize(sqrt(error_sum)) * 1.73205080f;
-
-		vfloat4 average = base_sum * (1.0f / astc::max(partition_weight, 1e-7f));
-
-		pm[partition].error_weight = error_sum;
-		pm[partition].avg = average * csf;
-		pm[partition].color_scale = csf;
-		pm[partition].icolor_scale = 1.0f / max(csf, 1e-7f);
+		vfloat4 average = base_sum / static_cast<float>(texel_count);
+		pm[partition].avg = average;
 
 		vfloat4 sum_xp = vfloat4::zero();
 		vfloat4 sum_yp = vfloat4::zero();
@@ -225,13 +169,12 @@ void compute_avgs_and_dirs_3_comp(
 		for (unsigned int i = 0; i < texel_count; i++)
 		{
 			unsigned int iwt = texel_indexes[i];
-			float weight = texel_weights[iwt];
 
 			vfloat4 texel_datum = vfloat3(data_vr[iwt],
 			                              data_vg[iwt],
 			                              data_vb[iwt]);
 
-			texel_datum = (texel_datum - average) * weight;
+			texel_datum = (texel_datum - average) * texel_weight;
 
 			vfloat4 zero = vfloat4::zero();
 
@@ -271,50 +214,28 @@ void compute_avgs_and_dirs_3_comp(
 void compute_avgs_and_dirs_3_comp_rgb(
 	const partition_info& pi,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	partition_metrics pm[BLOCK_MAX_PARTITIONS]
 ) {
-	// TODO: Candidate for 4-group counting
+	float texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()) / 3;
+
 	unsigned int partition_count = pi.partition_count;
 	promise(partition_count > 0);
 
 	for (unsigned int partition = 0; partition < partition_count; partition++)
 	{
 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
-
-		vfloat4 error_sum = vfloat4::zero();
-		vfloat4 base_sum = vfloat4::zero();
-		float partition_weight = 0.0f;
-
 		unsigned int texel_count = pi.partition_texel_count[partition];
 		promise(texel_count > 0);
 
+		vfloat4 base_sum = vfloat4::zero();
 		for (unsigned int i = 0; i < texel_count; i++)
 		{
 			unsigned int iwt = texel_indexes[i];
-			float weight = ewb.texel_weight_rgb[iwt];
-
-			vfloat4 texel_datum = blk.texel3(iwt);
-
-			vfloat4 error_weight(ewb.texel_weight_r[iwt],
-			                     ewb.texel_weight_g[iwt],
-			                     ewb.texel_weight_b[iwt],
-			                     0.0f);
-
-			partition_weight += weight;
-			base_sum += texel_datum * weight;
-			error_sum += error_weight;
+			base_sum += blk.texel3(iwt);
 		}
 
-		error_sum = error_sum / static_cast<float>(texel_count);
-		vfloat4 csf = normalize(sqrt(error_sum)) * 1.73205080f;
-
-		vfloat4 average = base_sum * (1.0f / astc::max(partition_weight, 1e-7f));
-
-		pm[partition].error_weight = error_sum;
-		pm[partition].avg = average * csf;
-		pm[partition].color_scale = csf;
-		pm[partition].icolor_scale = 1.0f / max(csf, 1e-7f);
+		vfloat4 average = base_sum / static_cast<float>(texel_count);
+		pm[partition].avg = average;
 
 		vfloat4 sum_xp = vfloat4::zero();
 		vfloat4 sum_yp = vfloat4::zero();
@@ -323,11 +244,10 @@ void compute_avgs_and_dirs_3_comp_rgb(
 		for (unsigned int i = 0; i < texel_count; i++)
 		{
 			unsigned int iwt = texel_indexes[i];
-			float weight = ewb.texel_weight_rgb[iwt];
 
 			vfloat4 texel_datum = blk.texel3(iwt);
 
-			texel_datum = (texel_datum - average) * weight;
+			texel_datum = (texel_datum - average) * texel_weight;
 
 			vfloat4 zero = vfloat4::zero();
 
@@ -367,49 +287,37 @@ void compute_avgs_and_dirs_3_comp_rgb(
 void compute_avgs_and_dirs_2_comp(
 	const partition_info& pt,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	unsigned int component1,
 	unsigned int component2,
 	partition_metrics pm[BLOCK_MAX_PARTITIONS]
 ) {
-	const float *texel_weights;
+	float texel_weight;
 
 	const float* data_vr = nullptr;
 	const float* data_vg = nullptr;
 
-	const float* error_vr = nullptr;
-	const float* error_vg = nullptr;
-
 	if (component1 == 0 && component2 == 1)
 	{
-		texel_weights = ewb.texel_weight_rg;
+		texel_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f;
 
 		data_vr = blk.data_r;
 		data_vg = blk.data_g;
-
-		error_vr = ewb.texel_weight_r;
-		error_vg = ewb.texel_weight_g;
 	}
 	else if (component1 == 0 && component2 == 2)
 	{
-		texel_weights = ewb.texel_weight_rb;
+		texel_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f;
 
 		data_vr = blk.data_r;
 		data_vg = blk.data_b;
-
-		error_vr = ewb.texel_weight_r;
-		error_vg = ewb.texel_weight_b;
 	}
 	else // (component1 == 1 && component2 == 2)
 	{
 		assert(component1 == 1 && component2 == 2);
-		texel_weights = ewb.texel_weight_gb;
+
+		texel_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f;
 
 		data_vr = blk.data_g;
 		data_vg = blk.data_b;
-
-		error_vr = ewb.texel_weight_g;
-		error_vg = ewb.texel_weight_b;
 	}
 
 	unsigned int partition_count = pt.partition_count;
@@ -418,36 +326,18 @@ void compute_avgs_and_dirs_2_comp(
 	for (unsigned int partition = 0; partition < partition_count; partition++)
 	{
 		const uint8_t *texel_indexes = pt.texels_of_partition[partition];
-
-		vfloat4 error_sum = vfloat4::zero();
-		vfloat4 base_sum = vfloat4::zero();
-		float partition_weight = 0.0f;
-
 		unsigned int texel_count = pt.partition_texel_count[partition];
 		promise(texel_count > 0);
 
+		vfloat4 base_sum = vfloat4::zero();
 		for (unsigned int i = 0; i < texel_count; i++)
 		{
 			unsigned int iwt = texel_indexes[i];
-			float weight = texel_weights[iwt];
-			vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]) * weight;
-
-			vfloat4 error_weight = vfloat2(error_vr[iwt], error_vg[iwt]);
-
-			partition_weight += weight;
-			base_sum += texel_datum;
-			error_sum += error_weight;
+			base_sum += vfloat2(data_vr[iwt], data_vg[iwt]);
 		}
 
-		error_sum = error_sum / static_cast<float>(texel_count);
-		vfloat4 csf = normalize(sqrt(error_sum)) * 1.41421356f;
-		vfloat4 average = base_sum * (1.0f / astc::max(partition_weight, 1e-7f));
-
-
-		pm[partition].error_weight = error_sum;
-		pm[partition].avg = average * csf;
-		pm[partition].color_scale = csf;
-		pm[partition].icolor_scale = 1.0f / max(csf, 1e-7f);
+		vfloat4 average = base_sum / static_cast<float>(texel_count);
+		pm[partition].avg = average;
 
 		vfloat4 sum_xp = vfloat4::zero();
 		vfloat4 sum_yp = vfloat4::zero();
@@ -455,9 +345,8 @@ void compute_avgs_and_dirs_2_comp(
 		for (unsigned int i = 0; i < texel_count; i++)
 		{
 			unsigned int iwt = texel_indexes[i];
-			float weight = texel_weights[iwt];
 			vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
-			texel_datum = (texel_datum - average) * weight;
+			texel_datum = (texel_datum - average) * texel_weight;
 
 			vfloat4 zero = vfloat4::zero();
 
@@ -487,7 +376,6 @@ void compute_avgs_and_dirs_2_comp(
 void compute_error_squared_rgba(
 	const partition_info& pi,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
 	const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
 	float uncor_lengths[BLOCK_MAX_PARTITIONS],
@@ -528,11 +416,6 @@ void compute_error_squared_rgba(
 		vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
 		vfloat l_uncor_amod3(l_uncor.amod.lane<3>());
 
-		vfloat l_uncor_bis0(l_uncor.bis.lane<0>());
-		vfloat l_uncor_bis1(l_uncor.bis.lane<1>());
-		vfloat l_uncor_bis2(l_uncor.bis.lane<2>());
-		vfloat l_uncor_bis3(l_uncor.bis.lane<3>());
-
 		vfloat l_samec_bs0(l_samec.bs.lane<0>());
 		vfloat l_samec_bs1(l_samec.bs.lane<1>());
 		vfloat l_samec_bs2(l_samec.bs.lane<2>());
@@ -540,11 +423,6 @@ void compute_error_squared_rgba(
 
 		assert(all(l_samec.amod == vfloat4(0.0f)));
 
-		vfloat l_samec_bis0(l_samec.bis.lane<0>());
-		vfloat l_samec_bis1(l_samec.bis.lane<1>());
-		vfloat l_samec_bis2(l_samec.bis.lane<2>());
-		vfloat l_samec_bis3(l_samec.bis.lane<3>());
-
 		vfloat uncor_loparamv(1e10f);
 		vfloat uncor_hiparamv(-1e10f);
 		vfloat4 uncor_errorsumv = vfloat4::zero();
@@ -553,6 +431,11 @@ void compute_error_squared_rgba(
 		vfloat samec_hiparamv(-1e10f);
 		vfloat4 samec_errorsumv = vfloat4::zero();
 
+		vfloat ew_r(blk.channel_weight.lane<0>());
+		vfloat ew_g(blk.channel_weight.lane<1>());
+		vfloat ew_b(blk.channel_weight.lane<2>());
+		vfloat ew_a(blk.channel_weight.lane<3>());
+
 		// This implementation over-shoots, but this is safe as we initialize the texel_indexes
 		// array to extend the last value. This means min/max are not impacted, but we need to mask
 		// out the dummy values when we compute the line weighting.
@@ -567,11 +450,6 @@ void compute_error_squared_rgba(
 			vfloat data_b = gatherf(blk.data_b, texel_idxs);
 			vfloat data_a = gatherf(blk.data_a, texel_idxs);
 
-			vfloat ew_r = gatherf(ewb.texel_weight_r, texel_idxs);
-			vfloat ew_g = gatherf(ewb.texel_weight_g, texel_idxs);
-			vfloat ew_b = gatherf(ewb.texel_weight_b, texel_idxs);
-			vfloat ew_a = gatherf(ewb.texel_weight_a, texel_idxs);
-
 			vfloat uncor_param  = (data_r * l_uncor_bs0)
 			                    + (data_g * l_uncor_bs1)
 			                    + (data_b * l_uncor_bs2)
@@ -581,13 +459,13 @@ void compute_error_squared_rgba(
 			uncor_hiparamv = max(uncor_param, uncor_hiparamv);
 
 			vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
-			                   + (uncor_param * l_uncor_bis0);
+			                   + (uncor_param * l_uncor_bs0);
 			vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
-			                   + (uncor_param * l_uncor_bis1);
+			                   + (uncor_param * l_uncor_bs1);
 			vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
-			                   + (uncor_param * l_uncor_bis2);
+			                   + (uncor_param * l_uncor_bs2);
 			vfloat uncor_dist3 = (l_uncor_amod3 - data_a)
-			                   + (uncor_param * l_uncor_bis3);
+			                   + (uncor_param * l_uncor_bs3);
 
 			vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
 			                 + (ew_g * uncor_dist1 * uncor_dist1)
@@ -606,10 +484,10 @@ void compute_error_squared_rgba(
 			samec_loparamv = min(samec_param, samec_loparamv);
 			samec_hiparamv = max(samec_param, samec_hiparamv);
 
-			vfloat samec_dist0 = samec_param * l_samec_bis0 - data_r;
-			vfloat samec_dist1 = samec_param * l_samec_bis1 - data_g;
-			vfloat samec_dist2 = samec_param * l_samec_bis2 - data_b;
-			vfloat samec_dist3 = samec_param * l_samec_bis3 - data_a;
+			vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
+			vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
+			vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
+			vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a;
 
 			vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
 			                 + (ew_g * samec_dist1 * samec_dist1)
@@ -619,7 +497,7 @@ void compute_error_squared_rgba(
 			samec_err = select(vfloat::zero(), samec_err, mask);
 			haccumulate(samec_errorsumv, samec_err);
 
-			lane_ids = lane_ids + vint(ASTCENC_SIMD_WIDTH);
+			lane_ids += vint(ASTCENC_SIMD_WIDTH);
 		}
 
 		uncor_loparam = hmin_s(uncor_loparamv);
@@ -645,7 +523,6 @@ void compute_error_squared_rgba(
 void compute_error_squared_rgb(
 	const partition_info& pi,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	partition_lines3 plines[BLOCK_MAX_PARTITIONS],
 	float& uncor_error,
 	float& samec_error
@@ -685,20 +562,12 @@ void compute_error_squared_rgb(
 		vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
 		vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
 
-		vfloat l_uncor_bis0(l_uncor.bis.lane<0>());
-		vfloat l_uncor_bis1(l_uncor.bis.lane<1>());
-		vfloat l_uncor_bis2(l_uncor.bis.lane<2>());
-
 		vfloat l_samec_bs0(l_samec.bs.lane<0>());
 		vfloat l_samec_bs1(l_samec.bs.lane<1>());
 		vfloat l_samec_bs2(l_samec.bs.lane<2>());
 
 		assert(all(l_samec.amod == vfloat4(0.0f)));
 
-		vfloat l_samec_bis0(l_samec.bis.lane<0>());
-		vfloat l_samec_bis1(l_samec.bis.lane<1>());
-		vfloat l_samec_bis2(l_samec.bis.lane<2>());
-
 		vfloat uncor_loparamv(1e10f);
 		vfloat uncor_hiparamv(-1e10f);
 		vfloat4 uncor_errorsumv = vfloat4::zero();
@@ -707,6 +576,10 @@ void compute_error_squared_rgb(
 		vfloat samec_hiparamv(-1e10f);
 		vfloat4 samec_errorsumv = vfloat4::zero();
 
+		vfloat ew_r(blk.channel_weight.lane<0>());
+		vfloat ew_g(blk.channel_weight.lane<1>());
+		vfloat ew_b(blk.channel_weight.lane<2>());
+
 		// This implementation over-shoots, but this is safe as we initialize the weights array
 		// to extend the last value. This means min/max are not impacted, but we need to mask
 		// out the dummy values when we compute the line weighting.
@@ -720,10 +593,6 @@ void compute_error_squared_rgb(
 			vfloat data_g = gatherf(blk.data_g, texel_idxs);
 			vfloat data_b = gatherf(blk.data_b, texel_idxs);
 
-			vfloat ew_r = gatherf(ewb.texel_weight_r, texel_idxs);
-			vfloat ew_g = gatherf(ewb.texel_weight_g, texel_idxs);
-			vfloat ew_b = gatherf(ewb.texel_weight_b, texel_idxs);
-
 			vfloat uncor_param  = (data_r * l_uncor_bs0)
 			                    + (data_g * l_uncor_bs1)
 			                    + (data_b * l_uncor_bs2);
@@ -732,11 +601,11 @@ void compute_error_squared_rgb(
 			uncor_hiparamv = max(uncor_param, uncor_hiparamv);
 
 			vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
-			                   + (uncor_param * l_uncor_bis0);
+			                   + (uncor_param * l_uncor_bs0);
 			vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
-			                   + (uncor_param * l_uncor_bis1);
+			                   + (uncor_param * l_uncor_bs1);
 			vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
-			                   + (uncor_param * l_uncor_bis2);
+			                   + (uncor_param * l_uncor_bs2);
 
 			vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
 			                 + (ew_g * uncor_dist1 * uncor_dist1)
@@ -754,9 +623,9 @@ void compute_error_squared_rgb(
 			samec_hiparamv = max(samec_param, samec_hiparamv);
 
 
-			vfloat samec_dist0 = samec_param * l_samec_bis0 - data_r;
-			vfloat samec_dist1 = samec_param * l_samec_bis1 - data_g;
-			vfloat samec_dist2 = samec_param * l_samec_bis2 - data_b;
+			vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
+			vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
+			vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
 
 			vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
 			                 + (ew_g * samec_dist1 * samec_dist1)
@@ -765,7 +634,7 @@ void compute_error_squared_rgb(
 			samec_err = select(vfloat::zero(), samec_err, mask);
 			haccumulate(samec_errorsumv, samec_err);
 
-			lane_ids = lane_ids + vint(ASTCENC_SIMD_WIDTH);
+			lane_ids += vint(ASTCENC_SIMD_WIDTH);
 		}
 
 		uncor_loparam = hmin_s(uncor_loparamv);
diff --git a/lib/astc-encoder/Source/astcenc_block_sizes.cpp b/lib/astc-encoder/Source/astcenc_block_sizes.cpp
index 9200cab3de..4a9dc09058 100644
--- a/lib/astc-encoder/Source/astcenc_block_sizes.cpp
+++ b/lib/astc-encoder/Source/astcenc_block_sizes.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2022 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -30,7 +30,7 @@
  * @param[out] is_dual_plane   True if this block mode has two weight planes.
  * @param[out] quant_mode      The quantization level for the weights.
  *
- * @return Returns true of valid mode, false otherwise.
+ * @return Returns true if the mode is valid, false otherwise.
  */
 static bool decode_block_mode_2d(
 	unsigned int block_mode,
@@ -144,7 +144,7 @@ static bool decode_block_mode_2d(
  * @param[out] is_dual_plane   True if this block mode has two weight planes.
  * @param[out] quant_mode      The quantization level for the weights.
  *
- * @return Returns true of valid mode, false otherwise.
+ * @return Returns true if the mode is valid, false otherwise.
  */
 static bool decode_block_mode_3d(
 	unsigned int block_mode,
@@ -854,6 +854,8 @@ static void construct_block_size_descriptor_2d(
 	unsigned int always_block_mode_count = 0;
 	unsigned int always_decimation_mode_count = 0;
 
+	float always_threshold = 0.0f;
+
 	// Iterate twice; first time keep the "always" blocks, second time keep the "non-always" blocks.
 	// This ensures that the always block modes and decimation modes are at the start of the list.
 	for (unsigned int j = 0; j < 2; j ++)
@@ -869,12 +871,12 @@ static void construct_block_size_descriptor_2d(
 			float percentile = percentiles[i];
 			bool selected = (percentile <= mode_cutoff) || !can_omit_modes;
 
-			if (j == 0 && percentile != 0.0f)
+			if (j == 0 && percentile > always_threshold)
 			{
 				continue;
 			}
 
-			if (j == 1 && percentile == 0.0f)
+			if (j == 1 && percentile <= always_threshold)
 			{
 				continue;
 			}
@@ -905,13 +907,13 @@ static void construct_block_size_descriptor_2d(
 
 			// Allocate and initialize the decimation table entry if we've not used it yet
 			int decimation_mode = decimation_mode_index[y_weights * 16 + x_weights];
-			if (decimation_mode == -1)
+			if (decimation_mode < 0)
 			{
 				decimation_mode = construct_dt_entry_2d(x_texels, y_texels, x_weights, y_weights, bsd);
 				decimation_mode_index[y_weights * 16 + x_weights] = decimation_mode;
 
 	#if !defined(ASTCENC_DECOMPRESS_ONLY)
-				if (percentile == 0.0f)
+				if (percentile <= always_threshold)
 				{
 					always_decimation_mode_count++;
 				}
@@ -920,7 +922,7 @@ static void construct_block_size_descriptor_2d(
 
 	#if !defined(ASTCENC_DECOMPRESS_ONLY)
 			// Flatten the block mode heuristic into some precomputed flags
-			if (percentile == 0.0f)
+			if (percentile <= always_threshold)
 			{
 				always_block_mode_count++;
 				bsd.block_modes[packed_idx].percentile_hit = true;
diff --git a/lib/astc-encoder/Source/astcenc_compress_symbolic.cpp b/lib/astc-encoder/Source/astcenc_compress_symbolic.cpp
index c64c65aadf..01b2a8b697 100644
--- a/lib/astc-encoder/Source/astcenc_compress_symbolic.cpp
+++ b/lib/astc-encoder/Source/astcenc_compress_symbolic.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2022 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -60,7 +60,6 @@ static void merge_endpoints(
  * @param      decode_mode                       The decode mode (LDR, HDR).
  * @param      bsd                               The block size information.
  * @param      blk                               The image block color data to compress.
- * @param      ewb                               The image block weighted error data.
  * @param[out] scb                               The symbolic compressed block output.
  * @param[out] dec_weights_quant_pvalue_plane1   The weights for plane 1.
  * @param[out] dec_weights_quant_pvalue_plane2   The weights for plane 2, or @c nullptr if 1 plane.
@@ -69,7 +68,6 @@ static bool realign_weights(
 	astcenc_profile decode_mode,
 	const block_size_descriptor& bsd,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	symbolic_compressed_block& scb,
 	uint8_t* dec_weights_quant_pvalue_plane1,
 	uint8_t* dec_weights_quant_pvalue_plane2
@@ -187,7 +185,7 @@ static bool realign_weights(
 				vfloat4 color = color_base + color_offset * plane_weight;
 
 				vfloat4 origcolor    = blk.texel(texel);
-				vfloat4 error_weight = ewb.error_weights[texel];
+				vfloat4 error_weight = blk.channel_weight;
 
 				vfloat4 colordiff       = color - origcolor;
 				vfloat4 color_up_diff   = colordiff + color_offset * plane_up_weight;
@@ -226,7 +224,6 @@ static bool realign_weights(
  * @param      config                    The compressor configuration.
  * @param      bsd                       The block size information.
  * @param      blk                       The image block color data to compress.
- * @param      ewb                       The image block weighted error data.
  * @param      only_always               True if we only use "always" percentile block modes.
  * @param      tune_errorval_threshold   The error value threshold.
  * @param      partition_count           The partition count.
@@ -238,7 +235,6 @@ static float compress_symbolic_block_for_partition_1plane(
 	const astcenc_config& config,
 	const block_size_descriptor& bsd,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	bool only_always,
 	float tune_errorval_threshold,
 	unsigned int partition_count,
@@ -260,7 +256,7 @@ static float compress_symbolic_block_for_partition_1plane(
 	// Compute ideal weights and endpoint colors, with no quantization or decimation
 	endpoints_and_weights& ei = tmpbuf.ei1;
 	endpoints_and_weights *eix = tmpbuf.eix1;
-	compute_ideal_colors_and_weights_1plane(bsd, blk, ewb, pi, ei);
+	compute_ideal_colors_and_weights_1plane(bsd, blk, pi, ei);
 
 	// Compute ideal weights and endpoint colors for every decimation
 	float *dec_weights_ideal_value = tmpbuf.dec_weights_ideal_value;
@@ -382,7 +378,7 @@ static float compress_symbolic_block_for_partition_1plane(
 	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
 
 	unsigned int candidate_count = compute_ideal_endpoint_formats(
-	    bsd, pi, blk, ewb, ei.ep, qwt_bitcounts, qwt_errors,
+	    bsd, pi, blk, ei.ep, qwt_bitcounts, qwt_errors,
 	    config.tune_candidate_limit, partition_format_specifiers, block_mode_index,
 	    color_quant_level, color_quant_level_mod);
 
@@ -424,7 +420,7 @@ static float compress_symbolic_block_for_partition_1plane(
 		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
 		{
 			recompute_ideal_colors_1plane(
-			    blk, ewb, pi, di,
+			    blk, pi, di,
 			    weight_quant_mode, workscb.weights,
 			    eix[decimation_mode].ep, rgbs_colors, rgbo_colors);
 
@@ -498,7 +494,7 @@ static float compress_symbolic_block_for_partition_1plane(
 			// Pre-realign test
 			if (l == 0)
 			{
-				float errorval = compute_symbolic_block_difference(config, bsd, workscb, blk, ewb);
+				float errorval = compute_symbolic_block_difference(config, bsd, workscb, blk);
 				if (errorval == -ERROR_CALC_DEFAULT)
 				{
 					errorval = -errorval;
@@ -536,11 +532,11 @@ static float compress_symbolic_block_for_partition_1plane(
 
 			// Perform a final pass over the weights to try to improve them.
 			bool adjustments = realign_weights(
-			    config.profile, bsd, blk, ewb, workscb,
+			    config.profile, bsd, blk, workscb,
 			    workscb.weights, nullptr);
 
 			// Post-realign test
-			float errorval = compute_symbolic_block_difference(config, bsd, workscb, blk, ewb);
+			float errorval = compute_symbolic_block_difference(config, bsd, workscb, blk);
 			if (errorval == -ERROR_CALC_DEFAULT)
 			{
 				errorval = -errorval;
@@ -590,7 +586,6 @@ static float compress_symbolic_block_for_partition_1plane(
  * @param      config                    The compressor configuration.
  * @param      bsd                       The block size information.
  * @param      blk                       The image block color data to compress.
- * @param      ewb                       The image block weighted error data.
  * @param      tune_errorval_threshold   The error value threshold.
  * @param      plane2_component          The component index for the second plane of weights.
  * @param[out] scb                       The symbolic compressed block output.
@@ -600,7 +595,6 @@ static float compress_symbolic_block_for_partition_2planes(
 	const astcenc_config& config,
 	const block_size_descriptor& bsd,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	float tune_errorval_threshold,
 	unsigned int plane2_component,
 	symbolic_compressed_block& scb,
@@ -615,7 +609,7 @@ static float compress_symbolic_block_for_partition_2planes(
 	endpoints_and_weights& ei2 = tmpbuf.ei2;
 	endpoints_and_weights* eix1 = tmpbuf.eix1;
 	endpoints_and_weights* eix2 = tmpbuf.eix2;
-	compute_ideal_colors_and_weights_2planes(bsd, blk, ewb, plane2_component, ei1, ei2);
+	compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);
 
 	// Compute ideal weights and endpoint colors for every decimation
 	float *dec_weights_ideal_value = tmpbuf.dec_weights_ideal_value;
@@ -766,7 +760,7 @@ static float compress_symbolic_block_for_partition_2planes(
 
 	const auto& pi = bsd.get_partition_info(1, 0);
 	unsigned int candidate_count = compute_ideal_endpoint_formats(
-	    bsd, pi, blk, ewb, epm, qwt_bitcounts, qwt_errors,
+	    bsd, pi, blk, epm, qwt_bitcounts, qwt_errors,
 	    config.tune_candidate_limit, partition_format_specifiers, block_mode_index,
 	    color_quant_level, color_quant_level_mod);
 
@@ -812,8 +806,8 @@ static float compress_symbolic_block_for_partition_2planes(
 		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
 		{
 			recompute_ideal_colors_2planes(
-			    blk, ewb, bsd, di,
-			    weight_quant_mode, workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
+			    blk, bsd, di, weight_quant_mode,
+			    workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
 			    epm, rgbs_color, rgbo_color, plane2_component);
 
 			// Quantize the chosen color
@@ -842,7 +836,7 @@ static float compress_symbolic_block_for_partition_2planes(
 			// Pre-realign test
 			if (l == 0)
 			{
-				float errorval = compute_symbolic_block_difference(config, bsd, workscb, blk, ewb);
+				float errorval = compute_symbolic_block_difference(config, bsd, workscb, blk);
 				if (errorval == -ERROR_CALC_DEFAULT)
 				{
 					errorval = -errorval;
@@ -880,11 +874,11 @@ static float compress_symbolic_block_for_partition_2planes(
 
 			// Perform a final pass over the weights to try to improve them
 			bool adjustments = realign_weights(
-			    config.profile, bsd, blk, ewb, workscb,
+			    config.profile, bsd, blk, workscb,
 			    workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET);
 
 			// Post-realign test
-			float errorval = compute_symbolic_block_difference(config, bsd, workscb, blk, ewb);
+			float errorval = compute_symbolic_block_difference(config, bsd, workscb, blk);
 			if (errorval == -ERROR_CALC_DEFAULT)
 			{
 				errorval = -errorval;
@@ -928,260 +922,17 @@ static float compress_symbolic_block_for_partition_2planes(
 	return best_errorval_in_mode;
 }
 
-/**
- * @brief Create a per-texel expansion of the error weights for deblocking.
- *
- * Deblockign works by assigning a higher error weight to blocks the closer they are the edge of the
- * block. The encourages the compressor to keep the periphery colors more accurate, which can help
- * reduce block artifacts when compressing gradients.
- *
- * @param[in,out] ctx   The context containing both deblog memory and config.
- */
-void expand_deblock_weights(
-	astcenc_context& ctx
-) {
-	unsigned int xdim = ctx.config.block_x;
-	unsigned int ydim = ctx.config.block_y;
-	unsigned int zdim = ctx.config.block_z;
-
-	float centerpos_x = static_cast<float>(xdim - 1) * 0.5f;
-	float centerpos_y = static_cast<float>(ydim - 1) * 0.5f;
-	float centerpos_z = static_cast<float>(zdim - 1) * 0.5f;
-	float *bef = ctx.deblock_weights;
-
-	for (unsigned int z = 0; z < zdim; z++)
-	{
-		for (unsigned int y = 0; y < ydim; y++)
-		{
-			for (unsigned int x = 0; x < xdim; x++)
-			{
-				float xdif = (static_cast<float>(x) - centerpos_x) / static_cast<float>(xdim);
-				float ydif = (static_cast<float>(y) - centerpos_y) / static_cast<float>(ydim);
-				float zdif = (static_cast<float>(z) - centerpos_z) / static_cast<float>(zdim);
-
-				float wdif = 0.36f;
-				float dist = astc::sqrt(xdif * xdif + ydif * ydif + zdif * zdif + wdif * wdif);
-				*bef = astc::pow(dist, ctx.config.b_deblock_weight);
-				bef++;
-			}
-		}
-	}
-}
-
-/**
- * @brief Create a per-texel and per-channel expansion of the error weights.
- *
- * This approach creates relatively large error block tables, but it allows a very flexible level of
- * control over how specific texels and channels are prioritized by the compressor.
- *
- * @param      ctx     The compressor context and configuration.
- * @param      image   The input image information.
- * @param      bsd     The block size information.
- * @param      blk     The image block color data to compress.
- * @param[out] ewb     The image block weighted error data.
- *
- * @return Return the total error weight sum for all texels and channels.
- */
-static float prepare_error_weight_block(
-	const astcenc_context& ctx,
-	const astcenc_image& image,
-	const block_size_descriptor& bsd,
-	const image_block& blk,
-	error_weight_block& ewb
-) {
-	unsigned int idx = 0;
-	bool any_mean_stdev_weight =
-	    ctx.config.v_rgb_mean != 0.0f || ctx.config.v_rgb_stdev != 0.0f || \
-	    ctx.config.v_a_mean != 0.0f || ctx.config.v_a_stdev != 0.0f;
-
-	vfloat4 color_weights(ctx.config.cw_r_weight,
-	                      ctx.config.cw_g_weight,
-	                      ctx.config.cw_b_weight,
-	                      ctx.config.cw_a_weight);
-
-	// This works because HDR is imposed globally at compression time
-	unsigned int rgb_lns = blk.rgb_lns[0];
-	unsigned int a_lns = blk.alpha_lns[0];
-	vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns);
-	vmask4 lns_mask = use_lns != vint4::zero();
-
-	promise(bsd.xdim > 0);
-	promise(bsd.ydim > 0);
-	promise(bsd.zdim > 0);
-
-	for (unsigned int z = 0; z < bsd.zdim; z++)
-	{
-		for (unsigned int y = 0; y < bsd.ydim; y++)
-		{
-			for (unsigned int x = 0; x < bsd.xdim; x++)
-			{
-				unsigned int xpos = x + blk.xpos;
-				unsigned int ypos = y + blk.ypos;
-				unsigned int zpos = z + blk.zpos;
-
-				if (xpos >= image.dim_x || ypos >= image.dim_y || zpos >= image.dim_z)
-				{
-					ewb.error_weights[idx] = vfloat4(1e-11f);
-				}
-				else
-				{
-					vfloat4 derv(65535.0f);
-
-					// Compute derivative if we have any use of LNS
-					if (any(lns_mask))
-					{
-						vfloat4 data = blk.texel(idx);
-						vint4 datai = lns_to_sf16(float_to_int(data));
-
-						vfloat4 dataf = float16_to_float(datai);
-						dataf = max(dataf, 6e-5f);
-
-						vfloat4 data_lns1 = dataf * 1.05f;
-						data_lns1 = float_to_lns(data_lns1);
-
-						vfloat4 data_lns2 = dataf;
-						data_lns2 = float_to_lns(data_lns2);
-
-						vfloat4 divisor_lns = dataf * 0.05f;
-
-						// Clamp derivatives between 1/32 and 2^25
-						float lo = 1.0f / 32.0f;
-						float hi = 33554432.0f;
-						vfloat4 derv_lns = clamp(lo, hi, (data_lns1 - data_lns2) / divisor_lns);
-						derv = select(derv, derv_lns, lns_mask);
-					}
-
-					// Compute error weight
-					vfloat4 error_weight(ctx.config.v_rgb_base,
-					                     ctx.config.v_rgb_base,
-					                     ctx.config.v_rgb_base,
-					                     ctx.config.v_a_base);
-
-					unsigned int ydt = image.dim_x;
-					unsigned int zdt = image.dim_x * image.dim_y;
-
-					if (any_mean_stdev_weight)
-					{
-						vfloat4 avg = ctx.input_averages[zpos * zdt + ypos * ydt + xpos];
-						avg = max(avg, 6e-5f);
-						avg = avg * avg;
-
-						vfloat4 variance = ctx.input_variances[zpos * zdt + ypos * ydt + xpos];
-						variance = variance * variance;
-
-						float favg = hadd_rgb_s(avg) * (1.0f / 3.0f);
-						float fvar = hadd_rgb_s(variance) * (1.0f / 3.0f);
-
-						float mixing = ctx.config.v_rgba_mean_stdev_mix;
-						avg.set_lane<0>(favg * mixing + avg.lane<0>() * (1.0f - mixing));
-						avg.set_lane<1>(favg * mixing + avg.lane<1>() * (1.0f - mixing));
-						avg.set_lane<2>(favg * mixing + avg.lane<2>() * (1.0f - mixing));
-
-						variance.set_lane<0>(fvar * mixing + variance.lane<0>() * (1.0f - mixing));
-						variance.set_lane<1>(fvar * mixing + variance.lane<1>() * (1.0f - mixing));
-						variance.set_lane<2>(fvar * mixing + variance.lane<2>() * (1.0f - mixing));
-
-						vfloat4 stdev = sqrt(max(variance, 0.0f));
-
-						vfloat4 scalea(ctx.config.v_rgb_mean, ctx.config.v_rgb_mean, ctx.config.v_rgb_mean, ctx.config.v_a_mean);
-						avg = avg * scalea;
-
-						vfloat4 scales(ctx.config.v_rgb_stdev, ctx.config.v_rgb_stdev, ctx.config.v_rgb_stdev, ctx.config.v_a_stdev);
-						stdev = stdev * scales;
-
-						error_weight = error_weight + avg + stdev;
-						error_weight = 1.0f / error_weight;
-					}
-
-					if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT)
-					{
-						float alpha_scale;
-						if (ctx.config.a_scale_radius != 0)
-						{
-							alpha_scale = ctx.input_alpha_averages[zpos * zdt + ypos * ydt + xpos];
-						}
-						else
-						{
-							alpha_scale = blk.data_a[idx] * (1.0f / 65535.0f);
-						}
-
-						alpha_scale = astc::max(alpha_scale, 0.0001f);
-
-						alpha_scale *= alpha_scale;
-						error_weight.set_lane<0>(error_weight.lane<0>() * alpha_scale);
-						error_weight.set_lane<1>(error_weight.lane<1>() * alpha_scale);
-						error_weight.set_lane<2>(error_weight.lane<2>() * alpha_scale);
-					}
-
-					error_weight = error_weight * color_weights;
-					error_weight = error_weight * ctx.deblock_weights[idx];
-
-					// When we loaded the block to begin with, we applied a transfer function and
-					// computed the derivative of the transfer function. However, the error-weight
-					// computation so far is based on the original color values, not the
-					// transfer-function values. As such, we must multiply the error weights by the
-					// derivative of the inverse of the transfer function, which is equivalent to
-					// dividing by the derivative of the transfer function.
-
-					error_weight = error_weight / (derv * derv * 1e-10f);
-					ewb.error_weights[idx] = error_weight;
-				}
-				idx++;
-			}
-		}
-	}
-
-	// Small bias to avoid divide by zeros and NaN propagation later
-	vfloat4 texel_weight_sum(1e-17f);
-	vfloat4 error_weight_sum(1e-17f);
-
-	int texels_per_block = bsd.texel_count;
-	for (int i = 0; i < texels_per_block; i++)
-	{
-		texel_weight_sum += ewb.error_weights[i] * blk.texel(i);
-		error_weight_sum += ewb.error_weights[i];
-
-		float wr = ewb.error_weights[i].lane<0>();
-		float wg = ewb.error_weights[i].lane<1>();
-		float wb = ewb.error_weights[i].lane<2>();
-		float wa = ewb.error_weights[i].lane<3>();
-
-		ewb.texel_weight_r[i] = wr;
-		ewb.texel_weight_g[i] = wg;
-		ewb.texel_weight_b[i] = wb;
-		ewb.texel_weight_a[i] = wa;
-
-		ewb.texel_weight_rg[i] = (wr + wg) * 0.5f;
-		ewb.texel_weight_rb[i] = (wr + wb) * 0.5f;
-		ewb.texel_weight_gb[i] = (wg + wb) * 0.5f;
-
-		ewb.texel_weight_gba[i] = (wg + wb + wa) * 0.333333f;
-		ewb.texel_weight_rba[i] = (wr + wb + wa) * 0.333333f;
-		ewb.texel_weight_rga[i] = (wr + wg + wa) * 0.333333f;
-		ewb.texel_weight_rgb[i] = (wr + wg + wb) * 0.333333f;
-
-		ewb.texel_weight[i] = (wr + wg + wb + wa) * 0.25f;
-	}
-
-	ewb.block_error_weighted_rgba_sum = texel_weight_sum;
-	ewb.block_error_weight_sum = error_weight_sum;
-
-	return hadd_s(error_weight_sum);
-}
-
 /**
  * @brief Determine the lowest cross-channel correlation factor.
  *
  * @param texels_per_block   The number of texels in a block.
  * @param blk                The image block color data to compress.
- * @param ewb                The image block weighted error data.
  *
  * @return Return the lowest correlation factor.
  */
 static float prepare_block_statistics(
 	int texels_per_block,
-	const image_block& blk,
-	const error_weight_block& ewb
+	const image_block& blk
 ) {
 	// Compute covariance matrix, as a collection of 10 scalars that form the upper-triangular row
 	// of the matrix. The matrix is symmetric, so this is all we need for this use case.
@@ -1205,7 +956,7 @@ static float prepare_block_statistics(
 	promise(texels_per_block > 0);
 	for (int i = 0; i < texels_per_block; i++)
 	{
-		float weight = ewb.texel_weight[i];
+		float weight = hadd_s(blk.channel_weight) / 4.0f;
 		assert(weight >= 0.0f);
 		weight_sum += weight;
 
@@ -1295,14 +1046,12 @@ static float prepare_block_statistics(
 /* See header for documentation. */
 void compress_block(
 	const astcenc_context& ctx,
-	const astcenc_image& input_image,
 	const image_block& blk,
 	physical_compressed_block& pcb,
 	compression_working_buffers& tmpbuf)
 {
 	astcenc_profile decode_mode = ctx.config.profile;
 	symbolic_compressed_block scb;
-	error_weight_block& ewb = tmpbuf.ewb;
 	const block_size_descriptor* bsd = ctx.bsd;
 	float lowest_correl;
 
@@ -1332,13 +1081,13 @@ void compress_block(
 #if defined(ASTCENC_DIAGNOSTICS)
 	// Do this early in diagnostic builds so we can dump uniform metrics
 	// for every block. Do it later in release builds to avoid redundant work!
-	float error_weight_sum = prepare_error_weight_block(ctx, input_image, *bsd, blk, ewb);
+	float error_weight_sum = hadd_s(blk.channel_weight) * bsd->texel_count;
 	float error_threshold = ctx.config.tune_db_limit
 	                      * error_weight_sum
 	                      * block_is_l_scale
 	                      * block_is_la_scale;
 
-	lowest_correl = prepare_block_statistics(bsd->texel_count, blk, ewb);
+	lowest_correl = prepare_block_statistics(bsd->texel_count, blk);
 	trace_add_data("lowest_correl", lowest_correl);
 	trace_add_data("tune_error_threshold", error_threshold);
 #endif
@@ -1376,7 +1125,7 @@ void compress_block(
 	}
 
 #if !defined(ASTCENC_DIAGNOSTICS)
-	float error_weight_sum = prepare_error_weight_block(ctx, input_image, *bsd, blk, ewb);
+	float error_weight_sum = hadd_s(blk.channel_weight) * bsd->texel_count;
 	float error_threshold = ctx.config.tune_db_limit
 	                      * error_weight_sum
 	                      * block_is_l_scale
@@ -1427,7 +1176,7 @@ void compress_block(
 		trace_add_data("search_mode", i);
 
 		float errorval = compress_symbolic_block_for_partition_1plane(
-		    ctx.config, *bsd, blk, ewb, i == 0,
+		    ctx.config, *bsd, blk, i == 0,
 		    error_threshold * errorval_mult[i] * errorval_overshoot,
 		    1, 0,  scb, tmpbuf);
 
@@ -1440,7 +1189,7 @@ void compress_block(
 	}
 
 #if !defined(ASTCENC_DIAGNOSTICS)
-	lowest_correl = prepare_block_statistics(bsd->texel_count, blk, ewb);
+	lowest_correl = prepare_block_statistics(bsd->texel_count, blk);
 #endif
 
 	block_skip_two_plane = lowest_correl > ctx.config.tune_2_plane_early_out_limit_correlation;
@@ -1473,8 +1222,7 @@ void compress_block(
 		}
 
 		float errorval = compress_symbolic_block_for_partition_2planes(
-		    ctx.config, *bsd, blk, ewb,
-		    error_threshold * errorval_overshoot,
+		    ctx.config, *bsd, blk, error_threshold * errorval_overshoot,
 		    i, scb, tmpbuf);
 
 		// If attempting two planes is much worse than the best one plane result
@@ -1494,25 +1242,24 @@ void compress_block(
 	// Find best blocks for 2, 3 and 4 partitions
 	for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
 	{
-		unsigned int partition_indices_1plane[2] { 0, 0 };
+		unsigned int partition_indices[2] { 0 };
 
-		find_best_partition_candidates(*bsd, blk, ewb, partition_count,
+		find_best_partition_candidates(*bsd, blk, partition_count,
 		                               ctx.config.tune_partition_index_limit,
-		                               partition_indices_1plane[0],
-		                               partition_indices_1plane[1]);
+		                               partition_indices);
 
-		for (int i = 0; i < 2; i++)
+		for (unsigned int i = 0; i < 2; i++)
 		{
 			TRACE_NODE(node1, "pass");
 			trace_add_data("partition_count", partition_count);
-			trace_add_data("partition_index", partition_indices_1plane[i]);
+			trace_add_data("partition_index", partition_indices[i]);
 			trace_add_data("plane_count", 1);
 			trace_add_data("search_mode", i);
 
 			float errorval = compress_symbolic_block_for_partition_1plane(
-			    ctx.config, *bsd, blk, ewb, false,
+			    ctx.config, *bsd, blk, false,
 			    error_threshold * errorval_overshoot,
-			    partition_count, partition_indices_1plane[i],
+			    partition_count, partition_indices[i],
 			    scb, tmpbuf);
 
 			best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
@@ -1541,7 +1288,7 @@ void compress_block(
 	// TODO: Do something more sensible here, such as average color block
 	if (scb.block_type == SYM_BTYPE_ERROR)
 	{
-#if !defined(NDEBUG)
+#if defined(ASTCENC_DIAGNOSTICS)
 		static bool printed_once = false;
 		if (!printed_once)
 		{
diff --git a/lib/astc-encoder/Source/astcenc_compute_variance.cpp b/lib/astc-encoder/Source/astcenc_compute_variance.cpp
index 61c1481073..41757fc5f1 100644
--- a/lib/astc-encoder/Source/astcenc_compute_variance.cpp
+++ b/lib/astc-encoder/Source/astcenc_compute_variance.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2022 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -100,7 +100,7 @@ static void brent_kung_prefix_sum(
 }
 
 /**
- * @brief Compute averages and variances for a pixel region.
+ * @brief Compute averages for a pixel region.
  *
  * The routine computes both in a single pass, using a summed-area table to decouple the running
  * time from the averaging/variance kernel size.
@@ -110,12 +110,10 @@ static void brent_kung_prefix_sum(
  */
 static void compute_pixel_region_variance(
 	astcenc_context& ctx,
-	const pixel_region_variance_args& arg
+	const pixel_region_args& arg
 ) {
 	// Unpack the memory structure into local variables
 	const astcenc_image* img = arg.img;
-	float rgb_power = arg.rgb_power;
-	float alpha_power = arg.alpha_power;
 	astcenc_swizzle swz = arg.swz;
 	bool have_z = arg.have_z;
 
@@ -127,16 +125,13 @@ static void compute_pixel_region_variance(
 	int offset_y = arg.offset_y;
 	int offset_z = arg.offset_z;
 
-	int avg_var_kernel_radius = arg.avg_var_kernel_radius;
 	int alpha_kernel_radius = arg.alpha_kernel_radius;
 
 	float*   input_alpha_averages = ctx.input_alpha_averages;
-	vfloat4* input_averages = ctx.input_averages;
-	vfloat4* input_variances = ctx.input_variances;
 	vfloat4* work_memory = arg.work_memory;
 
 	// Compute memory sizes and dimensions that we need
-	int kernel_radius = astc::max(avg_var_kernel_radius, alpha_kernel_radius);
+	int kernel_radius = alpha_kernel_radius;
 	int kerneldim = 2 * kernel_radius + 1;
 	int kernel_radius_xy = kernel_radius;
 	int kernel_radius_z = have_z ? kernel_radius : 0;
@@ -147,7 +142,6 @@ static void compute_pixel_region_variance(
 	int sizeprod = padsize_x * padsize_y * padsize_z;
 
 	int zd_start = have_z ? 1 : 0;
-	int are_powers_1 = (rgb_power == 1.0f) && (alpha_power == 1.0f);
 
 	vfloat4 *varbuf1 = work_memory;
 	vfloat4 *varbuf2 = work_memory + sizeprod;
@@ -203,12 +197,6 @@ static void compute_pixel_region_variance(
 					                     b * (1.0f / 255.0f),
 					                     a * (1.0f / 255.0f));
 
-					if (!are_powers_1)
-					{
-						vfloat4 exp(rgb_power, rgb_power, rgb_power, alpha_power);
-						d = pow(max(d, 1e-6f), exp);
-					}
-
 					VARBUF1(z, y, x) = d;
 					VARBUF2(z, y, x) = d * d;
 				}
@@ -246,12 +234,6 @@ static void compute_pixel_region_variance(
 					vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
 					vfloat4 d = float16_to_float(di);
 
-					if (!are_powers_1)
-					{
-						vfloat4 exp(rgb_power, rgb_power, rgb_power, alpha_power);
-						d = pow(max(d, 1e-6f), exp);
-					}
-
 					VARBUF1(z, y, x) = d;
 					VARBUF2(z, y, x) = d * d;
 				}
@@ -295,12 +277,6 @@ static void compute_pixel_region_variance(
 
 					vfloat4 d(r, g, b, a);
 
-					if (!are_powers_1)
-					{
-						vfloat4 exp(rgb_power, rgb_power, rgb_power, alpha_power);
-						d = pow(max(d, 1e-6f), exp);
-					}
-
 					VARBUF1(z, y, x) = d;
 					VARBUF2(z, y, x) = d * d;
 				}
@@ -369,37 +345,20 @@ static void compute_pixel_region_variance(
 		}
 	}
 
-	int avg_var_kdim = 2 * avg_var_kernel_radius + 1;
 	int alpha_kdim = 2 * alpha_kernel_radius + 1;
 
 	// Compute a few constants used in the variance-calculation.
-	float avg_var_samples;
 	float alpha_rsamples;
-	float mul1;
 
 	if (have_z)
 	{
-		avg_var_samples = (float)(avg_var_kdim * avg_var_kdim * avg_var_kdim);
 		alpha_rsamples = 1.0f / (float)(alpha_kdim * alpha_kdim * alpha_kdim);
 	}
 	else
 	{
-		avg_var_samples = (float)(avg_var_kdim * avg_var_kdim);
 		alpha_rsamples = 1.0f / (float)(alpha_kdim * alpha_kdim);
 	}
 
-	float avg_var_rsamples = 1.0f / avg_var_samples;
-	if (avg_var_samples == 1)
-	{
-		mul1 = 1.0f;
-	}
-	else
-	{
-		mul1 = 1.0f / (float)(avg_var_samples * (avg_var_samples - 1));
-	}
-
-	float mul2 = avg_var_samples * mul1;
-
 	// Use the summed-area tables to compute variance for each neighborhood
 	if (have_z)
 	{
@@ -436,33 +395,6 @@ static void compute_pixel_region_variance(
 
 					int out_index = z_dst * zdt + y_dst * ydt + x_dst;
 					input_alpha_averages[out_index] = (vasum * alpha_rsamples);
-
-					// Summed-area table lookups for RGBA average and variance
-					vfloat4 v1sum = ( VARBUF1(z_high, y_low,  x_low)
-					                - VARBUF1(z_high, y_low,  x_high)
-					                - VARBUF1(z_high, y_high, x_low)
-					                + VARBUF1(z_high, y_high, x_high)) -
-					               (  VARBUF1(z_low,  y_low,  x_low)
-					                - VARBUF1(z_low,  y_low,  x_high)
-					                - VARBUF1(z_low,  y_high, x_low)
-					                + VARBUF1(z_low,  y_high, x_high));
-
-					vfloat4 v2sum = ( VARBUF2(z_high, y_low,  x_low)
-					                - VARBUF2(z_high, y_low,  x_high)
-					                - VARBUF2(z_high, y_high, x_low)
-					                + VARBUF2(z_high, y_high, x_high)) -
-					               (  VARBUF2(z_low,  y_low,  x_low)
-					                - VARBUF2(z_low,  y_low,  x_high)
-					                - VARBUF2(z_low,  y_high, x_low)
-					                + VARBUF2(z_low,  y_high, x_high));
-
-					// Compute and emit the average
-					vfloat4 avg = v1sum * avg_var_rsamples;
-					input_averages[out_index] = avg;
-
-					// Compute and emit the actual variance
-					vfloat4 variance = mul2 * v2sum - mul1 * (v1sum * v1sum);
-					input_variances[out_index] = variance;
 				}
 			}
 		}
@@ -491,35 +423,16 @@ static void compute_pixel_region_variance(
 
 				int out_index = y_dst * ydt + x_dst;
 				input_alpha_averages[out_index] = (vasum * alpha_rsamples);
-
-				// summed-area table lookups for RGBA average and variance
-				vfloat4 v1sum = VARBUF1(0, y_low,  x_low)
-				              - VARBUF1(0, y_low,  x_high)
-				              - VARBUF1(0, y_high, x_low)
-				              + VARBUF1(0, y_high, x_high);
-
-				vfloat4 v2sum = VARBUF2(0, y_low,  x_low)
-				              - VARBUF2(0, y_low,  x_high)
-				              - VARBUF2(0, y_high, x_low)
-				              + VARBUF2(0, y_high, x_high);
-
-				// Compute and emit the average
-				vfloat4 avg = v1sum * avg_var_rsamples;
-				input_averages[out_index] = avg;
-
-				// Compute and emit the actual variance
-				vfloat4 variance = mul2 * v2sum - mul1 * (v1sum * v1sum);
-				input_variances[out_index] = variance;
 			}
 		}
 	}
 }
 
-void compute_averages_and_variances(
+void compute_averages(
 	astcenc_context& ctx,
-	const avg_var_args &ag
+	const avg_args &ag
 ) {
-	pixel_region_variance_args arg = ag.arg;
+	pixel_region_args arg = ag.arg;
 	arg.work_memory = new vfloat4[ag.work_memory_size];
 
 	int size_x = ag.img_size_x;
@@ -535,7 +448,7 @@ void compute_averages_and_variances(
 	while (true)
 	{
 		unsigned int count;
-		unsigned int base = ctx.manage_avg_var.get_task_assignment(16, count);
+		unsigned int base = ctx.manage_avg.get_task_assignment(16, count);
 		if (!count)
 		{
 			break;
@@ -560,28 +473,25 @@ void compute_averages_and_variances(
 			}
 		}
 
-		ctx.manage_avg_var.complete_task_assignment(count);
+		ctx.manage_avg.complete_task_assignment(count);
 	}
 
 	delete[] arg.work_memory;
 }
 
 /* See header for documentation. */
-unsigned int init_compute_averages_and_variances(
+unsigned int init_compute_averages(
 	const astcenc_image& img,
-	float rgb_power,
-	float alpha_power,
-	unsigned int avg_var_kernel_radius,
 	unsigned int alpha_kernel_radius,
 	const astcenc_swizzle& swz,
-	avg_var_args& ag
+	avg_args& ag
 ) {
 	unsigned int size_x = img.dim_x;
 	unsigned int size_y = img.dim_y;
 	unsigned int size_z = img.dim_z;
 
 	// Compute maximum block size and from that the working memory buffer size
-	unsigned int kernel_radius = astc::max(avg_var_kernel_radius, alpha_kernel_radius);
+	unsigned int kernel_radius = alpha_kernel_radius;
 	unsigned int kerneldim = 2 * kernel_radius + 1;
 
 	bool have_z = (size_z > 1);
@@ -591,7 +501,7 @@ unsigned int init_compute_averages_and_variances(
 	unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
 	unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);
 
-	// Perform block-wise averages-and-variances calculations across the image
+	// Perform block-wise averages calculations across the image
 	// Initialize fields which are not populated until later
 	ag.arg.size_x = 0;
 	ag.arg.size_y = 0;
@@ -602,11 +512,8 @@ unsigned int init_compute_averages_and_variances(
 	ag.arg.work_memory = nullptr;
 
 	ag.arg.img = &img;
-	ag.arg.rgb_power = rgb_power;
-	ag.arg.alpha_power = alpha_power;
 	ag.arg.swz = swz;
 	ag.arg.have_z = have_z;
-	ag.arg.avg_var_kernel_radius = avg_var_kernel_radius;
 	ag.arg.alpha_kernel_radius = alpha_kernel_radius;
 
 	ag.img_size_x = size_x;
diff --git a/lib/astc-encoder/Source/astcenc_decompress_symbolic.cpp b/lib/astc-encoder/Source/astcenc_decompress_symbolic.cpp
index 3649a66dd2..478c1cf1c8 100644
--- a/lib/astc-encoder/Source/astcenc_decompress_symbolic.cpp
+++ b/lib/astc-encoder/Source/astcenc_decompress_symbolic.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2022 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -186,6 +186,7 @@ void decompress_symbolic_block(
 	blk.zpos = zpos;
 
 	blk.data_min = vfloat4::zero();
+	blk.data_mean = vfloat4::zero();
 	blk.data_max = vfloat4::zero();
 	blk.grayscale = false;
 
@@ -321,8 +322,7 @@ float compute_symbolic_block_difference(
 	const astcenc_config& config,
 	const block_size_descriptor& bsd,
 	const symbolic_compressed_block& scb,
-	const image_block& blk,
-	const error_weight_block& ewb
+	const image_block& blk
 ) {
 	// If we detected an error-block, blow up immediately.
 	if (scb.block_type == SYM_BTYPE_ERROR)
@@ -415,7 +415,7 @@ float compute_symbolic_block_difference(
 			error = min(abs(error), 1e15f);
 			error = error * error;
 
-			float metric = dot_s(error, ewb.error_weights[tix]);
+			float metric = dot_s(error, blk.channel_weight);
 			summa += astc::min(metric, ERROR_CALC_DEFAULT);
 		}
 	}
diff --git a/lib/astc-encoder/Source/astcenc_entry.cpp b/lib/astc-encoder/Source/astcenc_entry.cpp
index b1ef6b8341..b77857c0bd 100644
--- a/lib/astc-encoder/Source/astcenc_entry.cpp
+++ b/lib/astc-encoder/Source/astcenc_entry.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2022 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -411,18 +411,6 @@ static astcenc_error validate_config(
 	}
 #endif
 
-	config.v_rgba_mean_stdev_mix = astc::max(config.v_rgba_mean_stdev_mix, 0.0f);
-	config.v_rgb_power = astc::max(config.v_rgb_power, 0.0f);
-	config.v_rgb_base = astc::max(config.v_rgb_base, 0.0f);
-	config.v_rgb_mean = astc::max(config.v_rgb_mean, 0.0f);
-	config.v_rgb_stdev = astc::max(config.v_rgb_stdev, 0.0f);
-	config.v_a_power = astc::max(config.v_a_power, 0.0f);
-	config.v_a_base = astc::max(config.v_a_base, 0.0f);
-	config.v_a_mean = astc::max(config.v_a_mean, 0.0f);
-	config.v_a_stdev = astc::max(config.v_a_stdev, 0.0f);
-
-	config.b_deblock_weight = astc::max(config.b_deblock_weight, 0.0f);
-
 	config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f);
 
 	config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u);
@@ -586,9 +574,6 @@ astcenc_error astcenc_config_init(
 	}
 
 	// Set heuristics to the defaults for each color profile
-	config.v_rgba_radius = 0;
-	config.v_rgba_mean_stdev_mix = 0.0f;
-
 	config.cw_r_weight = 1.0f;
 	config.cw_g_weight = 1.0f;
 	config.cw_b_weight = 1.0f;
@@ -606,40 +591,9 @@ astcenc_error astcenc_config_init(
 	{
 	case ASTCENC_PRF_LDR:
 	case ASTCENC_PRF_LDR_SRGB:
-		config.v_rgb_power = 1.0f;
-		config.v_rgb_base = 1.0f;
-		config.v_rgb_mean = 0.0f;
-		config.v_rgb_stdev = 0.0f;
-
-		config.v_a_power = 1.0f;
-		config.v_a_base = 1.0f;
-		config.v_a_mean = 0.0f;
-		config.v_a_stdev = 0.0f;
 		break;
 	case ASTCENC_PRF_HDR_RGB_LDR_A:
-		config.v_rgb_power = 0.75f;
-		config.v_rgb_base = 0.0f;
-		config.v_rgb_mean = 1.0f;
-		config.v_rgb_stdev = 0.0f;
-
-		config.v_a_power = 1.0f;
-		config.v_a_base = 0.05f;
-		config.v_a_mean = 0.0f;
-		config.v_a_stdev = 0.0f;
-
-		config.tune_db_limit = 999.0f;
-		break;
 	case ASTCENC_PRF_HDR:
-		config.v_rgb_power = 0.75f;
-		config.v_rgb_base = 0.0f;
-		config.v_rgb_mean = 1.0f;
-		config.v_rgb_stdev = 0.0f;
-
-		config.v_a_power = 0.75f;
-		config.v_a_base = 0.0f;
-		config.v_a_mean = 1.0f;
-		config.v_a_stdev = 0.0f;
-
 		config.tune_db_limit = 999.0f;
 		break;
 	default:
@@ -663,27 +617,13 @@ astcenc_error astcenc_config_init(
 
 		// Normals are prone to blocking artifacts on smooth curves
 		// so force compressor to try harder here ...
-		config.b_deblock_weight = 1.8f;
 		config.tune_db_limit *= 1.03f;
-
-		if (flags & ASTCENC_FLG_USE_PERCEPTUAL)
-		{
-			config.v_rgba_radius = 3;
-			config.v_rgba_mean_stdev_mix = 0.0f;
-			config.v_rgb_mean = 0.0f;
-			config.v_rgb_stdev = 50.0f;
-			config.v_a_mean = 0.0f;
-			config.v_a_stdev = 50.0f;
-		}
 	}
 	else if (flags & ASTCENC_FLG_MAP_MASK)
 	{
-		config.v_rgba_radius = 3;
-		config.v_rgba_mean_stdev_mix = 0.03f;
-		config.v_rgb_mean = 0.0f;
-		config.v_rgb_stdev = 25.0f;
-		config.v_a_mean = 0.0f;
-		config.v_a_stdev = 25.0f;
+		// Masks are prone to blocking artifacts on mask edges
+		// so force compressor to try harder here ...
+		config.tune_db_limit *= 1.03f;
 	}
 	else if (flags & ASTCENC_FLG_MAP_RGBM)
 	{
@@ -756,8 +696,6 @@ astcenc_error astcenc_context_alloc(
 	ctx->working_buffers = nullptr;
 
 	// These are allocated per-compress, as they depend on image size
-	ctx->input_averages = nullptr;
-	ctx->input_variances = nullptr;
 	ctx->input_alpha_averages = nullptr;
 
 	// Copy the config first and validate the copy (we may modify it)
@@ -778,9 +716,6 @@ astcenc_error astcenc_context_alloc(
 	// Do setup only needed by compression
 	if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY))
 	{
-		// Expand deblock supression into a weight scale per texel in the block
-		expand_deblock_weights(*ctx);
-
 		// Turn a dB limit into a per-texel error for faster use later
 		if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
 		{
@@ -951,14 +886,21 @@ static void compress_image(
 			{
 				blk.origin_texel = vfloat4::zero();
 				blk.data_min = vfloat4::zero();
-				blk.data_max = blk.data_min;
-				blk.grayscale = false;
+				blk.data_mean = vfloat4::zero();
+				blk.data_max = vfloat4::zero();
+				blk.grayscale = true;
 			}
 
+			// Populate the block channel weights
+			blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
+			                             ctx.config.cw_g_weight,
+			                             ctx.config.cw_b_weight,
+			                             ctx.config.cw_a_weight);
+
 			int offset = ((z * yblocks + y) * xblocks + x) * 16;
 			uint8_t *bp = buffer + offset;
 			physical_compressed_block* pcb = reinterpret_cast<physical_compressed_block*>(bp);
-			compress_block(ctx, image, blk, *pcb, temp_buffers);
+			compress_block(ctx, blk, *pcb, temp_buffers);
 		}
 
 		ctx.manage_compress.complete_task_assignment(count);
@@ -1025,34 +967,29 @@ astcenc_error astcenc_compress_image(
 		astcenc_compress_reset(ctx);
 	}
 
-	if (ctx->config.v_rgb_mean != 0.0f || ctx->config.v_rgb_stdev != 0.0f ||
-	    ctx->config.v_a_mean != 0.0f || ctx->config.v_a_stdev != 0.0f ||
-	    ctx->config.a_scale_radius != 0)
+	if (ctx->config.a_scale_radius != 0)
 	{
 		// First thread to enter will do setup, other threads will subsequently
 		// enter the critical section but simply skip over the initialization
-		auto init_avg_var = [ctx, &image, swizzle]() {
+		auto init_avg = [ctx, &image, swizzle]() {
 			// Perform memory allocations for the destination buffers
 			size_t texel_count = image.dim_x * image.dim_y * image.dim_z;
-			ctx->input_averages = new vfloat4[texel_count];
-			ctx->input_variances = new vfloat4[texel_count];
 			ctx->input_alpha_averages = new float[texel_count];
 
-			return init_compute_averages_and_variances(
-				image, ctx->config.v_rgb_power, ctx->config.v_a_power,
-				ctx->config.v_rgba_radius, ctx->config.a_scale_radius, *swizzle,
-				ctx->avg_var_preprocess_args);
+			return init_compute_averages(
+				image, ctx->config.a_scale_radius, *swizzle,
+				ctx->avg_preprocess_args);
 		};
 
 		// Only the first thread actually runs the initializer
-		ctx->manage_avg_var.init(init_avg_var);
+		ctx->manage_avg.init(init_avg);
 
 		// All threads will enter this function and dynamically grab work
-		compute_averages_and_variances(*ctx, ctx->avg_var_preprocess_args);
+		compute_averages(*ctx, ctx->avg_preprocess_args);
 	}
 
-	// Wait for compute_averages_and_variances to complete before compressing
-	ctx->manage_avg_var.wait();
+	// Wait for compute_averages to complete before compressing
+	ctx->manage_avg.wait();
 
 	compress_image(*ctx, thread_index, image, *swizzle, data_out);
 
@@ -1060,12 +997,6 @@ astcenc_error astcenc_compress_image(
 	ctx->manage_compress.wait();
 
 	auto term_compress = [ctx]() {
-		delete[] ctx->input_averages;
-		ctx->input_averages = nullptr;
-
-		delete[] ctx->input_variances;
-		ctx->input_variances = nullptr;
-
 		delete[] ctx->input_alpha_averages;
 		ctx->input_alpha_averages = nullptr;
 	};
@@ -1090,7 +1021,7 @@ astcenc_error astcenc_compress_reset(
 		return ASTCENC_ERR_BAD_CONTEXT;
 	}
 
-	ctx->manage_avg_var.reset();
+	ctx->manage_avg.reset();
 	ctx->manage_compress.reset();
 	return ASTCENC_SUCCESS;
 #endif
diff --git a/lib/astc-encoder/Source/astcenc_find_best_partitioning.cpp b/lib/astc-encoder/Source/astcenc_find_best_partitioning.cpp
index 355a18e804..0b648b9d66 100644
--- a/lib/astc-encoder/Source/astcenc_find_best_partitioning.cpp
+++ b/lib/astc-encoder/Source/astcenc_find_best_partitioning.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2022 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -52,14 +52,12 @@
  * @brief Pick some initital kmeans cluster centers.
  *
  * @param      blk               The image block color data to compress.
- * @param      ewb               The image error weight block.
  * @param      texel_count       The number of texels in the block.
  * @param      partition_count   The number of partitions in the block.
  * @param[out] cluster_centers   The initital partition cluster center colors.
  */
 static void kmeans_init(
 	const image_block& blk,
-	const error_weight_block& ewb,
 	unsigned int texel_count,
 	unsigned int partition_count,
 	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]
@@ -82,8 +80,7 @@ static void kmeans_init(
 	{
 		vfloat4 color = blk.texel(i);
 		vfloat4 diff = color - center_color;
-		diff = diff * ewb.error_weights[i];
-		float distance = dot_s(diff, diff);
+		float distance = dot_s(diff * diff, blk.channel_weight);
 		distance_sum += distance;
 		distances[i] = distance;
 	}
@@ -128,8 +125,7 @@ static void kmeans_init(
 		{
 			vfloat4 color = blk.texel(i);
 			vfloat4 diff = color - center_color;
-			diff = diff * ewb.error_weights[i];
-			float distance = dot_s(diff, diff);
+			float distance = dot_s(diff * diff, blk.channel_weight);
 			distance = astc::min(distance, distances[i]);
 			distance_sum += distance;
 			distances[i] = distance;
@@ -141,7 +137,6 @@ static void kmeans_init(
  * @brief Assign texels to clusters, based on a set of chosen center points.
  *
  * @param      blk                  The image block color data to compress.
- * @param      ewb                  The image error weight block.
  * @param      texel_count          The number of texels in the block.
  * @param      partition_count      The number of partitions in the block.
  * @param      cluster_centers      The partition cluster center colors.
@@ -149,7 +144,6 @@ static void kmeans_init(
  */
 static void kmeans_assign(
 	const image_block& blk,
-	const error_weight_block& ewb,
 	unsigned int texel_count,
 	unsigned int partition_count,
 	const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
@@ -170,8 +164,7 @@ static void kmeans_assign(
 		for (unsigned int j = 0; j < partition_count; j++)
 		{
 			vfloat4 diff = color - cluster_centers[j];
-			diff = diff * ewb.error_weights[i];
-			float distance = dot_s(diff, diff);
+			float distance = dot_s(diff * diff, blk.channel_weight);
 			if (distance < best_distance)
 			{
 				best_distance = distance;
@@ -431,14 +424,12 @@ static void get_partition_ordering_by_mismatch_bits(
  *
  * @param      bsd                  The block size information.
  * @param      blk                  The image block color data to compress.
- * @param      ewb                  The image error weight block.
  * @param      partition_count      The desired number of partitions in the block.
  * @param[out] partition_ordering   The list of recommended partition indices, in priority order.
-  */
+ */
 static void compute_kmeans_partition_ordering(
 	const block_size_descriptor& bsd,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	unsigned int partition_count,
 	unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
 ) {
@@ -450,14 +441,14 @@ static void compute_kmeans_partition_ordering(
 	{
 		if (i == 0)
 		{
-			kmeans_init(blk, ewb, bsd.texel_count, partition_count, cluster_centers);
+			kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers);
 		}
 		else
 		{
 			kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
 		}
 
-		kmeans_assign(blk, ewb, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
+		kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
 	}
 
 	// Construct the block bitmaps of texel assignments to each partition
@@ -482,11 +473,9 @@ static void compute_kmeans_partition_ordering(
 void find_best_partition_candidates(
 	const block_size_descriptor& bsd,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	unsigned int partition_count,
 	unsigned int partition_search_limit,
-	unsigned int& best_partition_uncor,
-	unsigned int& best_partition_samec
+	unsigned int best_partitions[2]
 ) {
 	// Constant used to estimate quantization error for a given partitioning; the optimal value for
 	// this depends on bitrate. These values have been determined empirically.
@@ -511,7 +500,7 @@ void find_best_partition_candidates(
 	weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
 
 	unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS];
-	compute_kmeans_partition_ordering(bsd, blk, ewb, partition_count, partition_sequence);
+	compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
 
 	bool uses_alpha = !blk.is_constant_channel(3);
 
@@ -540,7 +529,7 @@ void find_best_partition_candidates(
 			// Compute weighting to give to each component in each partition
 			partition_metrics pms[BLOCK_MAX_PARTITIONS];
 
-			compute_avgs_and_dirs_4_comp(pi, blk, ewb, pms);
+			compute_avgs_and_dirs_4_comp(pi, blk, pms);
 
 			line4 uncor_lines[BLOCK_MAX_PARTITIONS];
 			line4 samec_lines[BLOCK_MAX_PARTITIONS];
@@ -558,16 +547,14 @@ void find_best_partition_candidates(
 				uncor_lines[j].a = pm.avg;
 				uncor_lines[j].b = normalize_safe(pm.dir, unit4());
 
-				uncor_plines[j].amod = (uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b)) * pm.icolor_scale;
-				uncor_plines[j].bs   = uncor_lines[j].b * pm.color_scale;
-				uncor_plines[j].bis  = uncor_lines[j].b * pm.icolor_scale;
+				uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b);
+				uncor_plines[j].bs   = uncor_lines[j].b;
 
 				samec_lines[j].a = vfloat4::zero();
 				samec_lines[j].b = normalize_safe(pm.avg, unit4());
 
 				samec_plines[j].amod = vfloat4::zero();
-				samec_plines[j].bs   = samec_lines[j].b * pm.color_scale;
-				samec_plines[j].bis  = samec_lines[j].b * pm.icolor_scale;
+				samec_plines[j].bs   = samec_lines[j].b;
 			}
 
 			float uncor_error = 0.0f;
@@ -575,7 +562,6 @@ void find_best_partition_candidates(
 
 			compute_error_squared_rgba(pi,
 			                           blk,
-			                           ewb,
 			                           uncor_plines,
 			                           samec_plines,
 			                           uncor_line_lens,
@@ -595,20 +581,14 @@ void find_best_partition_candidates(
 
 			for (unsigned int j = 0; j < partition_count; j++)
 			{
-				partition_metrics& pm = pms[j];
-				float tpp = (float)(pi.partition_texel_count[j]);
-
-				vfloat4 ics = pm.icolor_scale;
-				vfloat4 error_weights = pm.error_weight * (tpp * weight_imprecision_estim);
-
-				vfloat4 uncor_vector = uncor_lines[j].b * uncor_line_lens[j] * ics;
-				vfloat4 samec_vector = samec_lines[j].b * samec_line_lens[j] * ics;
+				float tpp = static_cast<float>(pi.partition_texel_count[j]);
+				vfloat4 error_weights(tpp * weight_imprecision_estim);
 
-				uncor_vector = uncor_vector * uncor_vector;
-				samec_vector = samec_vector * samec_vector;
+				vfloat4 uncor_vector = uncor_lines[j].b * uncor_line_lens[j];
+				vfloat4 samec_vector = samec_lines[j].b * samec_line_lens[j];
 
-				uncor_error += dot_s(uncor_vector, error_weights);
-				samec_error += dot_s(samec_vector, error_weights);
+				uncor_error += dot_s(uncor_vector * uncor_vector, error_weights);
+				samec_error += dot_s(samec_vector * samec_vector, error_weights);
 			}
 
 			if (uncor_error < uncor_best_error)
@@ -647,7 +627,7 @@ void find_best_partition_candidates(
 
 			// Compute weighting to give to each component in each partition
 			partition_metrics pms[BLOCK_MAX_PARTITIONS];
-			compute_avgs_and_dirs_3_comp_rgb(pi, blk, ewb, pms);
+			compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
 
 			partition_lines3 plines[BLOCK_MAX_PARTITIONS];
 
@@ -662,13 +642,11 @@ void find_best_partition_candidates(
 				pl.samec_line.a = vfloat4::zero();
 				pl.samec_line.b = normalize_safe(pm.avg.swz<0, 1, 2>(), unit3());
 
-				pl.uncor_pline.amod = (pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b)) * pm.icolor_scale.swz<0, 1, 2, 3>();
-				pl.uncor_pline.bs   = (pl.uncor_line.b * pm.color_scale.swz<0, 1, 2, 3>());
-				pl.uncor_pline.bis  = (pl.uncor_line.b * pm.icolor_scale.swz<0, 1, 2, 3>());
+				pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b);
+				pl.uncor_pline.bs   = pl.uncor_line.b;
 
 				pl.samec_pline.amod = vfloat4::zero();
-				pl.samec_pline.bs   = (pl.samec_line.b * pm.color_scale.swz<0, 1, 2, 3>());
-				pl.samec_pline.bis  = (pl.samec_line.b * pm.icolor_scale.swz<0, 1, 2, 3>());
+				pl.samec_pline.bs   = pl.samec_line.b;
 			}
 
 			float uncor_error = 0.0f;
@@ -676,7 +654,6 @@ void find_best_partition_candidates(
 
 			compute_error_squared_rgb(pi,
 			                          blk,
-			                          ewb,
 			                          plines,
 			                          uncor_error,
 			                          samec_error);
@@ -693,25 +670,16 @@ void find_best_partition_candidates(
 
 			for (unsigned int j = 0; j < partition_count; j++)
 			{
-				partition_metrics& pm = pms[j];
 				partition_lines3& pl = plines[j];
 
-				float tpp = (float)(pi.partition_texel_count[j]);
-
-				vfloat4 ics = pm.icolor_scale;
-				ics.set_lane<3>(0.0f);
-
-				vfloat4 error_weights = pm.error_weight * (tpp * weight_imprecision_estim);
-				error_weights.set_lane<3>(0.0f);
-
-				vfloat4 uncor_vector = (pl.uncor_line.b * pl.uncor_line_len) * ics;
-				vfloat4 samec_vector = (pl.samec_line.b * pl.samec_line_len) * ics;
+				float tpp = static_cast<float>(pi.partition_texel_count[j]);
+				vfloat4 error_weights(tpp * weight_imprecision_estim);
 
-				uncor_vector = uncor_vector * uncor_vector;
-				samec_vector = samec_vector * samec_vector;
+				vfloat4 uncor_vector = pl.uncor_line.b * pl.uncor_line_len;
+				vfloat4 samec_vector = pl.samec_line.b * pl.samec_line_len;
 
-				uncor_error += dot3_s(uncor_vector, error_weights);
-				samec_error += dot3_s(samec_vector, error_weights);
+				uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights);
+				samec_error += dot3_s(samec_vector * samec_vector, error_weights);
 			}
 
 			if (uncor_error < uncor_best_error)
@@ -736,10 +704,24 @@ void find_best_partition_candidates(
 		}
 	}
 
-	best_partition_uncor = uncor_best_partition;
-
-	unsigned int index = samec_best_partitions[0] != uncor_best_partition ? 0 : 1;
-	best_partition_samec = samec_best_partitions[index];
+	// Same partition is best for both, so use this first unconditionally
+	if (uncor_best_partition == samec_best_partitions[0])
+	{
+		best_partitions[0] = samec_best_partitions[0];
+		best_partitions[1] = samec_best_partitions[1];
+	}
+	// Uncor is best
+	else if (uncor_best_error <= samec_best_errors[0])
+	{
+		best_partitions[0] = uncor_best_partition;
+		best_partitions[1] = samec_best_partitions[0];
+	}
+	// Samec is best
+	else
+	{
+		best_partitions[0] = samec_best_partitions[0];
+		best_partitions[1] = uncor_best_partition;
+	}
 }
 
 #endif
diff --git a/lib/astc-encoder/Source/astcenc_ideal_endpoints_and_weights.cpp b/lib/astc-encoder/Source/astcenc_ideal_endpoints_and_weights.cpp
index 46783c8e6f..ce2dd8ba1d 100644
--- a/lib/astc-encoder/Source/astcenc_ideal_endpoints_and_weights.cpp
+++ b/lib/astc-encoder/Source/astcenc_ideal_endpoints_and_weights.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2022 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -31,7 +31,6 @@
  *
  * @param      bsd         The block size information.
  * @param      blk         The image block color data to compress.
- * @param      ewb         The image block weighted error data.
  * @param      pi          The partition info for the current trial.
  * @param[out] ei          The computed ideal endpoints and weights.
  * @param      component   The color component to compute.
@@ -39,7 +38,6 @@
 static void compute_ideal_colors_and_weights_1_comp(
 	const block_size_descriptor& bsd,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	const partition_info& pi,
 	endpoints_and_weights& ei,
 	unsigned int component
@@ -54,78 +52,73 @@ static void compute_ideal_colors_and_weights_1_comp(
 	float lowvalues[BLOCK_MAX_PARTITIONS] { 1e10f, 1e10f, 1e10f, 1e10f };
 	float highvalues[BLOCK_MAX_PARTITIONS] { -1e10f, -1e10f, -1e10f, -1e10f };
 
-	float partition_error_scale[BLOCK_MAX_PARTITIONS];
-	float linelengths_rcp[BLOCK_MAX_PARTITIONS];
+	float length_squared[BLOCK_MAX_PARTITIONS];
+	float scale[BLOCK_MAX_PARTITIONS];
 
-	const float *error_weights = nullptr;
+	float error_weight;
 	const float* data_vr = nullptr;
 
 	assert(component < BLOCK_MAX_COMPONENTS);
 	switch (component)
 	{
 	case 0:
-		error_weights = ewb.texel_weight_r;
+		error_weight = blk.channel_weight.lane<0>();
 		data_vr = blk.data_r;
 		break;
 	case 1:
-		error_weights = ewb.texel_weight_g;
+		error_weight = blk.channel_weight.lane<1>();
 		data_vr = blk.data_g;
 		break;
 	case 2:
-		error_weights = ewb.texel_weight_b;
+		error_weight = blk.channel_weight.lane<2>();
 		data_vr = blk.data_b;
 		break;
 	default:
-		error_weights = ewb.texel_weight_a;
+		error_weight = blk.channel_weight.lane<3>();
 		data_vr = blk.data_a;
 		break;
 	}
 
 	for (int i = 0; i < texel_count; i++)
 	{
-		if (error_weights[i] > 1e-10f)
-		{
-			float value = data_vr[i];
-			int partition = pi.partition_of_texel[i];
+		float value = data_vr[i];
+		int partition = pi.partition_of_texel[i];
 
-			lowvalues[partition] = astc::min(value, lowvalues[partition]);
-			highvalues[partition] = astc::max(value, highvalues[partition]);
-		}
+		lowvalues[partition] = astc::min(value, lowvalues[partition]);
+		highvalues[partition] = astc::max(value, highvalues[partition]);
 	}
 
 	vmask4 sep_mask = vint4::lane_id() == vint4(component);
 	for (int i = 0; i < partition_count; i++)
 	{
-		float diff = highvalues[i] - lowvalues[i];
-
-		if (diff < 0)
+		float length = highvalues[i] - lowvalues[i];
+		if (length < 0.0f)
 		{
 			lowvalues[i] = 0.0f;
 			highvalues[i] = 0.0f;
 		}
 
-		diff = astc::max(diff, 1e-7f);
-
-		partition_error_scale[i] = diff * diff;
-		linelengths_rcp[i] = 1.0f / diff;
+		length = astc::max(length, 1e-7f);
+		length_squared[i] = length * length;
+		scale[i] = 1.0f / length;
 
 		ei.ep.endpt0[i] = select(blk.data_min, vfloat4(lowvalues[i]), sep_mask);
 		ei.ep.endpt1[i] = select(blk.data_max, vfloat4(highvalues[i]), sep_mask);
 	}
 
 	bool is_constant_wes = true;
-	float constant_wes = partition_error_scale[pi.partition_of_texel[0]] * error_weights[0];
+	float constant_wes = length_squared[pi.partition_of_texel[0]] * error_weight;
 
 	for (int i = 0; i < texel_count; i++)
 	{
 		float value = data_vr[i];
 		int partition = pi.partition_of_texel[i];
 		value -= lowvalues[partition];
-		value *= linelengths_rcp[partition];
+		value *= scale[partition];
 		value = astc::clamp1f(value);
 
 		ei.weights[i] = value;
-		ei.weight_error_scale[i] = partition_error_scale[partition] * error_weights[i];
+		ei.weight_error_scale[i] = length_squared[partition] * error_weight;
 		assert(!astc::isnan(ei.weight_error_scale[i]));
 
 		is_constant_wes = is_constant_wes && ei.weight_error_scale[i] == constant_wes;
@@ -147,7 +140,6 @@ static void compute_ideal_colors_and_weights_1_comp(
  *
  * @param      bsd          The block size information.
  * @param      blk          The image block color data to compress.
- * @param      ewb          The image block weighted error data.
  * @param      pi           The partition info for the current trial.
  * @param[out] ei           The computed ideal endpoints and weights.
  * @param      component1   The first color component to compute.
@@ -156,7 +148,6 @@ static void compute_ideal_colors_and_weights_1_comp(
 static void compute_ideal_colors_and_weights_2_comp(
 	const block_size_descriptor& bsd,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	const partition_info& pi,
 	endpoints_and_weights& ei,
 	int component1,
@@ -171,24 +162,29 @@ static void compute_ideal_colors_and_weights_2_comp(
 
 	partition_metrics pms[BLOCK_MAX_PARTITIONS];
 
-	const float *error_weights;
+	float error_weight;
 	const float* data_vr = nullptr;
 	const float* data_vg = nullptr;
 	if (component1 == 0 && component2 == 1)
 	{
-		error_weights = ewb.texel_weight_rg;
+		error_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f;
+
 		data_vr = blk.data_r;
 		data_vg = blk.data_g;
 	}
 	else if (component1 == 0 && component2 == 2)
 	{
-		error_weights = ewb.texel_weight_rb;
+		error_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f;
+
 		data_vr = blk.data_r;
 		data_vg = blk.data_b;
 	}
 	else // (component1 == 1 && component2 == 2)
 	{
-		error_weights = ewb.texel_weight_gb;
+		assert(component1 == 1 && component2 == 2);
+
+		error_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f;
+
 		data_vr = blk.data_g;
 		data_vg = blk.data_b;
 	}
@@ -200,7 +196,7 @@ static void compute_ideal_colors_and_weights_2_comp(
 	float scale[BLOCK_MAX_PARTITIONS];
 	float length_squared[BLOCK_MAX_PARTITIONS];
 
-	compute_avgs_and_dirs_2_comp(pi, blk, ewb, component1, component2, pms);
+	compute_avgs_and_dirs_2_comp(pi, blk, component1, component2, pms);
 
 	for (int i = 0; i < partition_count; i++)
 	{
@@ -216,21 +212,14 @@ static void compute_ideal_colors_and_weights_2_comp(
 
 	for (int i = 0; i < texel_count; i++)
 	{
-		if (error_weights[i] > 1e-10f)
-		{
-			int partition = pi.partition_of_texel[i];
-			vfloat4 point = vfloat2(data_vr[i], data_vg[i]) * pms[partition].color_scale.swz<0, 1>();
-			line2 l = lines[partition];
-			float param = dot_s(point - l.a, l.b);
-			ei.weights[i] = param;
-
-			lowparam[partition] = astc::min(param, lowparam[partition]);
-			highparam[partition] = astc::max(param, highparam[partition]);
-		}
-		else
-		{
-			ei.weights[i] = -1e38f;
-		}
+		int partition = pi.partition_of_texel[i];
+		vfloat4 point = vfloat2(data_vr[i], data_vg[i]);
+		line2 l = lines[partition];
+		float param = dot_s(point - l.a, l.b);
+		ei.weights[i] = param;
+
+		lowparam[partition] = astc::min(param, lowparam[partition]);
+		highparam[partition] = astc::max(param, highparam[partition]);
 	}
 
 	vfloat4 lowvalues[BLOCK_MAX_PARTITIONS];
@@ -242,7 +231,7 @@ static void compute_ideal_colors_and_weights_2_comp(
 		if (length < 0.0f) // Case for when none of the texels had any weight
 		{
 			lowparam[i] = 0.0f;
-			highparam[i] = 1e-7f;
+			highparam[i] = 0.0f;
 		}
 
 		// It is possible for a uniform-color partition to produce length=0; this causes NaN issues
@@ -251,17 +240,11 @@ static void compute_ideal_colors_and_weights_2_comp(
 		length_squared[i] = length * length;
 		scale[i] = 1.0f / length;
 
-		vfloat4 ep0 = lines[i].a + lines[i].b * lowparam[i];
-		vfloat4 ep1 = lines[i].a + lines[i].b * highparam[i];
-
-		ep0 = ep0.swz<0, 1>() / pms[i].color_scale;
-
-		ep1 = ep1.swz<0, 1>() / pms[i].color_scale;
-
-		lowvalues[i] = ep0;
-		highvalues[i] = ep1;
+		lowvalues[i] = lines[i].a + lines[i].b * lowparam[i];
+		highvalues[i] = lines[i].a + lines[i].b * highparam[i];
 	}
 
+	// TODO: Merge this into loop above?
 	vmask4 comp1_mask = vint4::lane_id() == vint4(component1);
 	vmask4 comp2_mask = vint4::lane_id() == vint4(component2);
 	for (int i = 0; i < partition_count; i++)
@@ -274,7 +257,7 @@ static void compute_ideal_colors_and_weights_2_comp(
 	}
 
 	bool is_constant_wes = true;
-	float constant_wes = length_squared[pi.partition_of_texel[0]] * error_weights[0];
+	float constant_wes = length_squared[pi.partition_of_texel[0]] * error_weight;
 
 	for (int i = 0; i < texel_count; i++)
 	{
@@ -283,7 +266,7 @@ static void compute_ideal_colors_and_weights_2_comp(
 		idx = astc::clamp1f(idx);
 
 		ei.weights[i] = idx;
-		ei.weight_error_scale[i] = length_squared[partition] * error_weights[i];
+		ei.weight_error_scale[i] = length_squared[partition] * error_weight;
 		assert(!astc::isnan(ei.weight_error_scale[i]));
 
 		is_constant_wes = is_constant_wes && ei.weight_error_scale[i] == constant_wes;
@@ -305,7 +288,6 @@ static void compute_ideal_colors_and_weights_2_comp(
  *
  * @param      bsd                 The block size information.
  * @param      blk                 The image block color data to compress.
- * @param      ewb                 The image block weighted error data.
  * @param      pi                  The partition info for the current trial.
  * @param[out] ei                  The computed ideal endpoints and weights.
  * @param      omitted_component   The color component excluded from the calculation.
@@ -313,7 +295,6 @@ static void compute_ideal_colors_and_weights_2_comp(
 static void compute_ideal_colors_and_weights_3_comp(
 	const block_size_descriptor& bsd,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	const partition_info& pi,
 	endpoints_and_weights& ei,
 	unsigned int omitted_component
@@ -327,34 +308,34 @@ static void compute_ideal_colors_and_weights_3_comp(
 
 	partition_metrics pms[BLOCK_MAX_PARTITIONS];
 
-	const float *error_weights;
+	float error_weight;
 	const float* data_vr = nullptr;
 	const float* data_vg = nullptr;
 	const float* data_vb = nullptr;
 	if (omitted_component == 0)
 	{
-		error_weights = ewb.texel_weight_gba;
+		error_weight = hadd_s(blk.channel_weight.swz<1, 2, 3>()) / 3.0f; // R omitted: average the G, B, A weights
 		data_vr = blk.data_g;
 		data_vg = blk.data_b;
 		data_vb = blk.data_a;
 	}
 	else if (omitted_component == 1)
 	{
-		error_weights = ewb.texel_weight_rba;
+		error_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>()) / 3.0f;
 		data_vr = blk.data_r;
 		data_vg = blk.data_b;
 		data_vb = blk.data_a;
 	}
 	else if (omitted_component == 2)
 	{
-		error_weights = ewb.texel_weight_rga;
+		error_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>()) / 3.0f;
 		data_vr = blk.data_r;
 		data_vg = blk.data_g;
 		data_vb = blk.data_a;
 	}
 	else
 	{
-		error_weights = ewb.texel_weight_rgb;
+		error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()) / 3.0f;
 		data_vr = blk.data_r;
 		data_vg = blk.data_g;
 		data_vb = blk.data_b;
@@ -367,7 +348,7 @@ static void compute_ideal_colors_and_weights_3_comp(
 	float scale[BLOCK_MAX_PARTITIONS];
 	float length_squared[BLOCK_MAX_PARTITIONS];
 
-	compute_avgs_and_dirs_3_comp(pi, blk, ewb, omitted_component, pms);
+	compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms);
 
 	for (unsigned int i = 0; i < partition_count; i++)
 	{
@@ -383,27 +364,20 @@ static void compute_ideal_colors_and_weights_3_comp(
 
 	for (unsigned int i = 0; i < texel_count; i++)
 	{
-		if (error_weights[i] > 1e-10f)
-		{
-			int partition = pi.partition_of_texel[i];
-			vfloat4 point = vfloat3(data_vr[i], data_vg[i], data_vb[i]) * pms[partition].color_scale;
-			line3 l = lines[partition];
-			float param = dot3_s(point - l.a, l.b);
-			ei.weights[i] = param;
-
-			lowparam[partition] = astc::min(param, lowparam[partition]);
-			highparam[partition] = astc::max(param, highparam[partition]);
-		}
-		else
-		{
-			ei.weights[i] = -1e38f;
-		}
+		int partition = pi.partition_of_texel[i];
+		vfloat4 point = vfloat3(data_vr[i], data_vg[i], data_vb[i]);
+		line3 l = lines[partition];
+		float param = dot3_s(point - l.a, l.b);
+		ei.weights[i] = param;
+
+		lowparam[partition] = astc::min(param, lowparam[partition]);
+		highparam[partition] = astc::max(param, highparam[partition]);
 	}
 
 	for (unsigned int i = 0; i < partition_count; i++)
 	{
 		float length = highparam[i] - lowparam[i];
-		if (length < 0)			// Case for when none of the texels had any weight
+		if (length < 0.0f) // Case for when none of the texels had any weight
 		{
 			lowparam[i] = 0.0f;
 			highparam[i] = 1e-7f;
@@ -412,16 +386,12 @@ static void compute_ideal_colors_and_weights_3_comp(
 		// It is possible for a uniform-color partition to produce length=0; this causes NaN issues
 		// so set to a small value to avoid this problem.
 		length = astc::max(length, 1e-7f);
-
 		length_squared[i] = length * length;
 		scale[i] = 1.0f / length;
 
 		vfloat4 ep0 = lines[i].a + lines[i].b * lowparam[i];
 		vfloat4 ep1 = lines[i].a + lines[i].b * highparam[i];
 
-		ep0 = ep0 * pms[i].icolor_scale;
-		ep1 = ep1 * pms[i].icolor_scale;
-
 		vfloat4 bmin = blk.data_min;
 		vfloat4 bmax = blk.data_max;
 
@@ -449,7 +419,7 @@ static void compute_ideal_colors_and_weights_3_comp(
 
 
 	bool is_constant_wes = true;
-	float constant_wes = length_squared[pi.partition_of_texel[0]] * error_weights[0];
+	float constant_wes = length_squared[pi.partition_of_texel[0]] * error_weight;
 
 	for (unsigned int i = 0; i < texel_count; i++)
 	{
@@ -458,7 +428,7 @@ static void compute_ideal_colors_and_weights_3_comp(
 		idx = astc::clamp1f(idx);
 
 		ei.weights[i] = idx;
-		ei.weight_error_scale[i] = length_squared[partition] * error_weights[i];
+		ei.weight_error_scale[i] = length_squared[partition] * error_weight;
 		assert(!astc::isnan(ei.weight_error_scale[i]));
 
 		is_constant_wes = is_constant_wes && ei.weight_error_scale[i] == constant_wes;
@@ -480,18 +450,16 @@ static void compute_ideal_colors_and_weights_3_comp(
  *
  * @param      bsd                 The block size information.
  * @param      blk                 The image block color data to compress.
- * @param      ewb                 The image block weighted error data.
  * @param      pi                  The partition info for the current trial.
  * @param[out] ei                  The computed ideal endpoints and weights.
  */
 static void compute_ideal_colors_and_weights_4_comp(
 	const block_size_descriptor& bsd,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	const partition_info& pi,
 	endpoints_and_weights& ei
 ) {
-	const float *error_weights = ewb.texel_weight;
+	const float error_weight = hadd_s(blk.channel_weight) / 4.0f;
 
 	int partition_count = pi.partition_count;
 
@@ -509,7 +477,7 @@ static void compute_ideal_colors_and_weights_4_comp(
 
 	partition_metrics pms[BLOCK_MAX_PARTITIONS];
 
-	compute_avgs_and_dirs_4_comp(pi, blk, ewb, pms);
+	compute_avgs_and_dirs_4_comp(pi, blk, pms);
 
 	// If the direction points from light to dark then flip so ep0 is darkest
 	for (int i = 0; i < partition_count; i++)
@@ -526,50 +494,39 @@ static void compute_ideal_colors_and_weights_4_comp(
 
 	for (int i = 0; i < texel_count; i++)
 	{
-		if (error_weights[i] > 1e-10f)
-		{
-			int partition = pi.partition_of_texel[i];
+		int partition = pi.partition_of_texel[i];
 
-			vfloat4 point = blk.texel(i) * pms[partition].color_scale;
-			line4 l = lines[partition];
+		vfloat4 point = blk.texel(i);
+		line4 l = lines[partition];
 
-			float param = dot_s(point - l.a, l.b);
-			ei.weights[i] = param;
+		float param = dot_s(point - l.a, l.b);
+		ei.weights[i] = param;
 
-			lowparam[partition] = astc::min(param, lowparam[partition]);
-			highparam[partition] = astc::max(param, highparam[partition]);
-		}
-		else
-		{
-			ei.weights[i] = -1e38f;
-		}
+		lowparam[partition] = astc::min(param, lowparam[partition]);
+		highparam[partition] = astc::max(param, highparam[partition]);
 	}
 
 	for (int i = 0; i < partition_count; i++)
 	{
 		float length = highparam[i] - lowparam[i];
-		if (length < 0)
+		if (length < 0.0f) // Case for when none of the texels had any weight
 		{
 			lowparam[i] = 0.0f;
-			highparam[i] = 1e-7f;
+			highparam[i] = 0.0f;
 		}
 
 		// It is possible for a uniform-color partition to produce length=0; this causes NaN issues
 		// so set to a small value to avoid this problem.
 		length = astc::max(length, 1e-7f);
-
 		length_squared[i] = length * length;
 		scale[i] = 1.0f / length;
 
-		vfloat4 ep0 = lines[i].a + lines[i].b * lowparam[i];
-		vfloat4 ep1 = lines[i].a + lines[i].b * highparam[i];
-
-		ei.ep.endpt0[i] = ep0 * pms[i].icolor_scale;
-		ei.ep.endpt1[i] = ep1 * pms[i].icolor_scale;
+		ei.ep.endpt0[i] = lines[i].a + lines[i].b * lowparam[i];
+		ei.ep.endpt1[i] = lines[i].a + lines[i].b * highparam[i];
 	}
 
 	bool is_constant_wes = true;
-	float constant_wes = length_squared[pi.partition_of_texel[0]] * error_weights[0];
+	float constant_wes = length_squared[pi.partition_of_texel[0]] * error_weight;
 
 	for (int i = 0; i < texel_count; i++)
 	{
@@ -578,7 +535,7 @@ static void compute_ideal_colors_and_weights_4_comp(
 		idx = astc::clamp1f(idx);
 
 		ei.weights[i] = idx;
-		ei.weight_error_scale[i] = error_weights[i] * length_squared[partition];
+		ei.weight_error_scale[i] = length_squared[partition] * error_weight;
 		assert(!astc::isnan(ei.weight_error_scale[i]));
 
 		is_constant_wes = is_constant_wes && ei.weight_error_scale[i] == constant_wes;
@@ -599,7 +556,6 @@ static void compute_ideal_colors_and_weights_4_comp(
 void compute_ideal_colors_and_weights_1plane(
 	const block_size_descriptor& bsd,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	const partition_info& pi,
 	endpoints_and_weights& ei
 ) {
@@ -607,11 +563,11 @@ void compute_ideal_colors_and_weights_1plane(
 
 	if (uses_alpha)
 	{
-		compute_ideal_colors_and_weights_4_comp(bsd, blk, ewb, pi, ei);
+		compute_ideal_colors_and_weights_4_comp(bsd, blk, pi, ei);
 	}
 	else
 	{
-		compute_ideal_colors_and_weights_3_comp(bsd, blk, ewb,  pi, ei, 3);
+		compute_ideal_colors_and_weights_3_comp(bsd, blk, pi, ei, 3);
 	}
 }
 
@@ -619,7 +575,6 @@ void compute_ideal_colors_and_weights_1plane(
 void compute_ideal_colors_and_weights_2planes(
 	const block_size_descriptor& bsd,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	unsigned int plane2_component,
 	endpoints_and_weights& ei1,
 	endpoints_and_weights& ei2
@@ -633,43 +588,43 @@ void compute_ideal_colors_and_weights_2planes(
 	case 0: // Separate weights for red
 		if (uses_alpha)
 		{
-			compute_ideal_colors_and_weights_3_comp(bsd, blk, ewb, pi, ei1, 0);
+			compute_ideal_colors_and_weights_3_comp(bsd, blk, pi, ei1, 0);
 		}
 		else
 		{
-			compute_ideal_colors_and_weights_2_comp(bsd, blk, ewb, pi, ei1, 1, 2);
+			compute_ideal_colors_and_weights_2_comp(bsd, blk, pi, ei1, 1, 2);
 		}
-		compute_ideal_colors_and_weights_1_comp(bsd, blk, ewb, pi, ei2, 0);
+		compute_ideal_colors_and_weights_1_comp(bsd, blk, pi, ei2, 0);
 		break;
 
 	case 1: // Separate weights for green
 		if (uses_alpha)
 		{
-			compute_ideal_colors_and_weights_3_comp(bsd,blk, ewb,  pi, ei1, 1);
+			compute_ideal_colors_and_weights_3_comp(bsd,blk, pi, ei1, 1);
 		}
 		else
 		{
-			compute_ideal_colors_and_weights_2_comp(bsd, blk, ewb, pi, ei1, 0, 2);
+			compute_ideal_colors_and_weights_2_comp(bsd, blk, pi, ei1, 0, 2);
 		}
-		compute_ideal_colors_and_weights_1_comp(bsd, blk, ewb, pi, ei2, 1);
+		compute_ideal_colors_and_weights_1_comp(bsd, blk, pi, ei2, 1);
 		break;
 
 	case 2: // Separate weights for blue
 		if (uses_alpha)
 		{
-			compute_ideal_colors_and_weights_3_comp(bsd, blk, ewb, pi, ei1, 2);
+			compute_ideal_colors_and_weights_3_comp(bsd, blk, pi, ei1, 2);
 		}
 		else
 		{
-			compute_ideal_colors_and_weights_2_comp(bsd, blk, ewb, pi, ei1, 0, 1);
+			compute_ideal_colors_and_weights_2_comp(bsd, blk, pi, ei1, 0, 1);
 		}
-		compute_ideal_colors_and_weights_1_comp(bsd, blk, ewb, pi, ei2, 2);
+		compute_ideal_colors_and_weights_1_comp(bsd, blk, pi, ei2, 2);
 		break;
 
 	default: // Separate weights for alpha
 		assert(uses_alpha);
-		compute_ideal_colors_and_weights_3_comp(bsd, blk, ewb, pi, ei1, 3);
-		compute_ideal_colors_and_weights_1_comp(bsd, blk, ewb, pi, ei2, 3);
+		compute_ideal_colors_and_weights_3_comp(bsd, blk, pi, ei1, 3);
+		compute_ideal_colors_and_weights_1_comp(bsd, blk, pi, ei2, 3);
 		break;
 	}
 }
@@ -1098,9 +1053,9 @@ static inline vfloat4 compute_rgbo_vector(
 }
 
 /* See header for documentation. */
+// TODO: Specialize for 1 partition?
 void recompute_ideal_colors_1plane(
 	const image_block& blk,
-	const error_weight_block& ewb,
 	const partition_info& pi,
 	const decimation_info& di,
 	int weight_quant_mode,
@@ -1127,24 +1082,21 @@ void recompute_ideal_colors_1plane(
 	for (int i = 0; i < partition_count; i++)
 	{
 		vfloat4 rgba_sum(1e-17f);
-		vfloat4 rgba_weight_sum(1e-17f);
 
 		unsigned int texel_count = pi.partition_texel_count[i];
 		const uint8_t *texel_indexes = pi.texels_of_partition[i];
 
+		// TODO: Use gathers?
 		promise(texel_count > 0);
 		for (unsigned int j = 0; j < texel_count; j++)
 		{
 			unsigned int tix = texel_indexes[j];
-
-			vfloat4 rgba = blk.texel(tix);
-			vfloat4 error_weight = ewb.error_weights[tix];
-
-			rgba_sum += rgba * error_weight;
-			rgba_weight_sum += error_weight;
+			rgba_sum += blk.texel(tix);
 		}
 
-		vfloat4 scale_direction = normalize((rgba_sum * (1.0f / rgba_weight_sum)).swz<0, 1, 2>());
+		rgba_sum = rgba_sum * blk.channel_weight;
+		vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
+		vfloat4 scale_dir = normalize((rgba_sum / rgba_weight_sum).swz<0, 1, 2>());
 
 		float scale_max = 0.0f;
 		float scale_min = 1e10f;
@@ -1152,28 +1104,25 @@ void recompute_ideal_colors_1plane(
 		float wmin1 = 1.0f;
 		float wmax1 = 0.0f;
 
-		vfloat4 left_sum    = vfloat4::zero();
-		vfloat4 middle_sum  = vfloat4::zero();
-		vfloat4 right_sum   = vfloat4::zero();
-		vfloat4 lmrs_sum    = vfloat4::zero();
+		float left_sum_s = 0.0f;
+		float middle_sum_s = 0.0f;
+		float right_sum_s = 0.0f;
 
 		vfloat4 color_vec_x = vfloat4::zero();
 		vfloat4 color_vec_y = vfloat4::zero();
 
 		vfloat4 scale_vec = vfloat4::zero();
 
-		vfloat4 weight_weight_sum = vfloat4(1e-17f);
-		float psum = 1e-17f;
+		float weight_weight_sum_s = 1e-17f;
+
+		vfloat4 color_weight = blk.channel_weight;
+		float ls_weight = hadd_rgb_s(color_weight);
 
 		for (unsigned int j = 0; j < texel_count; j++)
 		{
 			unsigned int tix = texel_indexes[j];
 
 			vfloat4 rgba = blk.texel(tix);
-			vfloat4 color_weight = ewb.error_weights[tix];
-
-			// TODO: Move this calculation out to the color block?
-			float ls_weight = hadd_rgb_s(color_weight);
 
 			float idx0;
 			if (!is_decimated)
@@ -1190,54 +1139,41 @@ void recompute_ideal_colors_1plane(
 			wmin1 = astc::min(idx0, wmin1);
 			wmax1 = astc::max(idx0, wmax1);
 
-			float scale = dot3_s(scale_direction, rgba);
+			float scale = dot3_s(scale_dir, rgba);
 			scale_min = astc::min(scale, scale_min);
 			scale_max = astc::max(scale, scale_max);
 
-			vfloat4 left   = color_weight * (om_idx0 * om_idx0);
-			vfloat4 middle = color_weight * (om_idx0 * idx0);
-			vfloat4 right  = color_weight * (idx0 * idx0);
-
-			vfloat4 lmrs = vfloat3(om_idx0 * om_idx0,
-			                       om_idx0 * idx0,
-			                       idx0 * idx0) * ls_weight;
-
-			left_sum   += left;
-			middle_sum += middle;
-			right_sum  += right;
-			lmrs_sum   += lmrs;
+			left_sum_s   += om_idx0 * om_idx0;
+			middle_sum_s += om_idx0 * idx0;
+			right_sum_s  += idx0 * idx0;
+			weight_weight_sum_s += idx0;
 
 			vfloat4 color_idx(idx0);
-			vfloat4 cwprod = color_weight * rgba;
+			vfloat4 cwprod = rgba;
 			vfloat4 cwiprod = cwprod * color_idx;
 
 			color_vec_y += cwiprod;
 			color_vec_x += cwprod - cwiprod;
 
-			scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale);
-			weight_weight_sum += color_weight * color_idx;
-			psum += dot3_s(color_weight * color_idx, color_idx);
+			scale_vec += vfloat2(om_idx0, idx0) * (scale * ls_weight);
 		}
 
-		// Calculations specific to mode #7, the HDR RGB-scale mode
-		vfloat4 rgbq_sum = color_vec_x + color_vec_y;
-		rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
+		vfloat4 left_sum   = vfloat4(left_sum_s) * color_weight;
+		vfloat4 middle_sum = vfloat4(middle_sum_s) * color_weight;
+		vfloat4 right_sum  = vfloat4(right_sum_s) * color_weight;
+		vfloat4 lmrs_sum   = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight;
 
-		vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum,
-		                                  rgbq_sum, psum);
-		rgbo_vectors[i] = rgbovec;
+		vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight;
+		float psum = right_sum_s * hadd_rgb_s(color_weight);
 
-		// We will occasionally get a failure due to the use of a singular (non-invertible) matrix.
-		// Record whether such a failure has taken place; if it did, compute rgbo_vectors[] with a
-		// different method later
-		float chkval = dot_s(rgbovec, rgbovec);
-		int rgbo_fail = chkval != chkval;
+		color_vec_x = color_vec_x * color_weight;
+		color_vec_y = color_vec_y * color_weight;
 
 		// Initialize the luminance and scale vectors with a reasonable default
 		float scalediv = scale_min * (1.0f / astc::max(scale_max, 1e-10f));
 		scalediv = astc::clamp1f(scalediv);
 
-		vfloat4 sds = scale_direction * scale_max;
+		vfloat4 sds = scale_dir * scale_max;
 
 		rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
 
@@ -1245,7 +1181,7 @@ void recompute_ideal_colors_1plane(
 		{
 			// If all weights in the partition were equal, then just take average of all colors in
 			// the partition and use that as both endpoint colors
-			vfloat4 avg = (color_vec_x + color_vec_y) * (1.0f / rgba_weight_sum);
+			vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
 
 			vmask4 notnan_mask = avg == avg;
 			ep.endpt0[i] = select(ep.endpt0[i], avg, notnan_mask);
@@ -1287,13 +1223,21 @@ void recompute_ideal_colors_1plane(
 			if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
 			{
 				float scalediv2 = scale_ep0 * (1.0f / scale_ep1);
-				vfloat4 sdsm = scale_direction * scale_ep1;
+				vfloat4 sdsm = scale_dir * scale_ep1;
 				rgbs_vectors[i] = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
 			}
 		}
 
-		// If the calculation of an RGB-offset vector failed, try to compute a value another way
-		if (rgbo_fail)
+		// Calculations specific to mode #7, the HDR RGB-scale mode
+		vfloat4 rgbq_sum = color_vec_x + color_vec_y;
+		rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
+
+		vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
+		rgbo_vectors[i] = rgbovec;
+
+		// We can get a failure due to the use of a singular (non-invertible) matrix
+		// If it failed, compute rgbo_vectors[] with a different method ...
+		if (astc::isnan(dot_s(rgbovec, rgbovec)))
 		{
 			vfloat4 v0 = ep.endpt0[i];
 			vfloat4 v1 = ep.endpt1[i];
@@ -1303,7 +1247,6 @@ void recompute_ideal_colors_1plane(
 
 			vfloat4 avg = (v0 + v1) * 0.5f;
 			vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
-
 			rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
 		}
 	}
@@ -1312,7 +1255,6 @@ void recompute_ideal_colors_1plane(
 /* See header for documentation. */
 void recompute_ideal_colors_2planes(
 	const image_block& blk,
-	const error_weight_block& ewb,
 	const block_size_descriptor& bsd,
 	const decimation_info& di,
 	int weight_quant_mode,
@@ -1340,28 +1282,26 @@ void recompute_ideal_colors_2planes(
 		dec_weights_quant_uvalue_plane2[i] = qat->unquantized_value[dec_weights_quant_pvalue_plane2[i]] * (1.0f / 64.0f);
 	}
 
-	vfloat4 rgba_sum = ewb.block_error_weighted_rgba_sum;
-	vfloat4 rgba_weight_sum = ewb.block_error_weight_sum;
-
 	unsigned int texel_count = bsd.texel_count;
-	vfloat4 scale_direction = normalize((rgba_sum * (1.0f / rgba_weight_sum)).swz<0, 1, 2>());
+	vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
+	vfloat4 scale_dir = normalize(blk.data_mean.swz<0, 1, 2>());
 
 	float scale_max = 0.0f;
 	float scale_min = 1e10f;
 
 	float wmin1 = 1.0f;
 	float wmax1 = 0.0f;
+
 	float wmin2 = 1.0f;
 	float wmax2 = 0.0f;
 
-	vfloat4 left_sum    = vfloat4::zero();
-	vfloat4 middle_sum  = vfloat4::zero();
-	vfloat4 right_sum   = vfloat4::zero();
+	float left1_sum_s = 0.0f;
+	float middle1_sum_s = 0.0f;
+	float right1_sum_s = 0.0f;
 
-	vfloat4 left2_sum   = vfloat4::zero();
-	vfloat4 middle2_sum = vfloat4::zero();
-	vfloat4 right2_sum  = vfloat4::zero();
-	vfloat4 lmrs_sum    = vfloat4::zero();
+	float left2_sum_s = 0.0f;
+	float middle2_sum_s = 0.0f;
+	float right2_sum_s = 0.0f;
 
 	vfloat4 color_vec_x = vfloat4::zero();
 	vfloat4 color_vec_y = vfloat4::zero();
@@ -1369,15 +1309,14 @@ void recompute_ideal_colors_2planes(
 	vfloat4 scale_vec = vfloat4::zero();
 
 	vfloat4 weight_weight_sum = vfloat4(1e-17f);
-	float psum = 1e-17f;
+
+	vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component);
+	vfloat4 color_weight = blk.channel_weight;
+	float ls_weight = hadd_rgb_s(color_weight);
 
 	for (unsigned int j = 0; j < texel_count; j++)
 	{
 		vfloat4 rgba = blk.texel(j);
-		vfloat4 color_weight = ewb.error_weights[j];
-
-		// TODO: Move this calculation out to the color block?
-		float ls_weight = hadd_rgb_s(color_weight);
 
 		float idx0;
 		if (!is_decimated)
@@ -1394,22 +1333,13 @@ void recompute_ideal_colors_2planes(
 		wmin1 = astc::min(idx0, wmin1);
 		wmax1 = astc::max(idx0, wmax1);
 
-		float scale = dot3_s(scale_direction, rgba);
+		float scale = dot3_s(scale_dir, rgba);
 		scale_min = astc::min(scale, scale_min);
 		scale_max = astc::max(scale, scale_max);
 
-		vfloat4 left   = color_weight * (om_idx0 * om_idx0);
-		vfloat4 middle = color_weight * (om_idx0 * idx0);
-		vfloat4 right  = color_weight * (idx0 * idx0);
-
-		vfloat4 lmrs = vfloat3(om_idx0 * om_idx0,
-		                       om_idx0 * idx0,
-		                       idx0 * idx0) * ls_weight;
-
-		left_sum   += left;
-		middle_sum += middle;
-		right_sum  += right;
-		lmrs_sum   += lmrs;
+		left1_sum_s   += om_idx0 * om_idx0;
+		middle1_sum_s += om_idx0 * idx0;
+		right1_sum_s  += idx0 * idx0;
 
 		float idx1;
 		if (!is_decimated)
@@ -1426,18 +1356,13 @@ void recompute_ideal_colors_2planes(
 		wmin2 = astc::min(idx1, wmin2);
 		wmax2 = astc::max(idx1, wmax2);
 
-		vfloat4 left2   = color_weight * (om_idx1 * om_idx1);
-		vfloat4 middle2 = color_weight * (om_idx1 * idx1);
-		vfloat4 right2  = color_weight * (idx1 * idx1);
-
-		left2_sum   += left2;
-		middle2_sum += middle2;
-		right2_sum  += right2;
+		left2_sum_s   += om_idx1 * om_idx1;
+		middle2_sum_s += om_idx1 * idx1;
+		right2_sum_s  += idx1 * idx1;
 
-		vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component);
 		vfloat4 color_idx = select(vfloat4(idx0), vfloat4(idx1), p2_mask);
 
-		vfloat4 cwprod = color_weight * rgba;
+		vfloat4 cwprod = rgba;
 		vfloat4 cwiprod = cwprod * color_idx;
 
 		color_vec_y += cwiprod;
@@ -1445,26 +1370,27 @@ void recompute_ideal_colors_2planes(
 
 		scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale);
 		weight_weight_sum += (color_weight * color_idx);
-		psum += dot3_s(color_weight * color_idx, color_idx);
 	}
 
-	// Calculations specific to mode #7, the HDR RGB-scale mode
-	vfloat4 rgbq_sum = color_vec_x + color_vec_y;
-	rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
+	vfloat4 left1_sum   = vfloat4(left1_sum_s) * color_weight;
+	vfloat4 middle1_sum = vfloat4(middle1_sum_s) * color_weight;
+	vfloat4 right1_sum  = vfloat4(right1_sum_s) * color_weight;
+	vfloat4 lmrs_sum    = vfloat3(left1_sum_s, middle1_sum_s, right1_sum_s) * ls_weight;
 
-	rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
+	vfloat4 left2_sum   = vfloat4(left2_sum_s) * color_weight;
+	vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight;
+	vfloat4 right2_sum  = vfloat4(right2_sum_s) * color_weight;
+
+	float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight);
 
-	// We will occasionally get a failure due to the use of a singular (non-invertible) matrix.
-	// Record whether such a failure has taken place; if it did, compute rgbo_vectors[] with a
-	// different method later
-	float chkval = dot_s(rgbo_vector, rgbo_vector);
-	int rgbo_fail = chkval != chkval;
+	color_vec_x = color_vec_x * color_weight;
+	color_vec_y = color_vec_y * color_weight;
 
 	// Initialize the luminance and scale vectors with a reasonable default
 	float scalediv = scale_min * (1.0f / astc::max(scale_max, 1e-10f));
 	scalediv = astc::clamp1f(scalediv);
 
-	vfloat4 sds = scale_direction * scale_max;
+	vfloat4 sds = scale_dir * scale_max;
 
 	rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
 
@@ -1472,7 +1398,7 @@ void recompute_ideal_colors_2planes(
 	{
 		// If all weights in the partition were equal, then just take average of all colors in
 		// the partition and use that as both endpoint colors
-		vfloat4 avg = (color_vec_x + color_vec_y) * (1.0f / rgba_weight_sum);
+		vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
 
 		vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
 		vmask4 notnan_mask = avg == avg;
@@ -1487,22 +1413,22 @@ void recompute_ideal_colors_2planes(
 	{
 		// Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
 		// set of texel weights and pixel colors
-		vfloat4 color_det1 = (left_sum * right_sum) - (middle_sum * middle_sum);
+		vfloat4 color_det1 = (left1_sum * right1_sum) - (middle1_sum * middle1_sum);
 		vfloat4 color_rdet1 = 1.0f / color_det1;
 
 		float ls_det1  = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
 		float ls_rdet1 = 1.0f / ls_det1;
 
-		vfloat4 color_mss1 = (left_sum * left_sum)
-		                   + (2.0f * middle_sum * middle_sum)
-		                   + (right_sum * right_sum);
+		vfloat4 color_mss1 = (left1_sum * left1_sum)
+		                   + (2.0f * middle1_sum * middle1_sum)
+		                   + (right1_sum * right1_sum);
 
 		float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
 		              + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
 		              + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
 
-		vfloat4 ep0 = (right_sum * color_vec_x - middle_sum * color_vec_y) * color_rdet1;
-		vfloat4 ep1 = (left_sum * color_vec_y - middle_sum * color_vec_x) * color_rdet1;
+		vfloat4 ep0 = (right1_sum * color_vec_x - middle1_sum * color_vec_y) * color_rdet1;
+		vfloat4 ep1 = (left1_sum * color_vec_y - middle1_sum * color_vec_x) * color_rdet1;
 
 		float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
 		float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
@@ -1518,7 +1444,7 @@ void recompute_ideal_colors_2planes(
 		if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
 		{
 			float scalediv2 = scale_ep0 * (1.0f / scale_ep1);
-			vfloat4 sdsm = scale_direction * scale_ep1;
+			vfloat4 sdsm = scale_dir * scale_ep1;
 			rgbs_vector = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
 		}
 	}
@@ -1527,9 +1453,8 @@ void recompute_ideal_colors_2planes(
 	{
 		// If all weights in the partition were equal, then just take average of all colors in
 		// the partition and use that as both endpoint colors
-		vfloat4 avg = (color_vec_x + color_vec_y) * (1.0f / rgba_weight_sum);
+		vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
 
-		vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component);
 		vmask4 notnan_mask = avg == avg;
 		vmask4 full_mask = p2_mask & notnan_mask;
 
@@ -1550,7 +1475,6 @@ void recompute_ideal_colors_2planes(
 		vfloat4 ep0 = (right2_sum * color_vec_x - middle2_sum * color_vec_y) * color_rdet2;
 		vfloat4 ep1 = (left2_sum * color_vec_y - middle2_sum * color_vec_x) * color_rdet2;
 
-		vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component);
 		vmask4 det_mask = abs(color_det2) > (color_mss2 * 1e-4f);
 		vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
 		vmask4 full_mask = p2_mask & det_mask & notnan_mask;
@@ -1559,8 +1483,15 @@ void recompute_ideal_colors_2planes(
 		ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
 	}
 
-	// If the calculation of an RGB-offset vector failed, try to compute a value another way
-	if (rgbo_fail)
+	// Calculations specific to mode #7, the HDR RGB-scale mode
+	vfloat4 rgbq_sum = color_vec_x + color_vec_y;
+	rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
+
+	rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
+
+	// We can get a failure due to the use of a singular (non-invertible) matrix
+	// If it failed, compute rgbo_vectors[] with a different method ...
+	if (astc::isnan(dot_s(rgbo_vector, rgbo_vector)))
 	{
 		vfloat4 v0 = ep.endpt0[0];
 		vfloat4 v1 = ep.endpt1[0];
diff --git a/lib/astc-encoder/Source/astcenc_image.cpp b/lib/astc-encoder/Source/astcenc_image.cpp
index f4c8e00f96..47af5714a7 100644
--- a/lib/astc-encoder/Source/astcenc_image.cpp
+++ b/lib/astc-encoder/Source/astcenc_image.cpp
@@ -173,6 +173,8 @@ void fetch_image_block(
 	int idx = 0;
 
 	vfloat4 data_min(1e38f);
+	vfloat4 data_mean(0.0f);
+	vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count));
 	vfloat4 data_max(-1e38f);
 	bool grayscale = true;
 
@@ -225,6 +227,7 @@ void fetch_image_block(
 
 				// Compute block metadata
 				data_min = min(data_min, datav);
+				data_mean += datav * data_mean_scale;
 				data_max = max(data_max, datav);
 
 				if (grayscale && (datav.lane<0>() != datav.lane<1>() || datav.lane<0>() != datav.lane<2>()))
@@ -259,6 +262,7 @@ void fetch_image_block(
 
 	// Store block metadata
 	blk.data_min = data_min;
+	blk.data_mean = data_mean;
 	blk.data_max = data_max;
 	blk.grayscale = grayscale;
 }
diff --git a/lib/astc-encoder/Source/astcenc_internal.h b/lib/astc-encoder/Source/astcenc_internal.h
index cf31cce6dd..5981fd1d02 100644
--- a/lib/astc-encoder/Source/astcenc_internal.h
+++ b/lib/astc-encoder/Source/astcenc_internal.h
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2022 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -60,7 +60,7 @@
 		#define promise(cond) if(!(cond)) { __builtin_unreachable(); }
 	#endif
 #else
-	#define promise(cond) assert(cond);
+	#define promise(cond) assert(cond)
 #endif
 
 /* ============================================================================
@@ -447,9 +447,10 @@ static inline unsigned int get_quant_level(quant_method method)
 	case QUANT_160: return 160;
 	case QUANT_192: return 192;
 	case QUANT_256: return 256;
-	// Unreachable - the enum is fully described
-	default:        return   0;
 	}
+
+	// Unreachable - the enum is fully described
+	return 0;
 }
 
 /**
@@ -457,15 +458,6 @@ static inline unsigned int get_quant_level(quant_method method)
  */
 struct partition_metrics
 {
-	/** @brief The sum of the error weights for texels in this partition. */
-	vfloat4 error_weight;
-
-	/** @brief The color scale factor used to weight color channels. */
-	vfloat4 color_scale;
-
-	/** @brief The 1 / color_scale used to avoid divisions. */
-	vfloat4 icolor_scale;
-
 	/** @brief The error-weighted average color in the partition. */
 	vfloat4 avg;
 
@@ -818,10 +810,16 @@ struct image_block
 	/** @brief The min component value of all texels in the block. */
 	vfloat4 data_min;
 
+	/** @brief The mean component value of all texels in the block. */
+	vfloat4 data_mean;
+
 	/** @brief The max component value of all texels in the block. */
 	vfloat4 data_max;
 
-	/** @brief Is this greyscale block where R == G == B for all texels? */
+	/** @brief The relative error significance of the color channels. */
+	vfloat4 channel_weight;
+
+	/** @brief Is this grayscale block where R == G == B for all texels? */
 	bool grayscale;
 
 	/** @brief Set to 1 if a texel is using HDR RGB endpoints (decompression only). */
@@ -923,85 +921,6 @@ struct image_block
 	}
 };
 
-/**
- * @brief Data structure representing per-texel and per-component error weights for a block.
- *
- * This structure stores a multiplier for the error weight to apply to each component when computing
- * block errors. This can be used as a general purpose technique to to amplify or diminish the
- * significance of texels and individual color components, based on what is being stored and the
- * compressor heuristics. It can be applied in many different ways, some of which are outlined in
- * the description below (this is not exhaustive).
- *
- * For blocks that span the edge of the texture, the weighting for texels outside of the texture
- * bounds can zeroed to maximize the quality of the texels inside the texture.
- *
- * For textures storing fewer than 4 components the weighting for color components that are unused
- * can be zeroed to maximize the quality of the components that are used. This is particularly
- * important for two component textures, which must be imported in LLLA format to match the two
- * component endpoint encoding. Without manual component weighting to correct significance the "L"
- * would be treated as three times more important than A because of the replication.
- *
- * For HDR textures we can use perceptual weighting which os approximately inverse to the luminance
- * of a texel.
- *
- * For normal maps we can use perceptual weighting which assigns higher weight to low-variability
- * regions than to high-variability regions, ensuring smooth surfaces don't pick up artifacts.
- *
- * For transparent texels we can multiply the RGB weights by the alpha value, ensuring that
- * the least transprent texels maintain the highest accuracy.
- */
-struct error_weight_block
-{
-	/** @brief Block error weighted RGBA sum for whole block / 1 partition. */
-	vfloat4 block_error_weighted_rgba_sum;
-
-	/** @brief Block error sum for whole block / 1 partition. */
-	vfloat4 block_error_weight_sum;
-
-	/** @brief The full per texel per component error weights. */
-	vfloat4 error_weights[BLOCK_MAX_TEXELS];
-
-
-	/** @brief The full per texel per component error weights. */
-	float texel_weight[BLOCK_MAX_TEXELS];
-
-
-	/** @brief The average of the GBA error weights per texel. */
-	float texel_weight_gba[BLOCK_MAX_TEXELS];
-
-	/** @brief The average of the RBA error weights per texel. */
-	float texel_weight_rba[BLOCK_MAX_TEXELS];
-
-	/** @brief The average of the RGA error weights per texel. */
-	float texel_weight_rga[BLOCK_MAX_TEXELS];
-
-	/** @brief The average of the RGB error weights per texel. */
-	float texel_weight_rgb[BLOCK_MAX_TEXELS];
-
-
-	/** @brief The average of the RG error weights per texel. */
-	float texel_weight_rg[BLOCK_MAX_TEXELS];
-
-	/** @brief The average of the RB error weights per texel. */
-	float texel_weight_rb[BLOCK_MAX_TEXELS];
-
-	/** @brief The average of the GB error weights per texel. */
-	float texel_weight_gb[BLOCK_MAX_TEXELS];
-
-
-	/** @brief The individual R component error weights per texel. */
-	float texel_weight_r[BLOCK_MAX_TEXELS];
-
-	/** @brief The individual G component error weights per texel. */
-	float texel_weight_g[BLOCK_MAX_TEXELS];
-
-	/** @brief The individual B component error weights per texel. */
-	float texel_weight_b[BLOCK_MAX_TEXELS];
-
-	/** @brief The individual A component error weights per texel. */
-	float texel_weight_a[BLOCK_MAX_TEXELS];
-};
-
 /**
  * @brief Data structure storing the color endpoints for a block.
  */
@@ -1076,9 +995,6 @@ struct alignas(ASTCENC_VECALIGN) compression_working_buffers
 	/** @brief Ideal decimated endpoints and weights for plane 2. */
 	endpoints_and_weights eix2[WEIGHTS_MAX_DECIMATION_MODES];
 
-	/** @brief The error weight block for the current thread. */
-	error_weight_block ewb;
-
 	/**
 	 * @brief Decimated ideal weight values.
 	 *
@@ -1240,26 +1156,17 @@ struct physical_compressed_block
  * This function takes a structure to avoid spilling arguments to the stack on every function
  * invocation, as there are a lot of parameters.
  */
-struct pixel_region_variance_args
+struct pixel_region_args
 {
 	/** @brief The image to analyze. */
 	const astcenc_image* img;
 
-	/** @brief The RGB component power adjustment. */
-	float rgb_power;
-
-	/** @brief The alpha component power adjustment. */
-	float alpha_power;
-
 	/** @brief The component swizzle pattern. */
 	astcenc_swizzle swz;
 
 	/** @brief Should the algorithm bother with Z axis processing? */
 	bool have_z;
 
-	/** @brief The kernel radius for average and variance. */
-	unsigned int avg_var_kernel_radius;
-
 	/** @brief The kernel radius for alpha processing. */
 	unsigned int alpha_kernel_radius;
 
@@ -1286,12 +1193,12 @@ struct pixel_region_variance_args
 };
 
 /**
- * @brief Parameter structure for @c compute_averages_and_variances_proc().
+ * @brief Parameter structure for @c compute_averages_proc().
  */
-struct avg_var_args
+struct avg_args
 {
 	/** @brief The arguments for the nested variance computation. */
-	pixel_region_variance_args arg;
+	pixel_region_args arg;
 
 	// The above has a reference to the image altread?
 	/** @brief The image X dimensions. */
@@ -1338,28 +1245,21 @@ struct astcenc_context
 	 * large structure size are omitted.
 	 */
 
-	/** @brief The input images averages table, may be @c nullptr if not needed. */
-	vfloat4 *input_averages;
-
-	/** @brief The input image RGBA channel variances table, may be @c nullptr if not needed. */
-	vfloat4 *input_variances;
-
-	/** @brief The input image alpha channel variances table, may be @c nullptr if not needed. */
+	/** @brief The input image alpha channel averages table, may be @c nullptr if not needed. */
 	float *input_alpha_averages;
 
-
 	/** @brief The scratch workign buffers, one per thread (see @c thread_count). */
 	compression_working_buffers* working_buffers;
 
 #if !defined(ASTCENC_DECOMPRESS_ONLY)
 	/** @brief The pixel region and variance worker arguments. */
-	avg_var_args avg_var_preprocess_args;
+	avg_args avg_preprocess_args;
 
 	/** @brief The per-texel deblocking weights for the current block size. */
 	float deblock_weights[BLOCK_MAX_TEXELS];
 
-	/** @brief The parallel manager for averages and variances computation. */
-	ParallelManager manage_avg_var;
+	/** @brief The parallel manager for averages computation. */
+	ParallelManager manage_avg;
 
 	/** @brief The parallel manager for compression. */
 	ParallelManager manage_compress;
@@ -1549,7 +1449,6 @@ unsigned int get_ise_sequence_bitcount(
  *
  * @param      pi           The partition info for the current trial.
  * @param      blk          The image block color data to be compressed.
- * @param      ewb          The image block weighted error data.
  * @param      component1   The first component included in the analysis.
  * @param      component2   The second component included in the analysis.
  * @param[out] pm           The output partition metrics.
@@ -1559,7 +1458,6 @@ unsigned int get_ise_sequence_bitcount(
 void compute_avgs_and_dirs_2_comp(
 	const partition_info& pi,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	unsigned int component1,
 	unsigned int component2,
 	partition_metrics pm[BLOCK_MAX_PARTITIONS]);
@@ -1569,7 +1467,6 @@ void compute_avgs_and_dirs_2_comp(
  *
  * @param      pi                  The partition info for the current trial.
  * @param      blk                 The image block color data to be compressed.
- * @param      ewb                 The image block weighted error data.
  * @param      omitted_component   The component excluded from the analysis.
  * @param[out] pm                  The output partition metrics.
  *                                 - Only pi.partition_count array entries actually get initialized.
@@ -1578,7 +1475,6 @@ void compute_avgs_and_dirs_2_comp(
 void compute_avgs_and_dirs_3_comp(
 	const partition_info& pi,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	unsigned int omitted_component,
 	partition_metrics pm[BLOCK_MAX_PARTITIONS]);
 
@@ -1590,7 +1486,6 @@ void compute_avgs_and_dirs_3_comp(
  *
  * @param      pi                  The partition info for the current trial.
  * @param      blk                 The image block color data to be compressed.
- * @param      ewb                 The image block weighted error data.
  * @param[out] pm                  The output partition metrics.
  *                                 - Only pi.partition_count array entries actually get initialized.
  *                                 - Direction vectors @c pm.dir are not normalized.
@@ -1598,7 +1493,6 @@ void compute_avgs_and_dirs_3_comp(
 void compute_avgs_and_dirs_3_comp_rgb(
 	const partition_info& pi,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	partition_metrics pm[BLOCK_MAX_PARTITIONS]);
 
 /**
@@ -1606,7 +1500,6 @@ void compute_avgs_and_dirs_3_comp_rgb(
  *
  * @param      pi    The partition info for the current trial.
  * @param      blk   The image block color data to be compressed.
- * @param      ewb   The image block weighted error data.
  * @param[out] pm    The output partition metrics.
  *                   - Only pi.partition_count array entries actually get initialized.
  *                   - Direction vectors @c pm.dir are not normalized.
@@ -1614,7 +1507,6 @@ void compute_avgs_and_dirs_3_comp_rgb(
 void compute_avgs_and_dirs_4_comp(
 	const partition_info& pi,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	partition_metrics pm[BLOCK_MAX_PARTITIONS]);
 
 /**
@@ -1629,7 +1521,6 @@ void compute_avgs_and_dirs_4_comp(
  *
  * @param         pi              The partition info for the current trial.
  * @param         blk             The image block color data to be compressed.
- * @param         ewb             The image block weighted error data.
  * @param[in,out] plines          Processed line inputs, and line length outputs.
  * @param[out]    uncor_error     The cumulative error for using the uncorrelated line.
  * @param[out]    samec_error     The cumulative error for using the same chroma line.
@@ -1637,7 +1528,6 @@ void compute_avgs_and_dirs_4_comp(
 void compute_error_squared_rgb(
 	const partition_info& pi,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	partition_lines3 plines[BLOCK_MAX_PARTITIONS],
 	float& uncor_error,
 	float& samec_error);
@@ -1654,7 +1544,6 @@ void compute_error_squared_rgb(
  *
  * @param      pi              The partition info for the current trial.
  * @param      blk             The image block color data to be compressed.
- * @param      ewb             The image block weighted error data.
  * @param      uncor_plines    Processed uncorrelated partition lines for each partition.
  * @param      samec_plines    Processed same chroma partition lines for each partition.
  * @param[out] uncor_lengths   The length of each components deviation from the line.
@@ -1665,7 +1554,6 @@ void compute_error_squared_rgb(
 void compute_error_squared_rgba(
 	const partition_info& pi,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
 	const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
 	float uncor_lengths[BLOCK_MAX_PARTITIONS],
@@ -1676,73 +1564,62 @@ void compute_error_squared_rgba(
 /**
  * @brief Find the best set of partitions to trial for a given block.
  *
- * On return @c best_partition_uncor contains the best partition  assuming data has uncorrelated
- * chroma, @c best_partition_samec contains the best partition assuming data has corelated chroma.
+ * On return the @c best_partitions list will contain the two best partition
+ * candidates; one assuming data has uncorrelated chroma and one assuming the
+ * data has correlated chroma. The best candidate is returned first in the list.
  *
  * @param      bsd                        The block size information.
  * @param      blk                        The image block color data to compress.
- * @param      ewb                        The image block weighted error data.
  * @param      partition_count            The number of partitions in the block.
  * @param      partition_search_limit     The number of candidate partition encodings to trial.
- * @param[out] best_partition_uncor       The best partition for uncorrelated chroma.
- * @param[out] best_partition_samec       The best partition for correlated chroma.
+ * @param[out] best_partitions            The best partition candidates.
  */
 void find_best_partition_candidates(
 	const block_size_descriptor& bsd,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	unsigned int partition_count,
 	unsigned int partition_search_limit,
-	unsigned int& best_partition_uncor,
-	unsigned int& best_partition_samec);
+	unsigned int best_partitions[2]);
 
 /* ============================================================================
   Functionality for managing images and image related data.
 ============================================================================ */
 
 /**
- * @brief Setup computation of regional averages and variances in an image.
+ * @brief Setup computation of regional averages in an image.
  *
  * This must be done by only a single thread per image, before any thread calls
- * @c compute_averages_and_variances().
+ * @c compute_averages().
  *
- * Results are written back into @c img->input_averages, @c img->input_variances,
- * and @c img->input_alpha_averages.
+ * Results are written back into @c img->input_alpha_averages.
  *
  * @param      img                     The input image data, also holds output data.
- * @param      rgb_power               The RGB component power.
- * @param      alpha_power             The A component power.
- * @param      avg_var_kernel_radius   The kernel radius (in pixels) for avg and var.
  * @param      alpha_kernel_radius     The kernel radius (in pixels) for alpha mods.
  * @param      swz                     Input data component swizzle.
  * @param[out] ag                      The average variance arguments to init.
  *
  * @return The number of tasks in the processing stage.
  */
-unsigned int init_compute_averages_and_variances(
+unsigned int init_compute_averages(
 	const astcenc_image& img,
-	float rgb_power,
-	float alpha_power,
-	unsigned int avg_var_kernel_radius,
 	unsigned int alpha_kernel_radius,
 	const astcenc_swizzle& swz,
-	avg_var_args& ag);
+	avg_args& ag);
 
 /**
- * @brief Compute regional averages and variances.
+ * @brief Compute regional averages in an image.
  *
- * This function can be called by multiple threads, but only after a single thread calls the setup
- * function @c init_compute_averages_and_variances().
+ * This function can be called by multiple threads, but only after a single
+ * thread calls the setup function @c init_compute_averages().
  *
- * Results are written back into @c img->input_averages, @c img->input_variances,
- * and @c img->input_alpha_averages.
+ * Results are written back into @c img->input_alpha_averages.
  *
  * @param[out] ctx   The context.
  * @param      ag    The average and variance arguments created during setup.
  */
-void compute_averages_and_variances(
+void compute_averages(
 	astcenc_context& ctx,
-	const avg_var_args& ag);
+	const avg_args& ag);
 
 /**
  * @brief Fetch a single image block from the input image
@@ -1799,14 +1676,12 @@ void write_image_block(
  *
  * @param      bsd   The block size information.
  * @param      blk   The image block color data to compress.
- * @param      ewb   The image block weighted error data.
  * @param      pi    The partition info for the current trial.
  * @param[out] ei    The endpoint and weight values.
  */
 void compute_ideal_colors_and_weights_1plane(
 	const block_size_descriptor& bsd,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	const partition_info& pi,
 	endpoints_and_weights& ei);
 
@@ -1819,7 +1694,6 @@ void compute_ideal_colors_and_weights_1plane(
  *
  * @param      bsd                The block size information.
  * @param      blk                The image block color data to compress.
- * @param      ewb                The image block weighted error data.
  * @param      plane2_component   The component assigned to plane 2.
  * @param[out] ei1                The endpoint and weight values for plane 1.
  * @param[out] ei2                The endpoint and weight values for plane 2.
@@ -1827,7 +1701,6 @@ void compute_ideal_colors_and_weights_1plane(
 void compute_ideal_colors_and_weights_2planes(
 	const block_size_descriptor& bsd,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	unsigned int plane2_component,
 	endpoints_and_weights& ei1,
 	endpoints_and_weights& ei2);
@@ -2054,7 +1927,6 @@ void unpack_weights(
  * @param      bsd                           The block size information.
  * @param      pi                            The partition info for the current trial.
  * @param      blk                           The image block color data to compress.
- * @param      ewb                           The image block weighted error data.
  * @param      ep                            The ideal endpoints.
  * @param      qwt_bitcounts                 Bit counts for different quantization methods.
  * @param      qwt_errors                    Errors for different quantization methods.
@@ -2070,7 +1942,6 @@ unsigned int compute_ideal_endpoint_formats(
 	const block_size_descriptor& bsd,
 	const partition_info& pi,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	const endpoints& ep,
 	const int* qwt_bitcounts,
 	const float* qwt_errors,
@@ -2087,7 +1958,6 @@ unsigned int compute_ideal_endpoint_formats(
  * recompute the ideal colors for a specific weight set.
  *
  * @param         blk                        The image block color data to compress.
- * @param         ewb                        The image block weighted error data.
  * @param         pi                         The partition info for the current trial.
  * @param         di                         The weight grid decimation table.
  * @param         weight_quant_mode          The weight grid quantization level.
@@ -2098,7 +1968,6 @@ unsigned int compute_ideal_endpoint_formats(
  */
 void recompute_ideal_colors_1plane(
 	const image_block& blk,
-	const error_weight_block& ewb,
 	const partition_info& pi,
 	const decimation_info& di,
 	int weight_quant_mode,
@@ -2114,7 +1983,6 @@ void recompute_ideal_colors_1plane(
  * recompute the ideal colors for a specific weight set.
  *
  * @param         blk                               The image block color data to compress.
- * @param         ewb                               The image block weighted error data.
  * @param         bsd                               The block_size descriptor.
  * @param         di                                The weight grid decimation table.
  * @param         weight_quant_mode                 The weight grid quantization level.
@@ -2127,7 +1995,6 @@ void recompute_ideal_colors_1plane(
  */
 void recompute_ideal_colors_2planes(
 	const image_block& blk,
-	const error_weight_block& ewb,
 	const block_size_descriptor& bsd,
 	const decimation_info& di,
 	int weight_quant_mode,
@@ -2138,19 +2005,6 @@ void recompute_ideal_colors_2planes(
 	vfloat4& rgbo_vector,
 	int plane2_component);
 
-/**
- * @brief Expand the deblock weights based on the config deblocking parameter.
- *
- * The approach to deblocking is a general purpose approach which elevates the error weight
- * significance of texels closest to the block periphery. This function computes the deblock weights
- * for each texel, which can be mixed on a block-by-block basis with the other error weighting
- * parameters to compute a specific per-texel weight for a trial.
- *
- * @param[in,out] ctx   The context to expand.
- */
-void expand_deblock_weights(
-	astcenc_context& ctx);
-
 /**
  * @brief Expand the angular tables needed for the alternative to PCA that we use.
  */
@@ -2206,14 +2060,12 @@ void compute_angular_endpoints_2planes(
  * @brief Compress an image block into a physical block.
  *
  * @param      ctx      The compressor context and configuration.
- * @param      image    The input image information.
  * @param      blk      The image block color data to compress.
  * @param[out] pcb      The physical compressed block output.
  * @param[out] tmpbuf   Preallocated scratch buffers for the compressor.
  */
 void compress_block(
 	const astcenc_context& ctx,
-	const astcenc_image& image,
 	const image_block& blk,
 	physical_compressed_block& pcb,
 	compression_working_buffers& tmpbuf);
@@ -2246,7 +2098,6 @@ void decompress_symbolic_block(
  * @param bsd      The block size information.
  * @param scb      The symbolic compressed encoding.
  * @param blk      The original image block color data.
- * @param ewb      The error weight block data.
  *
  * @return Returns the computed error, or a negative value if the encoding
  *         should be rejected for any reason.
@@ -2255,8 +2106,7 @@ float compute_symbolic_block_difference(
 	const astcenc_config& config,
 	const block_size_descriptor& bsd,
 	const symbolic_compressed_block& scb,
-	const image_block& blk,
-	const error_weight_block& ewb) ;
+	const image_block& blk);
 
 /**
  * @brief Convert a symbolic representation into a binary physical encoding.
diff --git a/lib/astc-encoder/Source/astcenc_mathlib.h b/lib/astc-encoder/Source/astcenc_mathlib.h
index 0dc17b42d0..4876749bfe 100644
--- a/lib/astc-encoder/Source/astcenc_mathlib.h
+++ b/lib/astc-encoder/Source/astcenc_mathlib.h
@@ -458,21 +458,18 @@ struct processed_line2
 {
 	vfloat4 amod;
 	vfloat4 bs;
-	vfloat4 bis;
 };
 
 struct processed_line3
 {
 	vfloat4 amod;
 	vfloat4 bs;
-	vfloat4 bis;
 };
 
 struct processed_line4
 {
 	vfloat4 amod;
 	vfloat4 bs;
-	vfloat4 bis;
 };
 
 #endif
diff --git a/lib/astc-encoder/Source/astcenc_partition_tables.cpp b/lib/astc-encoder/Source/astcenc_partition_tables.cpp
index fd840add16..52d76cfaf2 100644
--- a/lib/astc-encoder/Source/astcenc_partition_tables.cpp
+++ b/lib/astc-encoder/Source/astcenc_partition_tables.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2022 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -58,7 +58,7 @@ static void generate_canonical_partitioning(
 	{
 		int index = partition_of_texel[i];
 
-		if (mapped_index[index] == -1)
+		if (mapped_index[index] < 0)
 		{
 			mapped_index[index] = map_weight_count++;
 		}
diff --git a/lib/astc-encoder/Source/astcenc_pick_best_endpoint_format.cpp b/lib/astc-encoder/Source/astcenc_pick_best_endpoint_format.cpp
index cbeb285535..140edb1029 100644
--- a/lib/astc-encoder/Source/astcenc_pick_best_endpoint_format.cpp
+++ b/lib/astc-encoder/Source/astcenc_pick_best_endpoint_format.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2022 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -47,43 +47,6 @@
 
 #include <assert.h>
 
-/**
- * @brief Compute cumulative error weight of each partition.
- *
- * The cumulative error weight is used to determine the relative importance of each partiton when
- * deciding how to quantize colors, as not all partitions are equal. For example, some partitions
- * will have far fewer texels than others in the same block.
- *
- * @param      ewb             The block error weights.
- * @param      pi              The partiion info.
- * @param[out] error_weights   The output per-partition error_weight sum.
- */
-static void compute_partition_error_color_weightings(
-	const error_weight_block& ewb,
-	const partition_info& pi,
-	vfloat4 error_weights[BLOCK_MAX_PARTITIONS]
-) {
-	// TODO: Candidate for 4-group counting
-	int partition_count = pi.partition_count;
-	promise(partition_count > 0);
-
-	for (int i = 0; i < partition_count; i++)
-	{
-		vfloat4 error_weight(1e-12f);
-
-		int texel_count = pi.partition_texel_count[i];
-		promise(texel_count > 0);
-
-		for (int j = 0; j < texel_count; j++)
-		{
-			int tidx = pi.texels_of_partition[i][j];
-			error_weight += ewb.error_weights[tidx];
-		}
-
-		error_weights[i] = error_weight / pi.partition_texel_count[i];
-	}
-}
-
 /**
  * @brief Compute the errors of the endpoint line options for one partition.
  *
@@ -96,7 +59,6 @@ static void compute_partition_error_color_weightings(
  * @param      pi                The partition info data.
  * @param      partition_index   The partition index to compule the error for.
  * @param      blk               The image block.
- * @param      ewb               The error weight block.
  * @param      uncor_pline       The endpoint line assuming uncorrelated endpoints.
  * @param[out] uncor_err         The computed error for the uncorrelated endpoint line.
  * @param      samec_pline       The endpoint line assuming the same chroma for both endpoints.
@@ -111,7 +73,6 @@ static void compute_error_squared_rgb_single_partition(
 	const partition_info& pi,
 	int partition_index,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	const processed_line3& uncor_pline,
 	float& uncor_err,
 	const processed_line3& samec_pline,
@@ -134,14 +95,14 @@ static void compute_error_squared_rgb_single_partition(
 	for (int i = 0; i < texels_in_partition; i++)
 	{
 		int tix = pi.texels_of_partition[partition_index][i];
-		float texel_weight = ewb.texel_weight_rgb[tix];
+		float texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()) / 3.0f;
 		if (texel_weight < 1e-20f)
 		{
 			continue;
 		}
 
 		vfloat4 point = blk.texel(tix);
-		vfloat4 ews = ewb.error_weights[tix];
+		vfloat4 ews = blk.channel_weight;
 
 		// Compute the error that arises from just ditching alpha
 		float default_alpha = blk.get_default_alpha();
@@ -149,24 +110,24 @@ static void compute_error_squared_rgb_single_partition(
 		a_drop_err += omalpha * omalpha * ews.lane<3>();
 
 		float param1 = dot3_s(point, uncor_pline.bs);
-		vfloat4 rp1 = uncor_pline.amod + param1 * uncor_pline.bis;
+		vfloat4 rp1 = uncor_pline.amod + param1 * uncor_pline.bs;
 		vfloat4 dist1 = rp1 - point;
 		uncor_err += dot3_s(ews, dist1 * dist1);
 
 		float param2 = dot3_s(point, samec_pline.bs);
 		// No samec amod - we know it's always zero
-		vfloat4 rp2 = /* samec_pline.amod + */ param2 * samec_pline.bis;
+		vfloat4 rp2 = /* samec_pline.amod + */ param2 * samec_pline.bs;
 		vfloat4 dist2 = rp2 - point;
 		samec_err += dot3_s(ews, dist2 * dist2);
 
 		float param3 = dot3_s(point,  rgbl_pline.bs);
-		vfloat4 rp3 = rgbl_pline.amod + param3 * rgbl_pline.bis;
+		vfloat4 rp3 = rgbl_pline.amod + param3 * rgbl_pline.bs;
 		vfloat4 dist3 = rp3 - point;
 		rgbl_err += dot3_s(ews, dist3 * dist3);
 
 		float param4 = dot3_s(point, l_pline.bs);
 		// No luma amod - we know it's always zero
-		vfloat4 rp4 = /* l_pline.amod + */ param4 * l_pline.bis;
+		vfloat4 rp4 = /* l_pline.amod + */ param4 * l_pline.bs;
 		vfloat4 dist4 = rp4 - point;
 		l_err += dot3_s(ews, dist4 * dist4);
 	}
@@ -182,7 +143,6 @@ static void compute_error_squared_rgb_single_partition(
  * @param      bsd   The block size information.
  * @param      blk   The image block.
  * @param      pi    The partition info data.
- * @param      ewb   The error weight block.
  * @param      ep    The idealized endpoints.
  * @param[out] eci   The resulting encoding choice error metrics.
   */
@@ -190,7 +150,6 @@ static void compute_encoding_choice_errors(
 	const block_size_descriptor& bsd,
 	const image_block& blk,
 	const partition_info& pi,
-	const error_weight_block& ewb,
 	const endpoints& ep,
 	encoding_choice_errors eci[BLOCK_MAX_PARTITIONS])
 {
@@ -202,19 +161,19 @@ static void compute_encoding_choice_errors(
 
 	partition_metrics pms[BLOCK_MAX_PARTITIONS];
 
-	compute_avgs_and_dirs_3_comp_rgb(pi, blk, ewb, pms);
+	compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
 
 	for (int i = 0; i < partition_count; i++)
 	{
 		partition_metrics& pm = pms[i];
 
 		line3 uncor_rgb_lines;
-		line3 samec_rgb_lines;	// for LDR-RGB-scale
-		line3 rgb_luma_lines;	// for HDR-RGB-scale
+		line3 samec_rgb_lines;  // for LDR-RGB-scale
+		line3 rgb_luma_lines;   // for HDR-RGB-scale
 
 		processed_line3 uncor_rgb_plines;
-		processed_line3 samec_rgb_plines;	// for LDR-RGB-scale
-		processed_line3 rgb_luma_plines;	// for HDR-RGB-scale
+		processed_line3 samec_rgb_plines;
+		processed_line3 rgb_luma_plines;
 		processed_line3 luminance_plines;
 
 		float uncorr_rgb_error;
@@ -223,41 +182,31 @@ static void compute_encoding_choice_errors(
 		float luminance_rgb_error;
 		float alpha_drop_error;
 
-		vfloat4 csf = pm.color_scale;
-		vfloat4 csfn = normalize(csf);
-
-		vfloat4 icsf = pm.icolor_scale;
-		icsf.set_lane<3>(0.0f);
-
 		uncor_rgb_lines.a = pm.avg;
-		uncor_rgb_lines.b = normalize_safe(pm.dir, csfn);
+		uncor_rgb_lines.b = normalize_safe(pm.dir, unit3());
 
 		samec_rgb_lines.a = vfloat4::zero();
-		samec_rgb_lines.b = normalize_safe(pm.avg, csfn);
+		samec_rgb_lines.b = normalize_safe(pm.avg, unit3());
 
 		rgb_luma_lines.a = pm.avg;
-		rgb_luma_lines.b = csfn;
+		rgb_luma_lines.b = unit3();
 
-		uncor_rgb_plines.amod = (uncor_rgb_lines.a - uncor_rgb_lines.b * dot3(uncor_rgb_lines.a, uncor_rgb_lines.b)) * icsf;
-		uncor_rgb_plines.bs   = uncor_rgb_lines.b * csf;
-		uncor_rgb_plines.bis  = uncor_rgb_lines.b * icsf;
+		uncor_rgb_plines.amod = uncor_rgb_lines.a - uncor_rgb_lines.b * dot3(uncor_rgb_lines.a, uncor_rgb_lines.b);
+		uncor_rgb_plines.bs   = uncor_rgb_lines.b;
 
 		// Same chroma always goes though zero, so this is simpler than the others
 		samec_rgb_plines.amod = vfloat4::zero();
-		samec_rgb_plines.bs   = samec_rgb_lines.b * csf;
-		samec_rgb_plines.bis  = samec_rgb_lines.b * icsf;
+		samec_rgb_plines.bs   = samec_rgb_lines.b;
 
-		rgb_luma_plines.amod = (rgb_luma_lines.a - rgb_luma_lines.b * dot3(rgb_luma_lines.a, rgb_luma_lines.b)) * icsf;
-		rgb_luma_plines.bs   = rgb_luma_lines.b * csf;
-		rgb_luma_plines.bis  = rgb_luma_lines.b * icsf;
+		rgb_luma_plines.amod = rgb_luma_lines.a - rgb_luma_lines.b * dot3(rgb_luma_lines.a, rgb_luma_lines.b);
+		rgb_luma_plines.bs   = rgb_luma_lines.b;
 
 		// Luminance always goes though zero, so this is simpler than the others
 		luminance_plines.amod = vfloat4::zero();
-		luminance_plines.bs   = csfn * csf;
-		luminance_plines.bis  = csfn * icsf;
+		luminance_plines.bs   = unit3();
 
 		compute_error_squared_rgb_single_partition(
-		    pi, i, blk, ewb,
+		    pi, i, blk,
 		    uncor_rgb_plines, uncorr_rgb_error,
 		    samec_rgb_plines, samechroma_rgb_error,
 		    rgb_luma_plines,  rgb_luma_error,
@@ -284,9 +233,9 @@ static void compute_encoding_choice_errors(
 		bool can_blue_contract = (mask(endpt_can_bc_lo & endpt_can_bc_hi) & 0x7) == 0x7;
 
 		// Store out the settings
-		eci[i].rgb_scale_error = (samechroma_rgb_error - uncorr_rgb_error) * 0.7f;	// empirical
-		eci[i].rgb_luma_error  = (rgb_luma_error - uncorr_rgb_error) * 1.5f;	// wild guess
-		eci[i].luminance_error = (luminance_rgb_error - uncorr_rgb_error) * 3.0f;	// empirical
+		eci[i].rgb_scale_error = (samechroma_rgb_error - uncorr_rgb_error) * 0.7f;  // empirical
+		eci[i].rgb_luma_error  = (rgb_luma_error - uncorr_rgb_error) * 1.5f;        // wild guess
+		eci[i].luminance_error = (luminance_rgb_error - uncorr_rgb_error) * 3.0f;   // empirical
 		eci[i].alpha_drop_error = alpha_drop_error * 3.0f;
 		eci[i].can_offset_encode = can_offset_encode;
 		eci[i].can_blue_contract = can_blue_contract;
@@ -688,7 +637,7 @@ static float one_partition_find_best_combination_for_bitcount(
 		int quant_level = quant_mode_table[integer_count][bits_available];
 
 		// Don't have enough bits to represent a given endpoint format at all!
-		if (quant_level == -1)
+		if (quant_level < 0)
 		{
 			continue;
 		}
@@ -791,7 +740,7 @@ static float two_partitions_find_best_combination_for_bitcount(
 		int quant_level = quant_mode_table[integer_count][bits_available];
 
 		// Don't have enough bits to represent a given endpoint format at all!
-		if (quant_level == -1)
+		if (quant_level < 0)
 		{
 			break;
 		}
@@ -916,7 +865,7 @@ static float three_partitions_find_best_combination_for_bitcount(
 		int quant_level = quant_mode_table[integer_count][bits_available];
 
 		// Don't have enough bits to represent a given endpoint format at all!
-		if (quant_level == -1)
+		if (quant_level < 0)
 		{
 			break;
 		}
@@ -1052,7 +1001,7 @@ static float four_partitions_find_best_combination_for_bitcount(
 		int quant_level = quant_mode_table[integer_count][bits_available];
 
 		// Don't have enough bits to represent a given endpoint format at all!
-		if (quant_level == -1)
+		if (quant_level < 0)
 		{
 			break;
 		}
@@ -1094,7 +1043,6 @@ unsigned int compute_ideal_endpoint_formats(
 	const block_size_descriptor& bsd,
 	const partition_info& pi,
 	const image_block& blk,
-	const error_weight_block& ewb,
 	const endpoints& ep,
 	 // bitcounts and errors computed for the various quantization methods
 	const int* qwt_bitcounts,
@@ -1117,12 +1065,7 @@ unsigned int compute_ideal_endpoint_formats(
 	// Compute the errors that result from various encoding choices (such as using luminance instead
 	// of RGB, discarding Alpha, using RGB-scale in place of two separate RGB endpoints and so on)
 	encoding_choice_errors eci[BLOCK_MAX_PARTITIONS];
-	compute_encoding_choice_errors(bsd, blk, pi, ewb, ep, eci);
-
-	// For each partition, compute the error weights to apply for that partition
-	vfloat4 error_weights[BLOCK_MAX_PARTITIONS];
-
-	compute_partition_error_color_weightings(ewb, pi, error_weights);
+	compute_encoding_choice_errors(bsd, blk, pi, ep, eci);
 
 	float best_error[BLOCK_MAX_PARTITIONS][21][4];
 	int format_of_choice[BLOCK_MAX_PARTITIONS][21][4];
@@ -1130,7 +1073,7 @@ unsigned int compute_ideal_endpoint_formats(
 	{
 		compute_color_error_for_every_integer_count_and_quant_level(
 		    encode_hdr_rgb, encode_hdr_alpha, i,
-		    pi, eci[i], ep, error_weights[i], best_error[i],
+		    pi, eci[i], ep, blk.channel_weight, best_error[i],
 		    format_of_choice[i]);
 	}
 
@@ -1301,7 +1244,7 @@ unsigned int compute_ideal_endpoint_formats(
 			vmask mask = mask1 & mask2;
 			vbest_ep_error = select(vbest_ep_error, err, mask);
 			vbest_error_index = select(vbest_error_index, lane_ids, mask);
-			lane_ids = lane_ids + vint(ASTCENC_SIMD_WIDTH);
+			lane_ids += vint(ASTCENC_SIMD_WIDTH);
 		}
 
 		// Pick best mode from the SIMD result, using lowest matching index to ensure invariance
diff --git a/lib/astc-encoder/Source/astcenc_vecmathlib.h b/lib/astc-encoder/Source/astcenc_vecmathlib.h
index ab86dc50c4..069c03c94f 100644
--- a/lib/astc-encoder/Source/astcenc_vecmathlib.h
+++ b/lib/astc-encoder/Source/astcenc_vecmathlib.h
@@ -243,7 +243,8 @@ static ASTCENC_SIMD_INLINE vfloat4 unit4()
  */
 static ASTCENC_SIMD_INLINE vfloat4 unit3()
 {
-	return vfloat4(0.57735f, 0.57735f, 0.57735f, 0.0f);
+	float val = 0.577350258827209473f;
+	return vfloat4(val, val, val, 0.0f);
 }
 
 /**
@@ -251,7 +252,8 @@ static ASTCENC_SIMD_INLINE vfloat4 unit3()
  */
 static ASTCENC_SIMD_INLINE vfloat4 unit2()
 {
-	return vfloat4(0.70711f, 0.70711f, 0.0f, 0.0f);
+	float val = 0.707106769084930420f;
+	return vfloat4(val, val, 0.0f, 0.0f);
 }
 
 /**
diff --git a/lib/astc-encoder/Source/astcenc_weight_align.cpp b/lib/astc-encoder/Source/astcenc_weight_align.cpp
index 4c1e04e4a0..e29ff8861e 100644
--- a/lib/astc-encoder/Source/astcenc_weight_align.cpp
+++ b/lib/astc-encoder/Source/astcenc_weight_align.cpp
@@ -65,7 +65,7 @@ static const unsigned int quantization_steps_for_level[13] {
 alignas(ASTCENC_VECALIGN) static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
 alignas(ASTCENC_VECALIGN) static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
 
-#if !defined(NDEBUG)
+#if defined(ASTCENC_DIAGNOSTICS)
 	static bool print_once { true };
 #endif
 
@@ -329,7 +329,7 @@ static void compute_angular_endpoints_for_quant_levels(
 		int bsi = (int)best_results[q].lane<1>();
 
 		// Did we find anything?
-#if !defined(NDEBUG)
+#if defined(ASTCENC_DIAGNOSTICS)
 		if ((bsi < 0) && print_once)
 		{
 			print_once = false;
@@ -493,7 +493,7 @@ static void compute_angular_endpoints_for_quant_levels_lwc(
 		int bsi = best_index[q];
 
 		// Did we find anything?
-#if !defined(NDEBUG)
+#if defined(ASTCENC_DIAGNOSTICS)
 		if ((bsi < 0) && print_once)
 		{
 			print_once = false;
diff --git a/lib/astc-encoder/Source/astcenccli_error_metrics.cpp b/lib/astc-encoder/Source/astcenccli_error_metrics.cpp
index 4e28f95e13..9023d97f91 100644
--- a/lib/astc-encoder/Source/astcenccli_error_metrics.cpp
+++ b/lib/astc-encoder/Source/astcenccli_error_metrics.cpp
@@ -129,7 +129,7 @@ void compute_error_metrics(
 	kahan_accum4 log_errorsum;
 	kahan_accum4 mpsnr_errorsum;
 	double mean_angular_errorsum = 0.0;
-	float worst_angular_errorsum = 0.0;
+	double worst_angular_errorsum = 0.0;
 
 	unsigned int dim_x = astc::min(img1->dim_x, img2->dim_x);
 	unsigned int dim_y = astc::min(img1->dim_y, img2->dim_y);
@@ -282,9 +282,9 @@ void compute_error_metrics(
 					// Float error can push this outside of valid range for acos, so clamp to avoid NaN issues
 					float normal_cos = clamp(-1.0f, 1.0f, dot3(normal1, normal2)).lane<0>();
 					float rad_to_degrees = 180.0f / astc::PI;
-					float error_degrees = std::acos(static_cast<double>(normal_cos)) * static_cast<double>(rad_to_degrees);
+					double error_degrees = std::acos(static_cast<double>(normal_cos)) * static_cast<double>(rad_to_degrees);
 
-					mean_angular_errorsum += static_cast<double>(error_degrees) / (dim_x * dim_y * dim_z);
+					mean_angular_errorsum += error_degrees / (dim_x * dim_y * dim_z);
 					worst_angular_errorsum = astc::max(worst_angular_errorsum, error_degrees);
 				}
 			}
@@ -396,7 +396,7 @@ void compute_error_metrics(
 	if (compute_normal_metrics)
 	{
 		printf("    Mean Angular Error:       %9.4f degrees\n", mean_angular_errorsum);
-		printf("    Worst Angular Error:      %9.4f degrees\n", (double)worst_angular_errorsum);
+		printf("    Worst Angular Error:      %9.4f degrees\n", worst_angular_errorsum);
 	}
 
 	printf("\n");
diff --git a/lib/astc-encoder/Source/astcenccli_image_load_store.cpp b/lib/astc-encoder/Source/astcenccli_image_load_store.cpp
index 1adb82ee23..d6afd0a923 100644
--- a/lib/astc-encoder/Source/astcenccli_image_load_store.cpp
+++ b/lib/astc-encoder/Source/astcenccli_image_load_store.cpp
@@ -778,17 +778,17 @@ static unsigned int get_format(
 struct ktx_header
 {
 	uint8_t magic[12];
-	uint32_t endianness;		// should be 0x04030201; if it is instead 0x01020304, then the endianness of everything must be switched.
-	uint32_t gl_type;			// 0 for compressed textures, otherwise value from table 3.2 (page 162) of OpenGL 4.0 spec
-	uint32_t gl_type_size;		// size of data elements to do endianness swap on (1=endian-neutral data)
-	uint32_t gl_format;			// 0 for compressed textures, otherwise value from table 3.3 (page 163) of OpenGLl spec
-	uint32_t gl_internal_format;	// sized-internal-format, corresponding to table 3.12 to 3.14 (pages 182-185) of OpenGL spec
+	uint32_t endianness;				// should be 0x04030201; if it is instead 0x01020304, then the endianness of everything must be switched.
+	uint32_t gl_type;					// 0 for compressed textures, otherwise value from table 3.2 (page 162) of OpenGL 4.0 spec
+	uint32_t gl_type_size;				// size of data elements to do endianness swap on (1=endian-neutral data)
+	uint32_t gl_format;					// 0 for compressed textures, otherwise value from table 3.3 (page 163) of OpenGL spec
+	uint32_t gl_internal_format;		// sized-internal-format, corresponding to table 3.12 to 3.14 (pages 182-185) of OpenGL spec
 	uint32_t gl_base_internal_format;	// unsized-internal-format: corresponding to table 3.11 (page 179) of OpenGL spec
-	uint32_t pixel_width;		// texture dimensions; not rounded up to block size for compressed.
-	uint32_t pixel_height;		// must be 0 for 1D textures.
-	uint32_t pixel_depth;		// must be 0 for 1D, 2D and cubemap textures.
+	uint32_t pixel_width;				// texture dimensions; not rounded up to block size for compressed.
+	uint32_t pixel_height;				// must be 0 for 1D textures.
+	uint32_t pixel_depth;				// must be 0 for 1D, 2D and cubemap textures.
 	uint32_t number_of_array_elements;	// 0 if not a texture array
-	uint32_t number_of_faces;	// 6 for cubemaps, 1 for non-cubemaps
+	uint32_t number_of_faces;			// 6 for cubemaps, 1 for non-cubemaps
 	uint32_t number_of_mipmap_levels;	// 0 or 1 for non-mipmapped textures; 0 indicates that auto-mipmap-gen should be done at load time.
 	uint32_t bytes_of_key_value_data;	// size in bytes of the key-and-value area immediately following the header.
 };
diff --git a/lib/astc-encoder/Source/astcenccli_toplevel.cpp b/lib/astc-encoder/Source/astcenccli_toplevel.cpp
index 273c421a9e..3f6a14dace 100644
--- a/lib/astc-encoder/Source/astcenccli_toplevel.cpp
+++ b/lib/astc-encoder/Source/astcenccli_toplevel.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2022 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -655,37 +655,6 @@ static int edit_astcenc_config(
 			argidx++;
 			cli_config.silentmode = 1;
 		}
-		else
-			if (!strcmp(argv[argidx], "-v"))
-		{
-			argidx += 7;
-			if (argidx > argc)
-			{
-				printf("ERROR: -v switch with less than 6 arguments\n");
-				return 1;
-			}
-
-			config.v_rgba_radius = atoi(argv[argidx - 6]);
-			config.v_rgb_power = static_cast<float>(atof(argv[argidx - 5]));
-			config.v_rgb_base = static_cast<float>(atof(argv[argidx - 4]));
-			config.v_rgb_mean = static_cast<float>(atof(argv[argidx - 3]));
-			config.v_rgb_stdev = static_cast<float>(atof(argv[argidx - 2]));
-			config.v_rgba_mean_stdev_mix = static_cast<float>(atof(argv[argidx - 1]));
-		}
-		else if (!strcmp(argv[argidx], "-va"))
-		{
-			argidx += 5;
-			if (argidx > argc)
-			{
-				printf("ERROR: -va switch with less than 4 arguments\n");
-				return 1;
-			}
-
-			config.v_a_power= static_cast<float>(atof(argv[argidx - 4]));
-			config.v_a_base = static_cast<float>(atof(argv[argidx - 3]));
-			config.v_a_mean = static_cast<float>(atof(argv[argidx - 2]));
-			config.v_a_stdev = static_cast<float>(atof(argv[argidx - 1]));
-		}
 		else if (!strcmp(argv[argidx], "-cw"))
 		{
 			argidx += 5;
@@ -711,17 +680,6 @@ static int edit_astcenc_config(
 
 			config.a_scale_radius = atoi(argv[argidx - 1]);
 		}
-		else if (!strcmp(argv[argidx], "-b"))
-		{
-			argidx += 2;
-			if (argidx > argc)
-			{
-				printf("ERROR: -b switch with no argument\n");
-				return 1;
-			}
-
-			config.b_deblock_weight = static_cast<float>(atof(argv[argidx - 1]));
-		}
 		else if (!strcmp(argv[argidx], "-esw"))
 		{
 			argidx += 2;
@@ -1125,17 +1083,6 @@ static void print_astcenc_config(
 		}
 
 		printf("    Bitrate:                    %3.2f bpp\n", 128.0 / (config.block_x * config.block_y * config.block_z));
-
-		printf("    Radius mean/stdev:          %u texels\n", config.v_rgba_radius);
-		printf("    RGB power:                  %g\n", (double)config.v_rgb_power );
-		printf("    RGB base weight:            %g\n", (double)config.v_rgb_base);
-		printf("    RGB mean weight:            %g\n", (double)config.v_rgb_mean);
-		printf("    RGB stdev weight:           %g\n", (double)config.v_rgb_stdev);
-		printf("    RGB mean/stdev mixing:      %g\n", (double)config.v_rgba_mean_stdev_mix);
-		printf("    Alpha power:                %g\n", (double)config.v_a_power);
-		printf("    Alpha base weight:          %g\n", (double)config.v_a_base);
-		printf("    Alpha mean weight:          %g\n", (double)config.v_a_mean);
-		printf("    Alpha stdev weight:         %g\n", (double)config.v_a_stdev);
 		printf("    RGB alpha scale weight:     %d\n", (config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT));
 		if ((config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT))
 		{
@@ -1146,7 +1093,6 @@ static void print_astcenc_config(
 		printf("    G component weight:         %g\n",(double)config.cw_g_weight);
 		printf("    B component weight:         %g\n",(double)config.cw_b_weight);
 		printf("    A component weight:         %g\n",(double)config.cw_a_weight);
-		printf("    Deblock artifact setting:   %g\n", (double)config.b_deblock_weight);
 		printf("    Partition cutoff:           %u partitions\n", config.tune_partition_count_limit);
 		printf("    Partition index cutoff:     %u partition ids\n", config.tune_partition_index_limit);
 		printf("    PSNR cutoff:                %g dB\n", (double)config.tune_db_limit);
@@ -1154,7 +1100,8 @@ static void print_astcenc_config(
 		printf("    3.2+ partition cutoff:      %g\n", (double)config.tune_3_partition_early_out_limit_factor);
 		printf("    2 plane correlation cutoff: %g\n", (double)config.tune_2_plane_early_out_limit_correlation);
 		printf("    Block mode centile cutoff:  %g%%\n", (double)(config.tune_block_mode_limit));
-		printf("    Max refinement cutoff:      %u iterations\n", config.tune_refinement_limit);
+		printf("    Candidate cutoff:           %u candidates\n", config.tune_candidate_limit);
+		printf("    Refinement cutoff:          %u iterations\n", config.tune_refinement_limit);
 		printf("    Compressor thread count:    %d\n", cli_config.thread_count);
 		printf("\n");
 	}
@@ -1512,7 +1459,7 @@ int main(
 	if (operation & ASTCENC_STAGE_ST_NCOMP)
 	{
 		int bitness = get_output_filename_enforced_bitness(output_filename.c_str());
-		if (bitness == -1)
+		if (bitness < 0)
 		{
 			return 1;
 		}
diff --git a/lib/astc-encoder/Source/astcenccli_toplevel_help.cpp b/lib/astc-encoder/Source/astcenccli_toplevel_help.cpp
index a32e741a23..e9da90ea9a 100644
--- a/lib/astc-encoder/Source/astcenccli_toplevel_help.cpp
+++ b/lib/astc-encoder/Source/astcenccli_toplevel_help.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2021 Arm Limited
+// Copyright 2011-2022 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -161,14 +161,13 @@ COMPRESSION
        -mask
            The input texture is a mask texture with unrelated data stored
            in the various color components, so enable error heuristics that
-           aim to improve perceptual quality by minimizing the effect of
-           error cross-talk across the color components.
+           aim to improve quality by minimizing the effect of error
+           cross-talk across the color components.
 
        -normal
            The input texture is a three component linear LDR normal map
            storing unit length normals as (R=X, G=Y, B=Z). The output will
-           be a two component X+Y normal map stored as (RGB=X, A=Y),
-           optimized for angular error instead of simple PSNR. The Z
+           be a two component X+Y normal map stored as (RGB=X, A=Y). The Z
            component can be recovered programmatically in shader code by
            using the equation:
 
@@ -218,17 +217,8 @@ R"(
 COMPRESSION TIPS & TRICKS
        ASTC is a block-based format that can be prone to block artifacts.
        If block artifacts are a problem when compressing a given texture,
-       adding some or all of following command-line options may help:
-
-           -b 1.8
-           -v 2 1 1 0 25 0.1
-           -va 1 1 0 25
-           -dblimit 60
-
-       The -b option is a general-purpose block-artifact reduction option.
-       The -v and -va option settings will concentrate effort where smooth
-       regions lie next to regions with high detail, which are particularly
-       prone to block artifacts.
+       increasing the compressor quality preset can help to alleviate the
+       problem.
 
        If a texture exhibits severe block artifacts in only some of the
        color components, which is a common problem for mask textures, then
@@ -243,34 +233,6 @@ ADVANCED COMPRESSION
        These options provide low-level control of the codec error metric
        computation, used to determine what good compression looks like.
 
-       -v <radius> <power> <base> <mean> <stdev> <mix>
-           Compute the per-texel relative error weighting for the RGB color
-           components as follows:
-
-           weight = 1 / (<base> + <mean> * mean^2 + <stdev> * stdev^2)
-
-           The <radius> argument specifies the texel radius of the
-           neighborhood over which the average and standard deviation are
-           computed.
-
-           The <mix> parameter is used to control the degree of mixing of
-           the average and stddev error values across the color components.
-           Setting this parameter to 0 causes the computation to be done
-           separately for each color component; setting it to 1 causes the
-           results from the RGB components to be combined and applied to
-           all three components. Intermediate values between these two
-           settings do a linear mix of the two.
-
-           The <power> argument is a power used to raise the values of the
-           input texels before computing average and standard deviation;
-           e.g. a power of 0.5 causes the codec to take the square root
-           of every input texel value.
-
-       -va <power> <base> <mean> <stdev>
-           Compute the per-texel relative error weighting for the alpha
-           component, when used in conjunction with -v. See documentation
-           of -v for individual parameter documentation.
-
        -a <radius>
            For textures with alpha component, scale per-texel weights by
            the alpha value. The alpha value chosen for scaling of any
@@ -290,12 +252,6 @@ ADVANCED COMPRESSION
            significance, and values below 1 to decrease it. Set to 0 to
            exclude a component from error computation.
 
-       -b <weight>
-           Assign an additional weight scaling for texels at compression
-           block edges and corners. Setting this to a value above 1
-           increases the significance of texels closer to the edges of a
-           block, and can help to reduce block artifacts.
-
        -mpsnr <low> <high>
            Set the low and high f-stop values for the mPSNR error metric.
            The mPSNR error metric only applies to HDR textures.
diff --git a/lib/astc-encoder/Source/cmake_core.cmake b/lib/astc-encoder/Source/cmake_core.cmake
index e3f9c5088b..8431fd8c84 100644
--- a/lib/astc-encoder/Source/cmake_core.cmake
+++ b/lib/astc-encoder/Source/cmake_core.cmake
@@ -118,6 +118,7 @@ macro(astcenc_set_properties NAME)
             $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-c++98-c++11-compat-pedantic>
             $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-float-equal>
             $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations>
+            $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-atomic-implicit-seq-cst>
 
             # Clang 10 also throws up warnings we need to investigate (ours)
             $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-old-style-cast>
@@ -127,8 +128,7 @@ macro(astcenc_set_properties NAME)
             $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-shift-sign-overflow>
             $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-format-nonliteral>
 
-            $<$<CXX_COMPILER_ID:Clang>:-Wdocumentation>)
-
+           $<$<CXX_COMPILER_ID:Clang>:-Wdocumentation>)
 
     target_link_options(${NAME}
         PRIVATE
diff --git a/lib/astc-encoder/Utils/Example/CMakeLists.txt b/lib/astc-encoder/Utils/Example/CMakeLists.txt
index eec6ffcfaf..dbc104770d 100644
--- a/lib/astc-encoder/Utils/Example/CMakeLists.txt
+++ b/lib/astc-encoder/Utils/Example/CMakeLists.txt
@@ -35,7 +35,7 @@ project(astcencoder_example VERSION 1.0.0)
 ExternalProject_Add(astcencoder
     GIT_REPOSITORY https://github.com/ARM-software/astc-encoder
     GIT_TAG main
-    CMAKE_CACHE_ARGS -DCLI:String=OFF
+    CMAKE_CACHE_ARGS -DCLI:STRING=OFF -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
     INSTALL_COMMAND "")
 
 ExternalProject_Get_property(astcencoder
diff --git a/lib/astc-encoder/Utils/Example/astc_api_example.cpp b/lib/astc-encoder/Utils/Example/astc_api_example.cpp
index b60fa8d8da..7e95f7c86b 100644
--- a/lib/astc-encoder/Utils/Example/astc_api_example.cpp
+++ b/lib/astc-encoder/Utils/Example/astc_api_example.cpp
@@ -68,7 +68,7 @@ int main(int argc, char **argv)
 	uint8_t *image_data = (uint8_t*)stbi_load(argv[1], &image_x, &image_y, &image_c, 4);
 	if (!image_data)
 	{
-		printf("Failed to load image \"%s\"\n", image_data);
+		printf("Failed to load image \"%s\"\n", argv[1]);
 		return 1;
 	}