|  | 
| 95 | 95 | #include "filters/mbfilter_utf8.h" | 
| 96 | 96 | 
 | 
| 97 | 97 | #include "eaw_table.h" | 
|  | 98 | +#include "rare_cp_bitvec.h" | 
| 98 | 99 | 
 | 
| 99 | 100 | /* hex character table "0123456789ABCDEF" */ | 
| 100 | 101 | static char mbfl_hexchar_table[] = { | 
| @@ -236,26 +237,52 @@ size_t mbfl_buffer_illegalchars(mbfl_buffer_converter *convd) | 
| 236 | 237 | /* | 
| 237 | 238 |  * encoding detector | 
| 238 | 239 |  */ | 
| 239 |  | -static int mbfl_estimate_encoding_likelihood(int c, void *void_data) | 
|  | 240 | +static int mbfl_estimate_encoding_likelihood(int input_cp, void *void_data) | 
| 240 | 241 | { | 
| 241 | 242 | 	mbfl_encoding_detector_data *data = void_data; | 
| 242 |  | - | 
| 243 |  | -	/* Receive wchars decoded from test string using candidate encoding | 
| 244 |  | -	 * If the test string was invalid in the candidate encoding, we assume | 
| 245 |  | -	 * it's the wrong one. */ | 
|  | 243 | +	unsigned int c = input_cp; | 
|  | 244 | + | 
|  | 245 | +	/* Receive wchars decoded from input string using candidate encoding. | 
|  | 246 | +	 * If the string was invalid in the candidate encoding, we assume | 
|  | 247 | +	 * it's the wrong one. Otherwise, give the candidate 40 'demerits' for | 
|  | 248 | +	 * each codepoint above 0xFFFF, 30 for each 'rare' codepoint, 6 for each | 
|  | 249 | +	 * ASCII punctuation character in 0x21-0x2F, and 1 for all other codepoints. | 
|  | 250 | +	 * | 
|  | 251 | +	 * The 'common' codepoints should cover the vast majority of | 
|  | 252 | +	 * codepoints we are likely to see in practice, while only covering | 
|  | 253 | +	 * a small minority of the entire Unicode encoding space. Why? | 
|  | 254 | +	 * Well, if the input string happens to be valid in an incorrect | 
|  | 255 | +	 * candidate encoding, the bogus codepoints which it decodes to will | 
|  | 256 | +	 * be more or less random. By treating the majority of codepoints as | 
|  | 257 | +	 * 'rare', we ensure that in almost all such cases, the bogus | 
|  | 258 | +	 * codepoints will include plenty of 'rares', thus giving the | 
|  | 259 | +	 * incorrect candidate encoding lots of demerits. See | 
|  | 260 | +	 * common_codepoints.txt for the actual list used. | 
|  | 261 | +	 * | 
|  | 262 | +	 * So, why give extra demerits for ASCII punctuation characters? It's | 
|  | 263 | +	 * because there are some text encodings, like UTF-7, HZ, and ISO-2022, | 
|  | 264 | +	 * which deliberately only use bytes in the ASCII range. When | 
|  | 265 | +	 * misinterpreted as ASCII/UTF-8, strings in these encodings will | 
|  | 266 | +	 * have an unusually high number of ASCII punctuation characters. | 
|  | 267 | +	 * So giving extra demerits for such characters will improve | 
|  | 268 | +	 * detection accuracy for UTF-7 and similar encodings. | 
|  | 269 | +	 * | 
|  | 270 | +	 * Finally, why 1 demerit for all other characters? That penalizes | 
|  | 271 | +	 * long strings, meaning we will tend to choose a candidate encoding | 
|  | 272 | +	 * in which the input string decodes to a smaller number of | 
|  | 273 | +	 * codepoints. That prevents single-byte encodings in which almost | 
|  | 274 | +	 * every possible input byte decodes to a 'common' codepoint from | 
|  | 275 | +	 * being favored too much. */ | 
| 246 | 276 | 	if (c == MBFL_BAD_INPUT) { | 
| 247 | 277 | 		data->num_illegalchars++; | 
| 248 |  | -	} else if (c < 0x9 || (c >= 0xE && c <= 0x1F) || (c >= 0xE000 && c <= 0xF8FF) || c >= 0xF0000) { | 
| 249 |  | -		/* Otherwise, count how many control characters and 'private use' | 
| 250 |  | -		 * codepoints we see. Those are rarely used and may indicate that | 
| 251 |  | -		 * the candidate encoding is not the right one. */ | 
| 252 |  | -		data->score += 10; | 
| 253 |  | -	} else if ((c >= 0x21 && c <= 0x2F) || (c >= 0x3A && c <= 0x40) || (c >= 0x5B && c <= 0x60)) { | 
| 254 |  | -		/* Punctuation is also less common than letters/digits; further, if | 
| 255 |  | -		 * text in ISO-2022 or similar encodings is mistakenly identified as | 
| 256 |  | -		 * ASCII or UTF-8, the misinterpreted string will tend to have an | 
| 257 |  | -		 * unusually high density of ASCII punctuation characters. */ | 
| 258 |  | -		data->score++; | 
|  | 278 | +	} else if (c > 0xFFFF) { | 
|  | 279 | +		data->score += 40; | 
|  | 280 | +	} else if (c >= 0x21 && c <= 0x2F) { | 
|  | 281 | +		data->score += 6; | 
|  | 282 | +	} else if ((rare_codepoint_bitvec[c >> 5] >> (c & 0x1F)) & 1) { | 
|  | 283 | +		data->score += 30; | 
|  | 284 | +	} else { | 
|  | 285 | +		data->score += 1; | 
| 259 | 286 | 	} | 
| 260 | 287 | 	return 0; | 
| 261 | 288 | } | 
|  | 
0 commit comments