Skip to content

Commit b5ff87c

Browse files
committed
Fix mbstring support for CP1252 encoding
It's a bit surprising how much was broken here:

- The identify filter was utterly and completely wrong.
- Instead of handling invalid CP1252 bytes as specified by `mb_substitute_character`, it would convert them to Unicode 0xFFFD (the generic replacement character).
- When converting ISO-8859-1 to CP1252, invalid ISO-8859-1 bytes would be passed through silently.
- Unicode codepoints in the range 0x80-0x9F were converted to CP1252 bytes 0x80-0x9F, which is wrong.
- Unicode codepoint 0xFFFD was converted to CP1252 0x9D (the highest table slot that held the 0xFFFD placeholder), which is very wrong.

Also clean up some unneeded code, and make the conversion table consistent with the others by using zero as an 'invalid' marker, rather than 0xFFFD.
1 parent 2ce15d9 commit b5ff87c

File tree

2 files changed

+17
-34
lines changed

2 files changed

+17
-34
lines changed

ext/mbstring/libmbfl/filters/mbfilter_cp1252.c

Lines changed: 13 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -74,32 +74,22 @@ const struct mbfl_convert_vtbl vtbl_wchar_cp1252 = {
7474

7575
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
7676

77-
/*
78-
* wchar => cp1252
79-
*/
8077
int mbfl_filt_conv_wchar_cp1252(int c, mbfl_convert_filter *filter)
8178
{
82-
int s=-1, n;
79+
int s = -1;
8380

8481
if (c >= 0x100) {
85-
/* look it up from the cp1252 table */
86-
s = -1;
87-
n = 31;
88-
while (n >= 0) {
89-
if (c == cp1252_ucs_table[n] && c != 0xfffe) {
82+
/* Look it up from the CP1252 table */
83+
for (int n = 31; n >= 0; n--) {
84+
if (c == cp1252_ucs_table[n]) {
9085
s = 0x80 + n;
9186
break;
9287
}
93-
n--;
94-
}
95-
if (s <= 0 && (c & ~MBFL_WCSPLANE_MASK) == MBFL_WCSPLANE_8859_1)
96-
{
97-
s = c & MBFL_WCSPLANE_MASK;
9888
}
99-
}
100-
else if (c >= 0 && c < 0x100) {
89+
} else if (c <= 0x7F || c >= 0xA0) {
10190
s = c;
10291
}
92+
10393
if (s >= 0) {
10494
CK((*filter->output_function)(s, filter->data));
10595
} else {
@@ -108,15 +98,15 @@ int mbfl_filt_conv_wchar_cp1252(int c, mbfl_convert_filter *filter)
10898
return c;
10999
}
110100

111-
/*
112-
* cp1252 => wchar
113-
*/
114101
int mbfl_filt_conv_cp1252_wchar(int c, mbfl_convert_filter *filter)
115102
{
116103
int s;
117104

118-
if (c >= 0x80 && c < 0xa0) {
105+
if (c >= 0x80 && c < 0xA0) {
119106
s = cp1252_ucs_table[c - 0x80];
107+
if (!s) {
108+
s = c | MBFL_WCSGROUP_THROUGH;
109+
}
120110
} else {
121111
s = c;
122112
}
@@ -126,17 +116,10 @@ int mbfl_filt_conv_cp1252_wchar(int c, mbfl_convert_filter *filter)
126116
return c;
127117
}
128118

129-
/* We only distinguish the MS extensions to ISO-8859-1.
130-
* Actually, this is pretty much a NO-OP, since the identification
131-
* system doesn't allow us to discriminate between a positive match,
132-
* a possible match and a definite non-match.
133-
* The problem here is that cp1252 looks like SJIS for certain chars.
134-
* */
135119
static int mbfl_filt_ident_cp1252(int c, mbfl_identify_filter *filter)
136120
{
137-
if (c >= 0x80 && c < 0xa0)
138-
filter->flag = 0;
139-
else
140-
filter->flag = 1; /* not it */
121+
if (c >= 0x80 && c < 0xA0 && !cp1252_ucs_table[c - 0x80]) {
122+
filter->flag = 1;
123+
}
141124
return c;
142125
}

ext/mbstring/libmbfl/filters/unicode_table_cp1252.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,9 @@
3232
* as it only covers this range, while the rest cover 0xa0 onwards */
3333

3434
static const unsigned short cp1252_ucs_table[] = {
35-
0x20ac,0xfffd,0x201a,0x0192,0x201e,0x2026,0x2020,0x2021,
36-
0x02c6,0x2030,0x0160,0x2039,0x0152,0xfffd,0x017d,0xfffd,
37-
0xfffd,0x2018,0x2019,0x201c,0x201d,0x2022,0x2013,0x2014,
38-
0x02dc,0x2122,0x0161,0x203a,0x0153,0xfffd,0x017e,0x0178
35+
0x20ac,0x0000,0x201a,0x0192,0x201e,0x2026,0x2020,0x2021,
36+
0x02c6,0x2030,0x0160,0x2039,0x0152,0x0000,0x017d,0x0000,
37+
0x0000,0x2018,0x2019,0x201c,0x201d,0x2022,0x2013,0x2014,
38+
0x02dc,0x2122,0x0161,0x203a,0x0153,0x0000,0x017e,0x0178
3939
};
4040
#endif /* UNICODE_TABLE_CP1252_H */

0 commit comments

Comments
 (0)