@@ -19,9 +19,12 @@ const HTML_ENTITIES: Record<string, string> = {
1919} ;
2020
2121/**
22- * Pattern to match HTML entities we want to decode
22+ * Pattern to match HTML entities we want to decode:
23+ * - Named entities: &amp; &lt; &gt; &quot; &apos; &#39;
24+ * - Decimal numeric entities: &#123;
25+ * - Hex numeric entities: &#xEF; &#xAB;
2326 */
24- const ENTITY_PATTERN = / & (?: a m p | l t | g t | q u o t | a p o s | n b s p | # 3 9 ) ; / g;
27+ const ENTITY_PATTERN = / & (?: a m p | l t | g t | q u o t | a p o s | n b s p | # 3 9 | # \d + | # x [ 0 - 9 a - f A - F ] + ) ; / g;
2528
2629/**
2730 * Maximum iterations to prevent infinite loops
@@ -52,7 +55,27 @@ export const decodeHtmlEntities = (text: string): string => {
5255 while ( iterations < MAX_DECODE_ITERATIONS ) {
5356 const decoded = result . replaceAll (
5457 ENTITY_PATTERN ,
55- ( match ) => HTML_ENTITIES [ match ] ?? match
58+ ( match ) => {
59+ // Check named entities first
60+ if ( HTML_ENTITIES [ match ] ) {
61+ return HTML_ENTITIES [ match ] ;
62+ }
63+ // Handle hex numeric entities: &#xEF; -> ï
64+ if ( match . startsWith ( "&#x" ) || match . startsWith ( "&#X" ) ) {
65+ const codePoint = parseInt ( match . slice ( 3 , - 1 ) , 16 ) ;
66+ if ( ! isNaN ( codePoint ) ) {
67+ return String . fromCodePoint ( codePoint ) ;
68+ }
69+ }
70+ // Handle decimal numeric entities: &#239; -> ï
71+ if ( match . startsWith ( "&#" ) ) {
72+ const codePoint = parseInt ( match . slice ( 2 , - 1 ) , 10 ) ;
73+ if ( ! isNaN ( codePoint ) ) {
74+ return String . fromCodePoint ( codePoint ) ;
75+ }
76+ }
77+ return match ;
78+ }
5679 ) ;
5780
5881 // No more entities to decode
0 commit comments