@@ -755,100 +755,113 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
755
755
756
756
PyObject * PyCodec_XMLCharRefReplaceErrors (PyObject * exc )
757
757
{
758
- if (PyObject_TypeCheck (exc , (PyTypeObject * )PyExc_UnicodeEncodeError )) {
759
- PyObject * restuple ;
760
- PyObject * object ;
761
- Py_ssize_t i ;
762
- Py_ssize_t start ;
763
- Py_ssize_t end ;
764
- PyObject * res ;
765
- Py_UCS1 * outp ;
766
- Py_ssize_t ressize ;
767
- Py_UCS4 ch ;
768
- if (PyUnicodeEncodeError_GetStart (exc , & start ))
769
- return NULL ;
770
- if (PyUnicodeEncodeError_GetEnd (exc , & end ))
771
- return NULL ;
772
- if (!(object = PyUnicodeEncodeError_GetObject (exc )))
773
- return NULL ;
774
- if (end - start > PY_SSIZE_T_MAX / (2 + 7 + 1 ))
775
- end = start + PY_SSIZE_T_MAX / (2 + 7 + 1 );
776
- for (i = start , ressize = 0 ; i < end ; ++ i ) {
777
- /* object is guaranteed to be "ready" */
778
- ch = PyUnicode_READ_CHAR (object , i );
779
- if (ch < 10 )
780
- ressize += 2 + 1 + 1 ;
781
- else if (ch < 100 )
782
- ressize += 2 + 2 + 1 ;
783
- else if (ch < 1000 )
784
- ressize += 2 + 3 + 1 ;
785
- else if (ch < 10000 )
786
- ressize += 2 + 4 + 1 ;
787
- else if (ch < 100000 )
788
- ressize += 2 + 5 + 1 ;
789
- else if (ch < 1000000 )
790
- ressize += 2 + 6 + 1 ;
791
- else
792
- ressize += 2 + 7 + 1 ;
758
+ if (!PyObject_TypeCheck (exc , (PyTypeObject * )PyExc_UnicodeEncodeError )) {
759
+ wrong_exception_type (exc );
760
+ return NULL ;
761
+ }
762
+
763
+ PyObject * obj ;
764
+ Py_ssize_t objlen , start , end , slen ;
765
+ if (_PyUnicodeError_GetParams (exc ,
766
+ & obj , & objlen ,
767
+ & start , & end , & slen , false) < 0 )
768
+ {
769
+ return NULL ;
770
+ }
771
+
772
+ // The number of characters that each character 'ch' contributes
773
+ // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}
774
+ // and will be formatted as "&#" + DIGITS + ";". Since the Unicode
775
+ // range is below 10^7, each "block" requires at most 2 + 7 + 1
776
+ // characters.
777
+ if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1 )) {
778
+ end = start + PY_SSIZE_T_MAX / (2 + 7 + 1 );
779
+ end = Py_MIN (end , objlen );
780
+ slen = Py_MAX (0 , end - start );
781
+ }
782
+
783
+ Py_ssize_t ressize = 0 ;
784
+ for (Py_ssize_t i = start ; i < end ; ++ i ) {
785
+ /* object is guaranteed to be "ready" */
786
+ Py_UCS4 ch = PyUnicode_READ_CHAR (obj , i );
787
+ if (ch < 10 ) {
788
+ ressize += 2 + 1 + 1 ;
793
789
}
794
- /* allocate replacement */
795
- res = PyUnicode_New (ressize , 127 );
796
- if (res == NULL ) {
797
- Py_DECREF (object );
798
- return NULL ;
790
+ else if (ch < 100 ) {
791
+ ressize += 2 + 2 + 1 ;
799
792
}
800
- outp = PyUnicode_1BYTE_DATA (res );
801
- /* generate replacement */
802
- for (i = start ; i < end ; ++ i ) {
803
- int digits ;
804
- int base ;
805
- ch = PyUnicode_READ_CHAR (object , i );
806
- * outp ++ = '&' ;
807
- * outp ++ = '#' ;
808
- if (ch < 10 ) {
809
- digits = 1 ;
810
- base = 1 ;
811
- }
812
- else if (ch < 100 ) {
813
- digits = 2 ;
814
- base = 10 ;
815
- }
816
- else if (ch < 1000 ) {
817
- digits = 3 ;
818
- base = 100 ;
819
- }
820
- else if (ch < 10000 ) {
821
- digits = 4 ;
822
- base = 1000 ;
823
- }
824
- else if (ch < 100000 ) {
825
- digits = 5 ;
826
- base = 10000 ;
827
- }
828
- else if (ch < 1000000 ) {
829
- digits = 6 ;
830
- base = 100000 ;
831
- }
832
- else {
833
- digits = 7 ;
834
- base = 1000000 ;
835
- }
836
- while (digits -- > 0 ) {
837
- * outp ++ = '0' + ch /base ;
838
- ch %= base ;
839
- base /= 10 ;
840
- }
841
- * outp ++ = ';' ;
793
+ else if (ch < 1000 ) {
794
+ ressize += 2 + 3 + 1 ;
795
+ }
796
+ else if (ch < 10000 ) {
797
+ ressize += 2 + 4 + 1 ;
798
+ }
799
+ else if (ch < 100000 ) {
800
+ ressize += 2 + 5 + 1 ;
801
+ }
802
+ else if (ch < 1000000 ) {
803
+ ressize += 2 + 6 + 1 ;
804
+ }
805
+ else {
806
+ assert (ch < 10000000 );
807
+ ressize += 2 + 7 + 1 ;
842
808
}
843
- assert (_PyUnicode_CheckConsistency (res , 1 ));
844
- restuple = Py_BuildValue ("(Nn)" , res , end );
845
- Py_DECREF (object );
846
- return restuple ;
847
809
}
848
- else {
849
- wrong_exception_type (exc );
810
+
811
+ /* allocate replacement */
812
+ PyObject * res = PyUnicode_New (ressize , 127 );
813
+ if (res == NULL ) {
814
+ Py_DECREF (obj );
850
815
return NULL ;
851
816
}
817
+ Py_UCS1 * outp = PyUnicode_1BYTE_DATA (res );
818
+ /* generate replacement */
819
+ for (Py_ssize_t i = start ; i < end ; ++ i ) {
820
+ int digits , base ;
821
+ Py_UCS4 ch = PyUnicode_READ_CHAR (obj , i );
822
+ if (ch < 10 ) {
823
+ digits = 1 ;
824
+ base = 1 ;
825
+ }
826
+ else if (ch < 100 ) {
827
+ digits = 2 ;
828
+ base = 10 ;
829
+ }
830
+ else if (ch < 1000 ) {
831
+ digits = 3 ;
832
+ base = 100 ;
833
+ }
834
+ else if (ch < 10000 ) {
835
+ digits = 4 ;
836
+ base = 1000 ;
837
+ }
838
+ else if (ch < 100000 ) {
839
+ digits = 5 ;
840
+ base = 10000 ;
841
+ }
842
+ else if (ch < 1000000 ) {
843
+ digits = 6 ;
844
+ base = 100000 ;
845
+ }
846
+ else {
847
+ assert (ch < 10000000 );
848
+ digits = 7 ;
849
+ base = 1000000 ;
850
+ }
851
+ * outp ++ = '&' ;
852
+ * outp ++ = '#' ;
853
+ while (digits -- > 0 ) {
854
+ assert (base >= 1 );
855
+ * outp ++ = '0' + ch / base ;
856
+ ch %= base ;
857
+ base /= 10 ;
858
+ }
859
+ * outp ++ = ';' ;
860
+ }
861
+ assert (_PyUnicode_CheckConsistency (res , 1 ));
862
+ PyObject * restuple = Py_BuildValue ("(Nn)" , res , end );
863
+ Py_DECREF (obj );
864
+ return restuple ;
852
865
}
853
866
854
867
PyObject * PyCodec_BackslashReplaceErrors (PyObject * exc )
0 commit comments