8357289: Break down the String constructor into smaller methods

wenshao · wenshao · commit 839cede1a46b · 2025-06-27T14:06:12.000Z
Reviewed-by: liach, rriggs
diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java
@@ -553,135 +553,125 @@ public String(byte[] bytes, int offset, int length, Charset charset) {
      * disambiguate it against other similar methods of this class.
      */
     private String(Charset charset, byte[] bytes, int offset, int length) {
+        String str; // decoded result; assigned on every branch, then handed to this(str)
         if (length == 0) {
-            this.value = "".value;
-            this.coder = "".coder;
+            str = ""; // empty range: nothing to decode
         } else if (charset == UTF_8.INSTANCE) {
-            if (COMPACT_STRINGS) {
-                int dp = StringCoding.countPositives(bytes, offset, length);
-                if (dp == length) {
-                    this.value = Arrays.copyOfRange(bytes, offset, offset + length);
-                    this.coder = LATIN1;
-                    return;
+            str = utf8(bytes, offset, length);
+        } else if (charset == ISO_8859_1.INSTANCE) {
+            str = iso88591(bytes, offset, length);
+        } else if (charset == US_ASCII.INSTANCE) {
+            str = ascii(bytes, offset, length);
+        } else {
+            str = decode(charset, bytes, offset, length); // every other charset goes through its decoder
+        }
+        this(str); // this(...) after other statements: relies on flexible constructor bodies
+    }
+
+    private static String utf8(byte[] bytes, int offset, int length) { // UTF-8 decode: LATIN1 if it fits, else UTF16
+        if (COMPACT_STRINGS) {
+            int dp = StringCoding.countPositives(bytes, offset, length); // presumably the leading run of non-negative bytes
+            if (dp == length) {
+                return new String(Arrays.copyOfRange(bytes, offset, offset + length), LATIN1);
+            }
+            // Decode with a stable copy, to be the result if the decoded length is the same
+            byte[] latin1 = Arrays.copyOfRange(bytes, offset, offset + length);
+            int sp = dp;            // first dp bytes are already in the copy
+            while (sp < length) {
+                int b1 = latin1[sp++];
+                if (b1 >= 0) {
+                    latin1[dp++] = (byte) b1;
+                    continue;
                 }
-                // Decode with a stable copy, to be the result if the decoded length is the same
-                byte[] latin1 = Arrays.copyOfRange(bytes, offset, offset + length);
-                int sp = dp;            // first dp bytes are already in the copy
-                while (sp < length) {
-                    int b1 = latin1[sp++];
-                    if (b1 >= 0) {
-                        latin1[dp++] = (byte)b1;
+                if ((b1 & 0xfe) == 0xc2 && sp < length) { // b1 either 0xc2 or 0xc3
+                    int b2 = latin1[sp];
+                    if (b2 < -64) { // continuation bytes are always negative values in the range -128 to -65
+                        latin1[dp++] = (byte) decode2(b1, b2);
+                        sp++;
                         continue;
                     }
-                    if ((b1 & 0xfe) == 0xc2 && sp < length) { // b1 either 0xc2 or 0xc3
-                        int b2 = latin1[sp];
-                        if (b2 < -64) { // continuation bytes are always negative values in the range -128 to -65
-                            latin1[dp++] = (byte)decode2(b1, b2);
-                            sp++;
-                            continue;
-                        }
-                    }
-                    // anything not a latin1, including the REPL
-                    // we have to go with the utf16
-                    sp--;
-                    break;
-                }
-                if (sp == length) {
-                    if (dp != latin1.length) {
-                        latin1 = Arrays.copyOf(latin1, dp);
-                    }
-                    this.value = latin1;
-                    this.coder = LATIN1;
-                    return;
-                }
-                byte[] utf16 = StringUTF16.newBytesFor(length);
-                StringLatin1.inflate(latin1, 0, utf16, 0, dp);
-                dp = decodeUTF8_UTF16(latin1, sp, length, utf16, dp, true);
-                if (dp != length) {
-                    utf16 = Arrays.copyOf(utf16, dp << 1);
                 }
-                this.value = utf16;
-                this.coder = UTF16;
-            } else { // !COMPACT_STRINGS
-                byte[] dst = StringUTF16.newBytesFor(length);
-                int dp = decodeUTF8_UTF16(bytes, offset, offset + length, dst, 0, true);
-                if (dp != length) {
-                    dst = Arrays.copyOf(dst, dp << 1);
+                // Anything that is not Latin-1 (including REPL) cannot stay in latin1;
+                // the result has to be built as UTF-16 instead.
+                sp--; // rewind so b1 is read again by decodeUTF8_UTF16 below
+                break;
+            }
+            if (sp == length) { // loop consumed all input without breaking: result fits in latin1
+                if (dp != latin1.length) {
+                    latin1 = Arrays.copyOf(latin1, dp);
                 }
-                this.value = dst;
-                this.coder = UTF16;
+                return new String(latin1, LATIN1);
             }
-        } else if (charset == ISO_8859_1.INSTANCE) {
-            if (COMPACT_STRINGS) {
-                this.value = Arrays.copyOfRange(bytes, offset, offset + length);
-                this.coder = LATIN1;
-            } else {
-                this.value = StringLatin1.inflate(bytes, offset, length);
-                this.coder = UTF16;
+            byte[] utf16 = StringUTF16.newBytesFor(length);
+            StringLatin1.inflate(latin1, 0, utf16, 0, dp); // seed utf16 with the dp chars decoded so far
+            dp = decodeUTF8_UTF16(latin1, sp, length, utf16, dp, true); // resume at sp, where the loop stopped
+            if (dp != length) {
+                utf16 = Arrays.copyOf(utf16, dp << 1); // trim to dp chars, two bytes each
             }
-        } else if (charset == US_ASCII.INSTANCE) {
-            if (COMPACT_STRINGS && !StringCoding.hasNegatives(bytes, offset, length)) {
-                this.value = Arrays.copyOfRange(bytes, offset, offset + length);
-                this.coder = LATIN1;
-            } else {
-                byte[] dst = StringUTF16.newBytesFor(length);
-                int dp = 0;
-                while (dp < length) {
-                    int b = bytes[offset++];
-                    StringUTF16.putChar(dst, dp++, (b >= 0) ? (char) b : REPL);
-                }
-                this.value = dst;
-                this.coder = UTF16;
+            return new String(utf16, UTF16);
+        } else { // !COMPACT_STRINGS
+            byte[] dst = StringUTF16.newBytesFor(length);
+            int dp = decodeUTF8_UTF16(bytes, offset, offset + length, dst, 0, true); // reads the caller's array directly
+            if (dp != length) {
+                dst = Arrays.copyOf(dst, dp << 1);
             }
+            return new String(dst, UTF16);
+        }
+    }
+
+    private static String iso88591(byte[] bytes, int offset, int length) { // also reused by decode() for its ASCII fast path
+        if (COMPACT_STRINGS) { // compact: the byte range is copied verbatim and tagged LATIN1
+            return new String(Arrays.copyOfRange(bytes, offset, offset + length), LATIN1);
         } else {
-            // (1)We never cache the "external" cs, the only benefit of creating
-            // an additional StringDe/Encoder object to wrap it is to share the
-            // de/encode() method. These SD/E objects are short-lived, the young-gen
-            // gc should be able to take care of them well. But the best approach
-            // is still not to generate them if not really necessary.
-            // (2)The defensive copy of the input byte/char[] has a big performance
-            // impact, as well as the outgoing result byte/char[]. Need to do the
-            // optimization check of (sm==null && classLoader0==null) for both.
-            CharsetDecoder cd = charset.newDecoder();
-            // ArrayDecoder fastpaths
-            if (cd instanceof ArrayDecoder ad) {
-                // ascii
-                if (ad.isASCIICompatible() && !StringCoding.hasNegatives(bytes, offset, length)) {
-                    if (COMPACT_STRINGS) {
-                        this.value = Arrays.copyOfRange(bytes, offset, offset + length);
-                        this.coder = LATIN1;
-                        return;
-                    }
-                    this.value = StringLatin1.inflate(bytes, offset, length);
-                    this.coder = UTF16;
-                    return;
-                }
+            return new String(StringLatin1.inflate(bytes, offset, length), UTF16); // non-compact: inflated and tagged UTF16
+        }
+    }
+
+    private static String ascii(byte[] bytes, int offset, int length) {
+        // Compact strings and no negative bytes in range: copy verbatim, tag LATIN1.
+        if (COMPACT_STRINGS && !StringCoding.hasNegatives(bytes, offset, length)) {
+            return new String(Arrays.copyOfRange(bytes, offset, offset + length), LATIN1);
+        }
+        // Otherwise build UTF16, mapping every negative (non-ASCII) byte to REPL.
+        byte[] utf16 = StringUTF16.newBytesFor(length);
+        for (int i = 0; i < length; i++) {
+            byte b = bytes[offset + i];
+            StringUTF16.putChar(utf16, i, (b >= 0) ? (char) b : REPL);
+        }
+        return new String(utf16, UTF16);
+    }
 
+    private static String decode(Charset charset, byte[] bytes, int offset, int length) {
+        // (1)We never cache the "external" cs, the only benefit of creating
+        // an additional StringDe/Encoder object to wrap it is to share the
+        // de/encode() method. These SD/E objects are short-lived, the young-gen
+        // gc should be able to take care of them well. But the best approach
+        // is still not to generate them if not really necessary.
+        // (2)The defensive copy of the input byte/char[] has a big performance
+        // impact, as well as the outgoing result byte/char[]. Need to do the
+        // optimization check of (sm==null && classLoader0==null) for both.
+        CharsetDecoder cd = charset.newDecoder();
+        // ArrayDecoder fastpaths
+        if (cd instanceof ArrayDecoder ad) {
+            // ascii
+            if (ad.isASCIICompatible() && !StringCoding.hasNegatives(bytes, offset, length)) {
+                return iso88591(bytes, offset, length);
+            } else {
                 // fastpath for always Latin1 decodable single byte
                 if (COMPACT_STRINGS && ad.isLatin1Decodable()) {
                     byte[] dst = new byte[length];
                     ad.decodeToLatin1(bytes, offset, length, dst);
-                    this.value = dst;
-                    this.coder = LATIN1;
-                    return;
-                }
-
-                int en = scale(length, cd.maxCharsPerByte());
-                cd.onMalformedInput(CodingErrorAction.REPLACE)
-                        .onUnmappableCharacter(CodingErrorAction.REPLACE);
-                char[] ca = new char[en];
-                int clen = ad.decode(bytes, offset, length, ca);
-                if (COMPACT_STRINGS) {
-                    byte[] val = StringUTF16.compress(ca, 0, clen);;
-                    this.coder = StringUTF16.coderFromArrayLen(val, clen);
-                    this.value = val;
-                    return;
+                    return new String(dst, LATIN1);
+                } else {
+                    int en = scale(length, cd.maxCharsPerByte());
+                    cd.onMalformedInput(CodingErrorAction.REPLACE)
+                            .onUnmappableCharacter(CodingErrorAction.REPLACE);
+                    char[] ca = new char[en];
+                    int clen = ad.decode(bytes, offset, length, ca);
+                    return new String(ca, 0, clen, null);
                 }
-                coder = UTF16;
-                value = StringUTF16.toBytes(ca, 0, clen);
-                return;
             }
-
+        } else {
             // decode using CharsetDecoder
             int en = scale(length, cd.maxCharsPerByte());
             cd.onMalformedInput(CodingErrorAction.REPLACE)
@@ -694,14 +684,7 @@ private String(Charset charset, byte[] bytes, int offset, int length) {
                 // Substitution is enabled, so this shouldn't happen
                 throw new Error(x);
             }
-            if (COMPACT_STRINGS) {
-                byte[] val = StringUTF16.compress(ca, 0, caLen);
-                this.coder = StringUTF16.coderFromArrayLen(val, caLen);
-                this.value = val;
-                return;
-            }
-            coder = UTF16;
-            value = StringUTF16.toBytes(ca, 0, caLen);
+            return new String(ca, 0, caLen, null);
         }
     }