Kim

Douglas Crockford · Douglas Crockford · commit 327e9a56fd2a · 2013-04-19T11:10:37.000-07:00
diff --git a/Kim.java b/Kim.java
@@ -0,0 +1,374 @@
+package org.json;
+
+
+/*
+ Copyright (c) 2013 JSON.org
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ The Software shall be used for Good, not Evil.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ */
+
+/**
+ * Kim makes immutable eight bit Unicode strings. If the MSB of a byte is set,
+ * then the next byte is a continuation byte. The last byte of a character
+ * never has the MSB reset. Every byte that is not the last byte has the MSB
+ * set. Kim stands for "Keep it minimal". A Unicode character is never longer
+ * than 3 bytes. Every byte contributes 7 bits to the character. ASCII is
+ * unmodified.
+ *
+ *                  Kim             UTF-8
+ * one byte         U+007F          U+007F
+ * two bytes        U+3FFF          U+07FF
+ * three bytes      U+10FFF         U+FFFF
+ * four bytes                       U+10FFFF
+ *
+ * Characters in the ranges U+0800..U+3FFF and U+10000..U+10FFFF will be one
+ * byte smaller when encoded in Kim compared to UTF-8.
+ *
+ * Kim is beneficial when using scripts such as Old South Arabian, Aramaic,
+ * Avestan, Balinese, Batak, Bopomofo, Buginese, Buhid, Carian, Cherokee,
+ * Coptic, Cyrillic, Deseret, Egyptian Hieroglyphs, Ethiopic, Georgian,
+ * Glagolitic, Gothic, Hangul Jamo, Hanunoo, Hiragana, Kanbun, Kaithi,
+ * Kannada, Katakana, Kharoshthi, Khmer, Lao, Lepcha, Limbu, Lycian, Lydian,
+ * Malayalam, Mandaic, Meroitic, Miao, Mongolian, Myanmar, New Tai Lue,
+ * Ol Chiki, Old Turkic, Oriya, Osmanya, Pahlavi, Parthian, Phags-Pa,
+ * Phoenician, Samaritan, Sharada, Sinhala, Sora Sompeng, Tagalog, Tagbanwa,
+ * Takri, Tai Le, Tai Tham, Tamil, Telugu, Thai, Tibetan, Tifinagh, UCAS.
+ *
+ * A kim object can be constructed from an ordinary UTF-16 string, or from a
+ * byte array. A kim object can produce a UTF-16 string.
+ *
+ * As with UTF-8, Kim can be sorted, and it is possible to detect character
+ * boundaries within a byte sequence. UTF-8 is one of the world's great
+ * inventions. While Kim is more efficient, it is not clear that it is worth
+ * the expense of transition.
+ *
+ * @version 2013-04-18
+ */
+public class Kim {
+
+    /**
+     * The byte array containing the kim's content.
+     */
+    private byte[] bytes = null;
+
+    /**
+     * The kim's hashcode, conforming to Java's hashcode recommendations.
+     */
+    private int hashcode = 0;
+
+    /**
+     * The number of bytes in the kim. The number of bytes can be as much as
+     * three times the number of characters.
+     */
+    public int length = 0;
+
+    /**
+     * The memoization of toString().
+     */
+    private String string = null;
+
+    /**
+     * Make a kim from a portion of a byte array.
+     *
+     * @param bytes
+     *            A byte array.
+     * @param from
+     *            The index of the first byte.
+     * @param thru
+     *            The index of the last byte plus one.
+     */
+    public Kim(byte[] bytes, int from, int thru) {
+
+// As the bytes are copied into the new kim, a hashcode is computed using a
+// modified Fletcher code.
+
+        int sum = 1;
+        int value;
+        this.hashcode = 0;
+        this.length = thru - from;
+        if (this.length > 0) {
+            this.bytes = new byte[this.length];
+            for (int at = 0; at < this.length; at += 1) {
+                value = (int) bytes[at + from] & 0xFF;
+                sum += value;
+                this.hashcode += sum;
+                this.bytes[at] = (byte) value;
+            }
+            this.hashcode += sum << 16;
+        }
+    }
+
+    /**
+     * Make a kim from a byte array.
+     *
+     * @param bytes
+     *            The byte array.
+     * @param length
+     *            The number of bytes.
+     */
+    public Kim(byte[] bytes, int length) {
+        this(bytes, 0, length);
+    }
+
+    /**
+     * Make a new kim from a substring of an existing kim. The coordinates are
+     * in byte units, not character units.
+     *
+     * @param kim
+     *            The source of bytes.
+     * @param from
+     *            The point at which to take bytes.
+     * @param thru
+     *            The point at which to stop taking bytes.
+     * @return the substring
+     */
+    public Kim(Kim kim, int from, int thru) {
+        this(kim.bytes, from, thru);
+    }
+
+    /**
+     * Make a kim from a string.
+     *
+     * @param string
+     *            The string.
+     * @throws JSONException
+     *             if surrogate pair mismatch.
+     */
+    public Kim(String string) throws JSONException {
+        int stringLength = string.length();
+        this.hashcode = 0;
+        this.length = 0;
+
+// First pass: Determine the length of the kim, allowing for the UTF-16
+// to UTF-32 conversion, and then the UTF-32 to kim conversion.
+
+        if (stringLength > 0) {
+            for (int i = 0; i < stringLength; i += 1) {
+                int c = string.charAt(i);
+                if (c <= 0x7F) {
+                    this.length += 1;
+                } else if (c <= 0x3FFF) {
+                    this.length += 2;
+                } else {
+                    if (c >= 0xD800 && c <= 0xDFFF) {
+                        i += 1;
+                        int d = string.charAt(i);
+                        if (c > 0xDBFF || d < 0xDC00 || d > 0xDFFF) {
+                            throw new JSONException("Bad UTF16");
+                        }
+                    }
+                    this.length += 3;
+                }
+            }
+
+// Second pass: Allocate a byte array and fill that array with the conversion
+// while computing the hashcode.
+
+            this.bytes = new byte[length];
+            int at = 0;
+            int b;
+            int sum = 1;
+            for (int i = 0; i < stringLength; i += 1) {
+                int character = string.charAt(i);
+                if (character <= 0x7F) {
+                    bytes[at] = (byte) character;
+                    sum += character;
+                    this.hashcode += sum;
+                    at += 1;
+                } else if (character <= 0x3FFF) {
+                    b = 0x80 | (character >>> 7);
+                    bytes[at] = (byte) b;
+                    sum += b;
+                    this.hashcode += sum;
+                    at += 1;
+                    b = character & 0x7F;
+                    bytes[at] = (byte) b;
+                    sum += b;
+                    this.hashcode += sum;
+                    at += 1;
+                } else {
+                    if (character >= 0xD800 && character <= 0xDBFF) {
+                        i += 1;
+                        character = (((character & 0x3FF) << 10) | (string
+                                .charAt(i) & 0x3FF)) + 65536;
+                    }
+                    b = 0x80 | (character >>> 14);
+                    bytes[at] = (byte) b;
+                    sum += b;
+                    this.hashcode += sum;
+                    at += 1;
+                    b = 0x80 | ((character >>> 7) & 0xFF);
+                    bytes[at] = (byte) b;
+                    sum += b;
+                    this.hashcode += sum;
+                    at += 1;
+                    b = character & 0x7F;
+                    bytes[at] = (byte) b;
+                    sum += b;
+                    this.hashcode += sum;
+                    at += 1;
+                }
+            }
+            this.hashcode += sum << 16;
+        }
+    }
+
+    /**
+     * Returns the character at the specified index. The index refers to byte
+     * values and ranges from 0 to length - 1. The index of the next character
+     * is at index + Kim.characterSize(kim.characterAt(index)).
+     *
+     * @param at
+     *            the index of the char value. The first character is at 0.
+     * @returns a Unicode character between 0 and 0x10FFFF.
+     * @throws JSONException
+     *             if at does not point to a valid character.
+     */
+    public int characterAt(int at) throws JSONException {
+        int c = get(at);
+        if ((c & 0x80) == 0) {
+            return c;
+        }
+        int character;
+        int c1 = get(at + 1);
+        if ((c1 & 0x80) == 0) {
+            character = ((c & 0x7F) << 7) | c1;
+            if (character > 0x7F) {
+                return character;
+            }
+        } else {
+            int c2 = get(at + 2);
+            character = ((c & 0x7F) << 14) | ((c1 & 0x7F) << 7) | c2;
+            if ((c2 & 0x80) == 0 && character > 0x3FFF && character <= 0x10FFFF
+                    && (character < 0xD800 || character > 0xDFFF)) {
+                return character;
+            }
+        }
+        throw new JSONException("Bad character at " + at);
+    }
+
+    /**
+     * Returns the number of bytes needed to contain the character in Kim
+     * format.
+     *
+     * @param character
+     *            a Unicode character between 0 and 0x10FFFF.
+     * @return 1, 2, or 3
+     * @throws JSONException
+     *             if the character is not representable in a kim.
+     */
+    public static int characterSize(int character) throws JSONException {
+        if (character < 0 || character > 0x10FFFF) {
+            throw new JSONException("Bad character " + character);
+        }
+        return character <= 0x7F ? 1 : character <= 0x3FFF ? 2 : 3;
+    }
+
+    /**
+     * Copy the contents of this kim to a byte array.
+     *
+     * @param bytes
+     *            A byte array of sufficient size.
+     * @param at
+     *            The position within the byte array to take the byes.
+     * @return The position immediately after the copy.
+     */
+    public int copy(byte[] bytes, int at) {
+        System.arraycopy(this.bytes, 0, bytes, at, this.length);
+        return at + this.length;
+    }
+
+    /**
+     * Two kim objects containing exactly the same bytes in the same order are
+     * equal to each other.
+     *
+     * @param obj
+     *            the other kim with which to compare.
+     * @returns true if this and obj are both kim objects containing identical
+     *          byte sequences.
+     */
+    public boolean equals(Object obj) {
+        if (!(obj instanceof Kim)) {
+            return false;
+        }
+        Kim that = (Kim) obj;
+        if (this == that) {
+            return true;
+        }
+        if (this.hashcode != that.hashcode) {
+            return false;
+        }
+        return java.util.Arrays.equals(this.bytes, that.bytes);
+    }
+
+    /**
+     *
+     * @param at
+     *            The position of the byte. The first byte is at 0.
+     * @return The byte.
+     * @throws JSONException
+     *             if there is no byte at that position.
+     */
+    public int get(int at) throws JSONException {
+        if (at < 0 || at > this.length) {
+            throw new JSONException("Bad character at " + at);
+        }
+        return ((int) this.bytes[at]) & 0xFF;
+    }
+
+    /**
+     * Returns a hash code value for the kim.
+     */
+    public int hashCode() {
+        return this.hashcode;
+    }
+
+    /**
+     * Produce a UTF-16 String from this kim. The number of codepoints in the
+     * string will not be greater than the number of bytes in the kim, although
+     * it could be less.
+     *
+     * @return The string. A kim memoizes its string representation.
+     * @throws JSONException
+     *             if the kim is invalid.
+     */
+    public String toString() throws JSONException {
+        if (this.string == null) {
+            int c;
+            int length = 0;
+            char chars[] = new char[this.length];
+            for (int at = 0; at < this.length; at += characterSize(c)) {
+                c = this.characterAt(at);
+                if (c < 0x10000) {
+                    chars[length] = (char) c;
+                    length += 1;
+                } else {
+                    chars[length] = (char) (0xD800 | ((c - 0x10000) >>> 10));
+                    length += 1;
+                    chars[length] = (char) (0xDC00 | (c & 0x03FF));
+                    length += 1;
+                }
+            }
+            this.string = new String(chars, 0, length);
+        }
+        return this.string;
+    }
+}