Skip to content

Commit 327e9a5

Browse files
author
Douglas Crockford
committed
Kim
1 parent a73066f commit 327e9a5

File tree

1 file changed

+374
-0
lines changed

1 file changed

+374
-0
lines changed

Kim.java

+374
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,374 @@
1+
package org.json;
2+
3+
4+
/*
5+
Copyright (c) 2013 JSON.org
6+
7+
Permission is hereby granted, free of charge, to any person obtaining a copy
8+
of this software and associated documentation files (the "Software"), to deal
9+
in the Software without restriction, including without limitation the rights
10+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11+
copies of the Software, and to permit persons to whom the Software is
12+
furnished to do so, subject to the following conditions:
13+
14+
The above copyright notice and this permission notice shall be included in all
15+
copies or substantial portions of the Software.
16+
17+
The Software shall be used for Good, not Evil.
18+
19+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25+
SOFTWARE.
26+
*/
27+
28+
/**
29+
* Kim makes immutable eight bit Unicode strings. If the MSB of a byte is set,
30+
* then the next byte is a continuation byte. The last byte of a character
31+
* never has the MSB set. Every byte that is not the last byte has the MSB
32+
* set. Kim stands for "Keep it minimal". A Unicode character is never longer
33+
* than 3 bytes. Every byte contributes 7 bits to the character. ASCII is
34+
* unmodified.
35+
*
36+
* Kim UTF-8
37+
* one byte U+007F U+007F
38+
* two bytes U+3FFF U+07FF
39+
* three bytes U+10FFFF U+FFFF
40+
* four bytes U+10FFFF
41+
*
42+
* Characters in the ranges U+0800..U+3FFF and U+10000..U+10FFFF will be one
43+
* byte smaller when encoded in Kim compared to UTF-8.
44+
*
45+
* Kim is beneficial when using scripts such as Old South Arabian, Aramaic,
46+
* Avestan, Balinese, Batak, Bopomofo, Buginese, Buhid, Carian, Cherokee,
47+
* Coptic, Cyrillic, Deseret, Egyptian Hieroglyphs, Ethiopic, Georgian,
48+
* Glagolitic, Gothic, Hangul Jamo, Hanunoo, Hiragana, Kanbun, Kaithi,
49+
* Kannada, Katakana, Kharoshthi, Khmer, Lao, Lepcha, Limbu, Lycian, Lydian,
50+
* Malayalam, Mandaic, Meroitic, Miao, Mongolian, Myanmar, New Tai Lue,
51+
* Ol Chiki, Old Turkic, Oriya, Osmanya, Pahlavi, Parthian, Phags-Pa,
52+
* Phoenician, Samaritan, Sharada, Sinhala, Sora Sompeng, Tagalog, Tagbanwa,
53+
* Takri, Tai Le, Tai Tham, Tamil, Telugu, Thai, Tibetan, Tifinagh, UCAS.
54+
*
55+
* A kim object can be constructed from an ordinary UTF-16 string, or from a
56+
* byte array. A kim object can produce a UTF-16 string.
57+
*
58+
* As with UTF-8, Kim can be sorted, and it is possible to detect character
59+
* boundaries within a byte sequence. UTF-8 is one of the world's great
60+
* inventions. While Kim is more efficient, it is not clear that it is worth
61+
* the expense of transition.
62+
*
63+
* @version 2013-04-18
64+
*/
65+
public class Kim {
66+
67+
/**
68+
* The byte array containing the kim's content.
69+
*/
70+
private byte[] bytes = null;
71+
72+
/**
73+
* The kim's hashcode, conforming to Java's hashcode recommendations.
74+
*/
75+
private int hashcode = 0;
76+
77+
/**
78+
* The number of bytes in the kim. The number of bytes can be as much as
79+
* three times the number of characters.
80+
*/
81+
public int length = 0;
82+
83+
/**
84+
* The memoization of toString().
85+
*/
86+
private String string = null;
87+
88+
/**
89+
* Make a kim from a portion of a byte array.
90+
*
91+
* @param bytes
92+
* A byte array.
93+
* @param from
94+
* The index of the first byte.
95+
* @param thru
96+
* The index of the last byte plus one.
97+
*/
98+
public Kim(byte[] bytes, int from, int thru) {
99+
100+
// As the bytes are copied into the new kim, a hashcode is computed using a
101+
// modified Fletcher code.
102+
103+
int sum = 1;
104+
int value;
105+
this.hashcode = 0;
106+
this.length = thru - from;
107+
if (this.length > 0) {
108+
this.bytes = new byte[this.length];
109+
for (int at = 0; at < this.length; at += 1) {
110+
value = (int) bytes[at + from] & 0xFF;
111+
sum += value;
112+
this.hashcode += sum;
113+
this.bytes[at] = (byte) value;
114+
}
115+
this.hashcode += sum << 16;
116+
}
117+
}
118+
119+
/**
120+
* Make a kim from a byte array.
121+
*
122+
* @param bytes
123+
* The byte array.
124+
* @param length
125+
* The number of bytes.
126+
*/
127+
public Kim(byte[] bytes, int length) {
128+
this(bytes, 0, length);
129+
}
130+
131+
/**
132+
* Make a new kim from a substring of an existing kim. The coordinates are
133+
* in byte units, not character units.
134+
*
135+
* @param kim
136+
* The source of bytes.
137+
* @param from
138+
* The point at which to take bytes.
139+
* @param thru
140+
* The point at which to stop taking bytes.
141+
* @return the substring
142+
*/
143+
public Kim(Kim kim, int from, int thru) {
144+
this(kim.bytes, from, thru);
145+
}
146+
147+
/**
148+
* Make a kim from a string.
149+
*
150+
* @param string
151+
* The string.
152+
* @throws JSONException
153+
* if surrogate pair mismatch.
154+
*/
155+
public Kim(String string) throws JSONException {
156+
int stringLength = string.length();
157+
this.hashcode = 0;
158+
this.length = 0;
159+
160+
// First pass: Determine the length of the kim, allowing for the UTF-16
161+
// to UTF-32 conversion, and then the UTF-32 to kim conversion.
162+
163+
if (stringLength > 0) {
164+
for (int i = 0; i < stringLength; i += 1) {
165+
int c = string.charAt(i);
166+
if (c <= 0x7F) {
167+
this.length += 1;
168+
} else if (c <= 0x3FFF) {
169+
this.length += 2;
170+
} else {
171+
if (c >= 0xD800 && c <= 0xDFFF) {
172+
i += 1;
173+
int d = string.charAt(i);
174+
if (c > 0xDBFF || d < 0xDC00 || d > 0xDFFF) {
175+
throw new JSONException("Bad UTF16");
176+
}
177+
}
178+
this.length += 3;
179+
}
180+
}
181+
182+
// Second pass: Allocate a byte array and fill that array with the conversion
183+
// while computing the hashcode.
184+
185+
this.bytes = new byte[length];
186+
int at = 0;
187+
int b;
188+
int sum = 1;
189+
for (int i = 0; i < stringLength; i += 1) {
190+
int character = string.charAt(i);
191+
if (character <= 0x7F) {
192+
bytes[at] = (byte) character;
193+
sum += character;
194+
this.hashcode += sum;
195+
at += 1;
196+
} else if (character <= 0x3FFF) {
197+
b = 0x80 | (character >>> 7);
198+
bytes[at] = (byte) b;
199+
sum += b;
200+
this.hashcode += sum;
201+
at += 1;
202+
b = character & 0x7F;
203+
bytes[at] = (byte) b;
204+
sum += b;
205+
this.hashcode += sum;
206+
at += 1;
207+
} else {
208+
if (character >= 0xD800 && character <= 0xDBFF) {
209+
i += 1;
210+
character = (((character & 0x3FF) << 10) | (string
211+
.charAt(i) & 0x3FF)) + 65536;
212+
}
213+
b = 0x80 | (character >>> 14);
214+
bytes[at] = (byte) b;
215+
sum += b;
216+
this.hashcode += sum;
217+
at += 1;
218+
b = 0x80 | ((character >>> 7) & 0xFF);
219+
bytes[at] = (byte) b;
220+
sum += b;
221+
this.hashcode += sum;
222+
at += 1;
223+
b = character & 0x7F;
224+
bytes[at] = (byte) b;
225+
sum += b;
226+
this.hashcode += sum;
227+
at += 1;
228+
}
229+
}
230+
this.hashcode += sum << 16;
231+
}
232+
}
233+
234+
/**
235+
* Returns the character at the specified index. The index refers to byte
236+
* values and ranges from 0 to length - 1. The index of the next character
237+
* is at index + Kim.characterSize(kim.characterAt(index)).
238+
*
239+
* @param at
240+
* the index of the char value. The first character is at 0.
241+
* @returns a Unicode character between 0 and 0x10FFFF.
242+
* @throws JSONException
243+
* if at does not point to a valid character.
244+
*/
245+
public int characterAt(int at) throws JSONException {
246+
int c = get(at);
247+
if ((c & 0x80) == 0) {
248+
return c;
249+
}
250+
int character;
251+
int c1 = get(at + 1);
252+
if ((c1 & 0x80) == 0) {
253+
character = ((c & 0x7F) << 7) | c1;
254+
if (character > 0x7F) {
255+
return character;
256+
}
257+
} else {
258+
int c2 = get(at + 2);
259+
character = ((c & 0x7F) << 14) | ((c1 & 0x7F) << 7) | c2;
260+
if ((c2 & 0x80) == 0 && character > 0x3FFF && character <= 0x10FFFF
261+
&& (character < 0xD800 || character > 0xDFFF)) {
262+
return character;
263+
}
264+
}
265+
throw new JSONException("Bad character at " + at);
266+
}
267+
268+
/**
269+
* Returns the number of bytes needed to contain the character in Kim
270+
* format.
271+
*
272+
* @param character
273+
* a Unicode character between 0 and 0x10FFFF.
274+
* @return 1, 2, or 3
275+
* @throws JSONException
276+
* if the character is not representable in a kim.
277+
*/
278+
public static int characterSize(int character) throws JSONException {
279+
if (character < 0 || character > 0x10FFFF) {
280+
throw new JSONException("Bad character " + character);
281+
}
282+
return character <= 0x7F ? 1 : character <= 0x3FFF ? 2 : 3;
283+
}
284+
285+
/**
286+
* Copy the contents of this kim to a byte array.
287+
*
288+
* @param bytes
289+
* A byte array of sufficient size.
290+
* @param at
291+
* The position within the byte array to take the byes.
292+
* @return The position immediately after the copy.
293+
*/
294+
public int copy(byte[] bytes, int at) {
295+
System.arraycopy(this.bytes, 0, bytes, at, this.length);
296+
return at + this.length;
297+
}
298+
299+
/**
300+
* Two kim objects containing exactly the same bytes in the same order are
301+
* equal to each other.
302+
*
303+
* @param obj
304+
* the other kim with which to compare.
305+
* @returns true if this and obj are both kim objects containing identical
306+
* byte sequences.
307+
*/
308+
public boolean equals(Object obj) {
309+
if (!(obj instanceof Kim)) {
310+
return false;
311+
}
312+
Kim that = (Kim) obj;
313+
if (this == that) {
314+
return true;
315+
}
316+
if (this.hashcode != that.hashcode) {
317+
return false;
318+
}
319+
return java.util.Arrays.equals(this.bytes, that.bytes);
320+
}
321+
322+
/**
323+
*
324+
* @param at
325+
* The position of the byte. The first byte is at 0.
326+
* @return The byte.
327+
* @throws JSONException
328+
* if there is no byte at that position.
329+
*/
330+
public int get(int at) throws JSONException {
331+
if (at < 0 || at > this.length) {
332+
throw new JSONException("Bad character at " + at);
333+
}
334+
return ((int) this.bytes[at]) & 0xFF;
335+
}
336+
337+
/**
338+
* Returns a hash code value for the kim.
339+
*/
340+
public int hashCode() {
341+
return this.hashcode;
342+
}
343+
344+
/**
345+
* Produce a UTF-16 String from this kim. The number of codepoints in the
346+
* string will not be greater than the number of bytes in the kim, although
347+
* it could be less.
348+
*
349+
* @return The string. A kim memoizes its string representation.
350+
* @throws JSONException
351+
* if the kim is invalid.
352+
*/
353+
public String toString() throws JSONException {
354+
if (this.string == null) {
355+
int c;
356+
int length = 0;
357+
char chars[] = new char[this.length];
358+
for (int at = 0; at < this.length; at += characterSize(c)) {
359+
c = this.characterAt(at);
360+
if (c < 0x10000) {
361+
chars[length] = (char) c;
362+
length += 1;
363+
} else {
364+
chars[length] = (char) (0xD800 | ((c - 0x10000) >>> 10));
365+
length += 1;
366+
chars[length] = (char) (0xDC00 | (c & 0x03FF));
367+
length += 1;
368+
}
369+
}
370+
this.string = new String(chars, 0, length);
371+
}
372+
return this.string;
373+
}
374+
}

0 commit comments

Comments
 (0)