Skip to content

Uri encode / decode #1733

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 93 commits into from
Apr 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
93 commits
Select commit Hold shift + click to select a range
4f7bc3d
init (wip)
MaxGraey Nov 16, 2018
b0fbf21
wip
MaxGraey Nov 17, 2018
15a9e91
Merge branch 'master' into uri-encodes
MaxGraey Aug 21, 2020
82da348
refactor
MaxGraey Aug 21, 2020
f8308f8
more
MaxGraey Aug 21, 2020
68e8cc9
wip
MaxGraey Aug 21, 2020
2f6a92c
wip
MaxGraey Aug 21, 2020
76f3ac8
more
MaxGraey Aug 21, 2020
0f99ff1
refactor
MaxGraey Aug 21, 2020
d403720
more
MaxGraey Aug 22, 2020
890e65d
Merge branch 'master' into uri-encodes
MaxGraey Mar 16, 2021
ac22e36
remove some verbosity
MaxGraey Mar 16, 2021
8a30361
update heap methods
MaxGraey Mar 16, 2021
342da70
fix
MaxGraey Mar 16, 2021
7051dc4
add more refs
MaxGraey Mar 16, 2021
514a4c7
simplify
MaxGraey Mar 17, 2021
6b01c7d
store utf8 code points
MaxGraey Mar 17, 2021
bf5e656
simplify names
MaxGraey Mar 17, 2021
6f0d5fa
refactor
MaxGraey Mar 17, 2021
9abda8f
more
MaxGraey Mar 17, 2021
651702f
fixes
MaxGraey Mar 17, 2021
f6c5051
refactors
MaxGraey Mar 17, 2021
2408cf2
progress
MaxGraey Mar 17, 2021
1e4eed8
refactor
MaxGraey Mar 17, 2021
2810119
more
MaxGraey Mar 17, 2021
92ba20e
fix leaks
MaxGraey Mar 19, 2021
27f881b
wip
MaxGraey Mar 20, 2021
ece370c
wip
MaxGraey Mar 26, 2021
e603cd6
wip
MaxGraey Mar 26, 2021
c551f94
fixes (wip)
MaxGraey Mar 26, 2021
4c0555f
more (wip)
MaxGraey Mar 26, 2021
a0c49f2
more (wip)
MaxGraey Mar 26, 2021
6c13459
better size estimation
MaxGraey Mar 26, 2021
d011a38
add URIError
MaxGraey Mar 26, 2021
163ff90
wip
MaxGraey Mar 26, 2021
44b7705
minor opt
MaxGraey Mar 26, 2021
632db8a
refactor
MaxGraey Mar 26, 2021
aa26e2a
minor opt
MaxGraey Mar 26, 2021
4c812f3
refactoring
MaxGraey Mar 26, 2021
20e2065
better
MaxGraey Mar 26, 2021
b1fe142
more tests
MaxGraey Mar 26, 2021
67c1b91
fix
MaxGraey Mar 26, 2021
1ed516d
more tests
MaxGraey Mar 26, 2021
cbd5ea4
invert tables for better memory packing
MaxGraey Mar 26, 2021
953c43b
even more smaller
MaxGraey Mar 26, 2021
2174b69
more detailed comments
MaxGraey Mar 26, 2021
3a4cdd4
add encodeURI tests
MaxGraey Mar 26, 2021
4938263
wip
MaxGraey Mar 26, 2021
d8814fe
fixes (wip)
MaxGraey Mar 26, 2021
a350c00
fixes (wip)
MaxGraey Mar 26, 2021
43f90cb
opt (experimental)
MaxGraey Mar 26, 2021
16cfeb3
refactor
MaxGraey Mar 26, 2021
5b9ba29
more tests
MaxGraey Mar 26, 2021
54fa6dd
fix
MaxGraey Mar 26, 2021
f6a9c89
wip
MaxGraey Mar 27, 2021
8204e0e
refactor
MaxGraey Mar 27, 2021
62d0fef
fix
MaxGraey Mar 27, 2021
c94869e
add assert
MaxGraey Mar 27, 2021
4efd052
opt
MaxGraey Mar 27, 2021
96d413e
refactoring
MaxGraey Mar 27, 2021
8050ce7
more
MaxGraey Mar 27, 2021
4b4fdc8
optimize utf8 byte count
MaxGraey Mar 27, 2021
69a45b9
opt utf8_len table
MaxGraey Mar 27, 2021
29ed2a8
refactor
MaxGraey Mar 27, 2021
f599d38
comment
MaxGraey Mar 27, 2021
ed44562
more tests
MaxGraey Mar 27, 2021
00561ee
opt
MaxGraey Mar 27, 2021
9475f6b
opt more
MaxGraey Mar 27, 2021
abec3f0
fix
MaxGraey Mar 27, 2021
3f3c1b8
more
MaxGraey Mar 27, 2021
62e0ee2
fix
MaxGraey Mar 27, 2021
b4da6ce
better comments
MaxGraey Mar 27, 2021
620e5d2
more opts
MaxGraey Mar 27, 2021
afa90fd
revert more precise hex char checks
MaxGraey Mar 27, 2021
daa7296
refactor
MaxGraey Mar 27, 2021
a42b0c8
more tests
MaxGraey Mar 27, 2021
e00b7e4
align shinked unicode range for 4 bytes range with table
MaxGraey Mar 28, 2021
b3952b0
simplify utf8LenFromUpperByte
MaxGraey Mar 28, 2021
81e5a13
better comments
MaxGraey Mar 28, 2021
3250ef8
refactor
MaxGraey Mar 28, 2021
71a40ce
Merge branch 'master' into uri-encodes
MaxGraey Mar 30, 2021
91be0b2
refactoring
MaxGraey Mar 30, 2021
3ca72e4
more
MaxGraey Mar 30, 2021
ed60c38
Merge branch 'master' into uri-encodes
MaxGraey Apr 1, 2021
1132f6c
remove fast pathes
MaxGraey Apr 1, 2021
94041a5
Merge branch 'master' into uri-encodes
MaxGraey Apr 16, 2021
b5ef31b
update fixture
MaxGraey Apr 16, 2021
0c5a7bb
Merge branch 'master' into uri-encodes
MaxGraey Apr 17, 2021
e23fa80
upd fixture
MaxGraey Apr 17, 2021
48a032a
more comments
MaxGraey Apr 17, 2021
d10f016
fix typos
MaxGraey Apr 17, 2021
1e78551
more typos
MaxGraey Apr 17, 2021
a544405
fix
MaxGraey Apr 17, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions std/assembly/error.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,10 @@ export class SyntaxError extends Error {
this.name = "SyntaxError";
}
}

/**
 * Error type raised by the global URI handling functions
 * when they are given input they cannot process.
 */
export class URIError extends Error {
  /**
   * @param message Human readable description of the failure; defaults to an empty string.
   */
  constructor(message: string = "") {
    super(message);
    // Override the name set by the base class so the error identifies itself.
    this.name = "URIError";
  }
}
11 changes: 11 additions & 0 deletions std/assembly/index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,14 @@ declare function fmod(x: f64, y: f64): f64;
declare function fmodf(x: f32, y: f32): f32;
/** Returns the number of parameters in the given function signature type. */
declare function lengthof<T extends (...args: any[]) => any>(func?: T): i32;
/** Encodes a text string as a valid Uniform Resource Identifier (URI). Throws a `URIError` if `str` is not well-formed UTF-16 (e.g. contains a lone low surrogate). */
declare function encodeURI(str: string): string;
/** Encodes a text string as a valid component of a Uniform Resource Identifier (URI). Throws a `URIError` if `str` is not well-formed UTF-16 (e.g. contains a lone low surrogate). */
declare function encodeURIComponent(str: string): string;
/** Decodes a Uniform Resource Identifier (URI) previously created by encodeURI. Escape sequences of the reserved characters `;/?:@&=+$,#` are kept escaped. Throws a `URIError` on a malformed escape sequence or invalid UTF-8. */
declare function decodeURI(str: string): string;
/** Decodes a Uniform Resource Identifier (URI) component previously created by encodeURIComponent. Throws a `URIError` on a malformed escape sequence or invalid UTF-8. */
declare function decodeURIComponent(str: string): string;

/** Atomic operations. */
declare namespace atomic {
Expand Down Expand Up @@ -1771,6 +1779,9 @@ declare class TypeError extends Error { }
/** Class for indicating an error when trying to interpret syntactically invalid code. */
declare class SyntaxError extends Error { }

/** Class for indicating an error when a global URI handling function was used in a wrong way. */
declare class URIError extends Error { }

interface Boolean {
toString(radix?: number): string;
}
Expand Down
17 changes: 17 additions & 0 deletions std/assembly/uri.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import { encode, decode, URI_UNSAFE, URL_UNSAFE } from "./util/uri";

// Percent-encodes `str` using the URI_UNSAFE table, i.e. the set of
// characters that must be escaped inside a complete URI.
export function encodeURI(str: string): string {
  let input = changetype<usize>(str);
  let output = encode(input, str.length, URI_UNSAFE);
  return changetype<string>(output);
}

// Decodes percent-escapes in `str` in non-component mode, in which
// escapes of reserved characters are left untouched by `decode`.
export function decodeURI(str: string): string {
  let input = changetype<usize>(str);
  let output = decode(input, str.length, false);
  return changetype<string>(output);
}

// Percent-encodes `str` using the URL_UNSAFE table, i.e. the stricter set
// of characters that must be escaped inside a single URI component.
export function encodeURIComponent(str: string): string {
  let input = changetype<usize>(str);
  let output = encode(input, str.length, URL_UNSAFE);
  return changetype<string>(output);
}

// Decodes percent-escapes in `str` in component mode, in which
// every valid escape sequence is decoded, reserved characters included.
export function decodeURIComponent(str: string): string {
  let input = changetype<usize>(str);
  let output = decode(input, str.length, true);
  return changetype<string>(output);
}
4 changes: 4 additions & 0 deletions std/assembly/util/error.ts
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,7 @@ export const E_ALREADY_PINNED: string = "Object already pinned";
// @ts-ignore: decorator
@lazy @inline
export const E_NOT_PINNED: string = "Object is not pinned";

// Message of the URIError thrown by the URI encode / decode helpers
// when they encounter malformed input.
// @ts-ignore: decorator
@lazy @inline
export const E_URI_MALFORMED: string = "URI malformed";
2 changes: 2 additions & 0 deletions std/assembly/util/string.ts
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,7 @@ import { ipow32 } from "../math";
// @ts-ignore: decorator
@inline
export const enum CharCode {
PERCENT = 0x25,
PLUS = 0x2B,
MINUS = 0x2D,
DOT = 0x2E,
Expand All @@ -484,6 +485,7 @@ export const enum CharCode {
e = 0x65,
n = 0x6E,
o = 0x6F,
u = 0x75,
x = 0x78,
z = 0x7A
}
Expand Down
276 changes: 276 additions & 0 deletions std/assembly/util/uri.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,276 @@
import { E_URI_MALFORMED } from "./error";
import { CharCode } from "./string";

// Truncated boolean lookup table used to quickly determine whether
// a char needs to be escaped for URIs (RFC 2396); used by `encodeURI`.
// Entry `k` describes char code `k + 33`, so the 94 entries cover the
// codes 33..126. `encode` treats every code outside of that range as
// unsafe without consulting the table. 1 = must be escaped, 0 = copy as is.
// @ts-ignore: decorator
@lazy export const URI_UNSAFE = memory.data<u8>([
/* skip 32 + 1 always set to '1' head slots
*/ 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, /*
skip 128 + 1 always set to '1' tail slots */
]);

// Truncated boolean lookup table used to quickly determine whether
// a char needs to be escaped for URLs (RFC 3986); used by `encodeURIComponent`.
// Entry `k` describes char code `k + 33`, so the 94 entries cover the
// codes 33..126. `encode` treats every code outside of that range as
// unsafe without consulting the table. 1 = must be escaped, 0 = copy as is.
// @ts-ignore: decorator
@lazy export const URL_UNSAFE = memory.data<u8>([
/* skip 32 + 1 always set to '1' head slots
*/ 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, /*
skip 128 + 1 always set to '1' tail slots */
]);

// Truncated boolean lookup table for determining reserved chars: ;/?:@&=+$,#
// Entry `k` describes char code `k + 35`, so the 30 entries cover the
// codes 35..64 ('#'..'@'). `isReserved` reports every code outside of
// that range as not reserved without consulting the table.
// @ts-ignore: decorator
@lazy export const URI_RESERVED = memory.data<u8>([
/* skip 32 + 3 always set to '0' head slots
*/ 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
1, /* skip 191 always set to '0' tail slots */
]);

// Percent-encodes a UTF-16 string.
//
// `src`   - pointer to the UTF-16 code units of the input.
// `len`   - input length in UTF-16 code units.
// `table` - pointer to a 94 entry lookup table (URI_UNSAFE / URL_UNSAFE layout)
//           telling which of the char codes 33..126 must be escaped.
//
// Returns a pointer to a newly allocated string holding the encoded text,
// or `src` itself when the input is empty.
// Throws URIError if the input contains an unpaired surrogate.
export function encode(src: usize, len: usize, table: usize): usize {
  if (!len) return src;

  // `offset` and `outSize` are byte sizes, `i` counts UTF-16 code units.
  var i: usize = 0, offset: usize = 0, outSize = len << 1;
  var dst = __new(outSize, idof<String>());

  while (i < len) {
    let org = i;
    let c: u32, c1: u32;
    // Fast scan: advance while chars are ASCII
    // and safe to copy without escaping.
    do {
      c = <u32>load<u16>(src + (i << 1));
      // is it in the table range (33..126) and marked as safe?
      if (c - 33 < 94) { // 127 - 33
        if (load<u8>(table + (c - 33))) break;
      } else break;
    } while (++i < len);

    // if we found a run of safe chars just copy it without encoding
    if (i > org) {
      let size = i - org << 1;
      if (offset + size > outSize) {
        outSize = offset + size;
        dst = __renew(dst, outSize);
      }
      // TODO: should we optimize for short cases like 2 byte size?
      memory.copy(
        dst + offset,
        src + (org << 1),
        size
      );
      offset += size;
      // done if we reached the end of the input string
      if (i >= len) break;
    }

    // decode UTF-16 while checking for unpaired surrogates
    if (c >= 0xD800) {
      if (c >= 0xDC00 && c <= 0xDFFF) {
        // low surrogate without a preceding high surrogate
        throw new URIError(E_URI_MALFORMED);
      }
      if (c <= 0xDBFF) {
        // A high surrogate must be followed by a low surrogate that is still
        // part of the input. Advance first and bail out before loading, so a
        // trailing high surrogate never causes a read past the end of `src`.
        if (++i >= len) {
          throw new URIError(E_URI_MALFORMED);
        }
        c1 = <u32>load<u16>(src + (i << 1));
        if (c1 < 0xDC00 || c1 > 0xDFFF) {
          throw new URIError(E_URI_MALFORMED);
        }
        c = (((c & 0x3FF) << 10) | (c1 & 0x3FF)) + 0x10000;
      }
    }

    // Each escaped byte takes 3 UTF-16 code units (6 bytes): one escape for
    // ASCII, up to four for any other code point.
    let estSize = offset + (c < 0x80 ? 1 * 6 : 4 * 6);
    if (estSize > outSize) {
      // Double the estimated size, but only for inputs longer than one
      // code unit, since for those the estimate already is the worst case.
      outSize = len > 1 ? estSize << 1 : estSize;
      dst = __renew(dst, outSize);
    }

    if (c < 0x80) {
      // encode unsafe ASCII code point
      storeHex(dst, offset, c);
      offset += 6;
    } else {
      // encode the code point as escaped UTF-8 bytes
      if (c < 0x800) {
        storeHex(dst, offset, (c >> 6) | 0xC0);
        offset += 6;
      } else {
        if (c < 0x10000) {
          storeHex(dst, offset, (c >> 12) | 0xE0);
          offset += 6;
        } else {
          storeHex(dst, offset, (c >> 18) | 0xF0);
          offset += 6;
          storeHex(dst, offset, (c >> 12 & 0x3F) | 0x80);
          offset += 6;
        }
        storeHex(dst, offset, (c >> 6 & 0x3F) | 0x80);
        offset += 6;
      }
      storeHex(dst, offset, (c & 0x3F) | 0x80);
      offset += 6;
    }
    ++i;
  }
  // shrink the output string buffer if necessary
  if (outSize > offset) {
    dst = __renew(dst, offset);
  }
  return dst;
}

// Decodes the percent-escapes of a UTF-16 string.
//
// `src`       - pointer to the UTF-16 code units of the input.
// `len`       - input length in UTF-16 code units.
// `component` - when false, escapes that decode to a reserved char
//               (see `isReserved`) are kept in their escaped form.
//
// Returns a pointer to a newly allocated string holding the decoded text,
// or `src` itself when the input is empty.
// Throws URIError on a truncated or non-hex escape sequence and on escaped
// bytes that do not form a valid UTF-8 encoded code point.
export function decode(src: usize, len: usize, component: bool): usize {
  if (!len) return src;

  // `offset` is a byte offset into `dst`, `i` counts UTF-16 code units.
  var i: usize = 0, offset: usize = 0, ch: u32 = 0;
  // Decoding never grows the text, so the input size is an upper bound.
  var dst = __new(len << 1, idof<String>());

  while (i < len) {
    let org = i;
    // skip ahead to the next '%'
    while (i < len && (ch = load<u16>(src + (i << 1))) != CharCode.PERCENT) i++;

    // copy the run of unescaped chars as is
    if (i > org) {
      let size = i - org << 1;
      // TODO: should we optimize for short cases like 2 byte size?
      memory.copy(
        dst + offset,
        src + (org << 1),
        size
      );
      offset += size;
      if (i >= len) break;
    }

    // decode hex: requires '%' followed by two hex digits inside the input
    if (
      i + 2 >= len ||
      ch != CharCode.PERCENT ||
      (ch = loadHex(src, i + 1 << 1)) == -1
    ) throw new URIError(E_URI_MALFORMED);

    i += 3;
    if (ch < 0x80) {
      if (!component && isReserved(ch)) {
        // Keep the escape: emit '%' now and rewind so that the two hex
        // digits are copied verbatim by the next iteration.
        ch = CharCode.PERCENT;
        i -= 2;
      }
    } else {
      // decode UTF-8 sequence
      let nb = utf8LenFromUpperByte(ch);
      // A continuation byte or an out of range byte cannot start a sequence.
      // Reject it right away instead of relying on `--nb` wrapping around.
      if (!nb) throw new URIError(E_URI_MALFORMED);
      // smallest code point allowed for this length (rejects overlong forms):
      // 2 => 0x80, 3 => 0x800, 4 => 0x10000
      let minCodePoint: u32 = 1 << (17 * nb >> 2) - 1;
      // payload mask of the lead byte: 2 => 31, 3 => 15, 4 => 7
      ch &= (0x80 >> nb) - 1;

      while (--nb != 0) {
        let c1: u32;
        // decode hex of the next escaped byte
        if (
          i + 2 >= len ||
          load<u16>(src + (i << 1)) != CharCode.PERCENT ||
          (c1 = loadHex(src, i + 1 << 1)) == -1
        ) throw new URIError(E_URI_MALFORMED);

        i += 3;
        if ((c1 & 0xC0) != 0x80) {
          // not a continuation byte: force the range check below to fail
          ch = 0;
          break;
        }
        ch = (ch << 6) | (c1 & 0x3F);
      }

      // check that the decoded code point can be represented as valid UTF-16
      if (ch < minCodePoint || ch > 0x10FFFF || (ch >= 0xD800 && ch < 0xE000)) {
        throw new URIError(E_URI_MALFORMED);
      }

      // encode as a UTF-16 surrogate pair
      if (ch >= 0x10000) {
        ch -= 0x10000;
        let highSurrogate = ch >> 10 | 0xD800;
        let lowSurrogate = (ch & 0x03FF) | 0xDC00;
        // write both code units at once, high surrogate first in memory
        store<u32>(dst + offset, highSurrogate | (lowSurrogate << 16));
        offset += 4;
        continue;
      }
    }
    store<u16>(dst + offset, ch);
    offset += 2;
  }

  assert(offset <= (len << 1));
  // shrink the output string buffer if necessary
  if ((len << 1) > offset) {
    dst = __renew(dst, offset);
  }
  return dst;
}

// Writes the escape sequence "%XX" for the low byte of `ch` as three
// UTF-16 code units (6 bytes) to `dst + offset`, using upper-case hex digits.
function storeHex(dst: usize, offset: usize, ch: u32): void {
  // @ts-ignore: decorator
  const HEX_CHARS = memory.data<u8>([
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46
  ]);

  let target = dst + offset;
  let upperDigit = <u32>load<u8>(HEX_CHARS + (ch >> 4 & 0x0F));
  let lowerDigit = <u32>load<u8>(HEX_CHARS + (ch & 0x0F));

  // '%'
  store<u16>(target, CharCode.PERCENT, 0);
  // both hex digits with a single 32-bit store right behind the '%'
  store<u32>(target, upperDigit | lowerDigit << 16, 2);
}

// Reads two UTF-16 code units at `src + offset` and interprets them as a
// two digit hex number. Returns the byte value, or -1 (all bits set) if
// either code unit is not a hex digit.
function loadHex(src: usize, offset: usize): u32 {
  let ptr = src + offset;
  let upper = <u32>load<u16>(ptr, 0);
  let lower = <u32>load<u16>(ptr, 2);
  if (!isHex(upper) || !isHex(lower)) return -1;
  return fromHex(upper) << 4 | fromHex(lower);
}

// Converts a single hex digit char code to its numeric value 0..15.
// The result is only meaningful for char codes accepted by `isHex`.
// @ts-ignore: decorator
@inline function fromHex(ch: u32): u32 {
  // fold upper-case letters onto lower-case ones; digits are unaffected
  let folded = ch | 32;
  // '0'..'9' map to 9..18 and 'a'..'f' to 19..24 under modulo 39
  return folded % 39 - 9;
}

// Returns the length in bytes of the UTF-8 sequence introduced by the
// lead byte `c0`, or 0 if `c0` is not a valid multi-byte lead byte.
// @ts-ignore: decorator
@inline function utf8LenFromUpperByte(c0: u32): u32 {
  // Equivalent to
  //   0xC0..0xDF => 2
  //   0xE0..0xEF => 3
  //   0xF0..0xF7 => 4
  //   otherwise  => 0
  if (c0 - 0xC0 >= 56) return 0;
  // within 0xC0..0xF7 the count of leading one bits is the sequence length
  return clz(~(c0 << 24));
}

// Tells whether `ch` is flagged as reserved in the URI_RESERVED table,
// which covers the char codes 35..64; everything else is not reserved.
// @ts-ignore: decorator
@inline function isReserved(ch: u32): bool {
  let slot = ch - 35;
  // unsigned compare also rejects codes below 35 (they wrap around)
  if (slot >= 30) return false;
  return <bool>load<u8>(URI_RESERVED + slot);
}

// Tells whether `ch` is the char code of a hex digit: 0-9, a-f or A-F.
// @ts-ignore: decorator
@inline function isHex(ch: u32): bool {
  let isDecimalDigit = ch - CharCode._0 < 10;
  // `| 32` folds upper-case letters onto lower-case ones
  let isHexLetter = (ch | 32) - CharCode.a < 6;
  // @ts-ignore
  return isDecimalDigit | isHexLetter;
}
5 changes: 5 additions & 0 deletions tests/compiler/std/uri.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"asc_flags": [
],
"asc_rtrace": true
}
Loading