From 7c619f00a6e63a9299ba4c323dab8bdc77c71a76 Mon Sep 17 00:00:00 2001 From: Antonio Nuno Monteiro Date: Sun, 22 Dec 2019 14:50:04 -0800 Subject: [PATCH] Rework the URI parser with Angstrom This diff reworks the current parser based on regexes in favor of one that uses Angstrom. It also removes the dependency on `ocaml-re`, moving the regular expressions to a new package `uri-re`. This happens to fix https://github.com/mirage/ocaml-uri/issues/21. --- lib/dune | 2 +- lib/uri.ml | 366 +++++++++++++++++++++++----------------- lib/uri.mli | 10 +- lib_re/dune | 4 + lib_re/uri_re.ml | 78 +++++++++ lib_re/uri_re.mli | 7 + lib_test/test_runner.ml | 26 +++ uri-re.opam | 27 +++ uri.opam | 2 +- 9 files changed, 361 insertions(+), 161 deletions(-) create mode 100644 lib_re/dune create mode 100644 lib_re/uri_re.ml create mode 100644 lib_re/uri_re.mli create mode 100644 uri-re.opam diff --git a/lib/dune b/lib/dune index 4974d05..d249f23 100644 --- a/lib/dune +++ b/lib/dune @@ -1,4 +1,4 @@ (library (name uri) (public_name uri) - (libraries re.posix stringext)) + (libraries stringext angstrom)) diff --git a/lib/uri.ml b/lib/uri.ml index d06f706..aab058f 100644 --- a/lib/uri.ml +++ b/lib/uri.ml @@ -18,107 +18,6 @@ [@@@ocaml.warning "-32"] -module Uri_re = struct - open Re - - module Raw = struct - let (+) a b = seq [a;b] - let (/) a b = alt [a;b] - - let gen_delims = Posix.re "[:/?#\\[\\]@]" - let sub_delims = Posix.re "[!$&'()*+,;=]" - let c_at = char '@' - let c_colon = char ':' - let c_slash = char '/' - let c_slash2 = Posix.re "//" - let c_dot = char '.' - let c_question = char '?' - let c_hash = char '#' - - let reserved = gen_delims / sub_delims - let unreserved = Posix.re "[A-Za-z0-9-._~]" - let hexdig = Posix.re "[0-9A-Fa-f]" - let pct_encoded = (char '%') + hexdig + hexdig - - let dec_octet = Posix.re "25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?" - let ipv4_address = (repn (dec_octet + c_dot) 3 (Some 3)) + dec_octet - - (* following RFC2234, RFC3986, RFC6874 and - http://people.spodhuis.org/phil.pennock/software/emit_ipv6_regexp-0.304 - *) - let zone_id = unreserved / pct_encoded - let ipv6_address = - let (=|) n a = repn a n (Some n) in - let (<|) n a = repn a 0 (Some n) in - let h16 = repn hexdig 1 (Some 4) in - let h16c = h16 + c_colon in - let cc = c_colon + c_colon in - let ls32 = (h16c + h16) / ipv4_address in - ( char '[' - + (((6=|h16c) + ls32) - / ( cc + (5=|h16c) + ls32) - / ((1<| h16) + cc + (4=|h16c) + ls32) - / ((1<|((1<|h16c) + h16)) + cc + (3=|h16c) + ls32) - / ((1<|((2<|h16c) + h16)) + cc + (2=|h16c) + ls32) - / ((1<|((3<|h16c) + h16)) + cc + h16c + ls32) - / ((1<|((4<|h16c) + h16)) + cc + ls32) - / ((1<|((5<|h16c) + h16)) + cc + h16) - / ((1<|((6<|h16c) + h16)) + cc ) - ) - + (opt (Posix.re "%25" + rep1 zone_id)) - + char ']' - ) - - let reg_name = rep ( unreserved / pct_encoded / sub_delims ) - - let host = ipv6_address / ipv4_address / reg_name (* | ipv4_literal TODO *) - let userinfo = rep (unreserved / pct_encoded / sub_delims / c_colon) - let port = Posix.re "[0-9]*" - let authority = (opt ((group userinfo) + c_at)) + (group host) + (opt (c_colon + (group port))) - let null_authority = (group empty) + (group empty) + (group empty) - - let pchar = unreserved / pct_encoded / sub_delims / c_colon / c_at - let segment = rep pchar - let segment_nz = rep1 pchar - let segment_nz_nc = repn (unreserved / pct_encoded / sub_delims / c_at) 1 None - let path_abempty = rep (c_slash + segment) - let path_absolute = c_slash + (opt (segment_nz + (rep (c_slash + segment)))) - let path_noscheme = segment_nz_nc + (rep (c_slash + segment )) - let path_rootless = segment_nz + (rep (c_slash + segment )) - let path_empty = empty - - let path = path_abempty (* begins with "/" or is empty *) - / path_absolute (* begins with "/" but not "//" *) - / path_noscheme (* begins with a non-colon segment *) - / path_rootless (* begins with a segment *) - / path_empty (* zero characters *) - - let hier_part = (c_slash2 + authority + path_abempty) - / (path_absolute / path_rootless / path_empty) - - let scheme = Posix.re "[A-Za-z][A-Za-z0-9+\\\\-\\.]*" - let query = group (rep ( pchar / c_slash / c_question)) - let fragment = group (rep (pchar / c_slash / c_question)) - - let absolute_uri = scheme + c_colon + hier_part + (opt (c_question + query)) - - let uri = scheme + c_colon + hier_part + (opt (c_question + query)) + (opt (c_hash + fragment)) - - let relative_part = (c_slash2 + authority + path_abempty) / (path_absolute / path_noscheme / path_empty) - - let relative_ref = relative_part + (opt (c_question + query)) + (opt (c_hash + fragment)) - - let uri_reference = Posix.re "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?" - end - - let ipv4_address = Posix.compile Raw.ipv4_address - let ipv6_address = Posix.compile Raw.ipv6_address - let uri_reference = Posix.compile Raw.uri_reference - let authority = Posix.compile Raw.authority - - let host = Posix.compile Raw.host -end - type component = [ | `Scheme | `Authority @@ -694,56 +593,6 @@ let make ?scheme ?userinfo ?host ?port ?path ?query ?fragment () = { scheme; userinfo; host=decode host; port; path; query; fragment=decode fragment } -(** Parse a URI string into a structure *) -let of_string s = - (* Given a series of Re substrings, cast each component - * into a Pct.encoded and return an optional type (None if - * the component is not present in the Uri *) - let get_opt_encoded s n = - try Some (Pct.cast_encoded (Re.Group.get s n)) - with Not_found -> None - in - let get_opt s n = - try - let pct = Pct.cast_encoded (Re.Group.get s n) in - Some (Pct.decode pct) - with Not_found -> None - in - let subs = Re.exec Uri_re.uri_reference s in - let scheme = get_opt subs 2 in - let userinfo, host, port = - match get_opt_encoded subs 4 with - |None -> None, None, None - |Some a -> - let subs' = Re.exec Uri_re.authority (Pct.uncast_encoded a) in - let userinfo = match get_opt_encoded subs' 1 with - | Some x -> Some (Userinfo.userinfo_of_encoded (Pct.uncast_encoded x)) - | None -> None - in - let host = get_opt subs' 2 in - let port = - match get_opt subs' 3 with - |None -> None - |Some x -> - (try - Some (int_of_string (Pct.uncast_decoded x)) - with _ -> None) - in - userinfo, host, port - in - let path = - match get_opt_encoded subs 5 with - | Some x -> Path.path_of_encoded (Pct.uncast_encoded x) - | None -> [] - in - let query = - match get_opt_encoded subs 7 with - | Some x -> Query.of_raw (Pct.uncast_encoded x) - | None -> Query.Raw (None, Lazy.from_val []) - in - let fragment = get_opt subs 9 in - normalize scheme { scheme; userinfo; host; port; path; query; fragment } - (** Convert a URI structure into a percent-encoded string *) @@ -996,5 +845,218 @@ let canonicalize uri = let pp ppf uri = Format.pp_print_string ppf (to_string uri) let pp_hum ppf uri = Format.pp_print_string ppf (to_string uri) -module Re = Uri_re +module Parser = struct + open Angstrom + + let string_of_char_list chars = + String.concat "" (List.map (String.make 1) chars) + + let string_of_char = String.make 1 + + let scheme = + lift + (fun s -> Some (Pct.decode (Pct.cast_encoded s))) + (take_while (fun c -> c <> ':' && c <> '/' && c <> '?' && c <> '#') + <* char ':') + <|> return None + + let is_digit = function '0' .. '9' -> true | _ -> false + + let hex_digit = + satisfy (function + | '0' .. '9' | 'A' .. 'F' | 'a' .. 'f' -> + true + | _ -> + false) + + let hexadecimal = lift string_of_char_list (many hex_digit) + + let c_dot = char '.' + + let c_at = char '@' + + let c_colon = char ':' + + let dec_octet = + take_while1 (function '0' .. '9' -> true | _ -> false) >>= fun num -> + if int_of_string num < 256 then + return num + else + fail "invalid octect" + + let ipv4_address = + lift2 + (fun three one -> String.concat "." three ^ "." ^ one) + (count 3 (dec_octet <* c_dot)) + dec_octet + + (* -- after double colon, IPv4 dotted notation could appear anywhere *) + let after_double_colon = + fix (fun f -> + list [ ipv4_address ] + <|> lift2 (fun x y -> x :: y) hexadecimal (c_colon *> f <|> return [])) + + let double_colon count = + after_double_colon <|> return [] >>= fun rest -> + let filler_length = 8 - count - List.length rest in + if filler_length <= 0 then + fail "too many parts in IPv6 address" + else + return ("" :: rest) + + let rec part = function + | 7 -> + (* max 8 parts in an IPv6 address *) + lift (fun x -> [ x ]) hexadecimal + | 6 -> + (* after 6 parts it could end in IPv4 dotted notation *) + list [ ipv4_address ] <|> hex_part 6 + | n -> + hex_part n + + and hex_part n = + lift2 + (fun x y -> x :: y) + hexadecimal + (c_colon *> (c_colon *> double_colon (n + 1) <|> part (n + 1))) + + let rec split_with f xs = + match xs with + | [] -> + [], [] + | y :: ys -> + if f y then + let zs, ts = split_with f ys in + y :: zs, ts + else + [], xs + + let ipv6 = + let format_addr segments = + let before_double_colon, after_double_colon = + split_with (fun segment -> segment <> "") segments + in + let before = String.concat ":" before_double_colon in + let res = + match after_double_colon with + | "" :: xs -> + before ^ "::" ^ String.concat ":" xs + | _ -> + before + in + res + in + lift format_addr (c_colon *> c_colon *> double_colon 0 <|> part 0) + + let ipv6_address = + lift3 + (fun lb ip rb -> + String.concat "" [ String.make 1 lb; ip; String.make 1 rb ]) + (char '[') + ipv6 + (char ']') + + let pct_encoded = + lift2 + (fun pct digits -> string_of_char_list (pct :: digits)) + (char '%') + (count 2 hex_digit) + + let sub_delims = + satisfy (function + | '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' -> + true + | _ -> + false) + + let unreserved = + (* "[A-Za-z0-9-._~]" *) + satisfy (function + | 'A' .. 'Z' | 'a' .. 'z' | '0' .. '9' | '-' | '.' | '_' | '~' -> + true + | _ -> + false) + + let reg_name = + lift + (String.concat "") + (many + (choice + [ string_of_char <$> unreserved + ; pct_encoded + ; string_of_char <$> sub_delims + ])) + + let host = + lift + (fun s -> Pct.decode (Pct.cast_encoded s)) + (choice + [ reg_name; ipv4_address; ipv6_address (* | ipv4_literal TODO *) ]) + + let userinfo = + lift + (fun x -> + let s = String.concat "" x in + Some (Userinfo.userinfo_of_encoded s)) + (many + (choice + [ string_of_char <$> unreserved + ; pct_encoded + ; string_of_char <$> sub_delims + ; string_of_char <$> c_colon + ]) + <* c_at) + <|> return None + + let port = + peek_char >>= function + | Some ':' -> + c_colon *> take_while is_digit >>| fun port -> + let decoded = Pct.decode (Pct.cast_encoded port) in + (try Some (int_of_string (Pct.uncast_decoded decoded)) with _ -> None) + | Some _ | None -> + return None + + let authority = + string "//" + *> lift3 + (fun userinfo host port -> userinfo, Some host, port) + userinfo + host + port + <|> return (None, None, None) + + let path = + lift + Path.path_of_encoded + (take_while (function '?' | '#' -> false | _ -> true)) + + let query = + lift + Query.of_raw + (char '?' *> take_till (function '#' -> true | _ -> false)) + <|> return (Query.Raw (None, Lazy.from_val [])) + + let fragment = + lift + (fun s -> Some (Pct.decode (Pct.cast_encoded s))) + (char '#' *> take_while (fun _ -> true)) + <|> return None + + let uri_reference = + lift4 + (fun scheme (userinfo, host, port) path query fragment -> + normalize scheme { scheme; userinfo; host; port; path; query; fragment }) + scheme + authority + path + query + <*> fragment +end +let of_string s = + match Angstrom.parse_string Parser.uri_reference s with + | Ok t -> t + | Error _ -> + (* Shouldn't really happen if the parser is forgiving. *) + empty diff --git a/lib/uri.mli b/lib/uri.mli index b929fe9..4829bb9 100644 --- a/lib/uri.mli +++ b/lib/uri.mli @@ -259,11 +259,7 @@ val pp : Format.formatter -> t -> unit [@@ocaml.toplevel_printer] (** [pp_hum] is now an alias for the {!pp} function. *) val pp_hum : Format.formatter -> t -> unit -(** Regular expressions for URI parsing. *) -module Re : sig - val ipv4_address : Re.re - val ipv6_address : Re.re - val uri_reference : Re.re - val authority : Re.re - val host : Re.re +module Parser : sig + val ipv6 : string Angstrom.t + val uri_reference : t Angstrom.t end diff --git a/lib_re/dune b/lib_re/dune new file mode 100644 index 0000000..11760b4 --- /dev/null +++ b/lib_re/dune @@ -0,0 +1,4 @@ +(library + (name uri_re) + (public_name uri-re) + (libraries re.posix stringext)) diff --git a/lib_re/uri_re.ml b/lib_re/uri_re.ml new file mode 100644 index 0000000..fd3b791 --- /dev/null +++ b/lib_re/uri_re.ml @@ -0,0 +1,78 @@ +(* + * Copyright (c) 2012-2014 Anil Madhavapeddy + * Copyright (c) 2012-2014 David Sheets + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + *) + + open Re + + module Raw = struct + let (+) a b = seq [a;b] + let (/) a b = alt [a;b] + + let sub_delims = Posix.re "[!$&'()*+,;=]" + let c_at = char '@' + let c_colon = char ':' + let c_dot = char '.' + + let unreserved = Posix.re "[A-Za-z0-9-._~]" + let hexdig = Posix.re "[0-9A-Fa-f]" + let pct_encoded = (char '%') + hexdig + hexdig + + let dec_octet = Posix.re "25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?" + let ipv4_address = (repn (dec_octet + c_dot) 3 (Some 3)) + dec_octet + + (* following RFC2234, RFC3986, RFC6874 and + http://people.spodhuis.org/phil.pennock/software/emit_ipv6_regexp-0.304 + *) + let zone_id = unreserved / pct_encoded + let ipv6_address = + let (=|) n a = repn a n (Some n) in + let (<|) n a = repn a 0 (Some n) in + let h16 = repn hexdig 1 (Some 4) in + let h16c = h16 + c_colon in + let cc = c_colon + c_colon in + let ls32 = (h16c + h16) / ipv4_address in + ( char '[' + + (((6=|h16c) + ls32) + / ( cc + (5=|h16c) + ls32) + / ((1<| h16) + cc + (4=|h16c) + ls32) + / ((1<|((1<|h16c) + h16)) + cc + (3=|h16c) + ls32) + / ((1<|((2<|h16c) + h16)) + cc + (2=|h16c) + ls32) + / ((1<|((3<|h16c) + h16)) + cc + h16c + ls32) + / ((1<|((4<|h16c) + h16)) + cc + ls32) + / ((1<|((5<|h16c) + h16)) + cc + h16) + / ((1<|((6<|h16c) + h16)) + cc ) + ) + + (opt (Posix.re "%25" + rep1 zone_id)) + + char ']' + ) + + let reg_name = rep ( unreserved / pct_encoded / sub_delims ) + + let host = ipv6_address / ipv4_address / reg_name (* | ipv4_literal TODO *) + let userinfo = rep (unreserved / pct_encoded / sub_delims / c_colon) + let port = Posix.re "[0-9]*" + let authority = (opt ((group userinfo) + c_at)) + (group host) + (opt (c_colon + (group port))) + + let uri_reference = Posix.re "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?" + end + + let ipv4_address = Posix.compile Raw.ipv4_address + let ipv6_address = Posix.compile Raw.ipv6_address + let uri_reference = Posix.compile Raw.uri_reference + let authority = Posix.compile Raw.authority + + let host = Posix.compile Raw.host diff --git a/lib_re/uri_re.mli b/lib_re/uri_re.mli new file mode 100644 index 0000000..1a4e989 --- /dev/null +++ b/lib_re/uri_re.mli @@ -0,0 +1,7 @@ + +(** Regular expressions for URI parsing. *) +val ipv4_address : Re.re +val ipv6_address : Re.re +val uri_reference : Re.re +val authority : Re.re +val host : Re.re diff --git a/lib_test/test_runner.ml b/lib_test/test_runner.ml index 23f8de4..b593a3c 100644 --- a/lib_test/test_runner.ml +++ b/lib_test/test_runner.ml @@ -649,6 +649,31 @@ let test_with_uri = ) ) (List.map (fun (i, o) -> Uri.to_string i, o) with_uri) +let ipv6_addresses = + ["::", "::" + ; "::1", "::1" + ;"fe02::1","fe02::1" + ;"::ffff:192.0.2.1", "::ffff:192.0.2.1" + ;"2001:DB8::42","2001:DB8::42" + ;"2001:DB8:1234:5678:90ab:cdef:0123:4567","2001:DB8:1234:5678:90ab:cdef:0123:4567" + ;"2001:DB8:1234:5678:90ab:cdef:0123::","2001:DB8:1234:5678:90ab:cdef:0123::" + ;"2001:DB8:1234:5678:90ab:cdef::0123","2001:DB8:1234:5678:90ab:cdef::0123" + ;"2001:DB8:1234:5678:90ab:cdef:192.0.2.1","2001:DB8:1234:5678:90ab:cdef:192.0.2.1" + ;"2001:DB8:1234:5678:90ab:cdef:192.0.2.1","2001:DB8:1234:5678:90ab:cdef:192.0.2.1" + ] + +let test_ipv6_parsing = + List.map (fun (input, expected) -> + let name = sprintf "ipv6:%s" input in + let test () = + match Angstrom.parse_string Uri.Parser.ipv6 input with + | Ok parsed -> + assert_equal ~printer:(fun x -> x) expected parsed + | Error msg -> assert_failure msg + in + name >:: test + ) ipv6_addresses + (* Returns true if the result list contains successes only. Copied from oUnit source as it isnt exposed by the mli *) let rec was_successful = @@ -682,6 +707,7 @@ let _ = @ test_with_change @ test_canonicalize @ test_with_uri + @ test_ipv6_parsing ) in let verbose = ref false in let set_verbose _ = verbose := true in diff --git a/uri-re.opam b/uri-re.opam new file mode 100644 index 0000000..7fee90b --- /dev/null +++ b/uri-re.opam @@ -0,0 +1,27 @@ +opam-version: "2.0" +maintainer: "anil@recoil.org" +authors: ["Anil Madhavapeddy" "David Sheets" "Rudi Grinberg"] +license: "ISC" +tags: ["url" "uri" "org:mirage" "org:xapi-project"] +homepage: "https://github.com/mirage/ocaml-uri" +bug-reports: "https://github.com/mirage/ocaml-uri/issues" +dev-repo: "git+https://github.com/mirage/ocaml-uri.git" +doc: "https://mirage.github.io/ocaml-uri/" +synopsis: "An RFC3986 URI/URL parsing library" +description: """ +This is an OCaml implementation of the [RFC3986](http://tools.ietf.org/html/rfc3986) specification +for parsing URI or URLs. +""" +depends: [ + "ocaml" {>= "4.04.0"} + "dune" {>= "1.2.0"} + "ounit" {with-test & >= "1.0.2"} + "ppx_sexp_conv" {with-test & >= "v0.9.0"} + "re" {>= "1.9.0"} + "stringext" {>= "1.4.0"} +] +build: [ + ["dune" "subst"] {pinned} + ["dune" "build" "-p" name "-j" jobs] + ["dune" "runtest" "-p" name "-j" jobs] {with-test} +] diff --git a/uri.opam b/uri.opam index cdb9168..23ae70f 100644 --- a/uri.opam +++ b/uri.opam @@ -17,8 +17,8 @@ depends: [ "dune" {>= "1.2.0"} "ounit" {with-test & >= "1.0.2"} "ppx_sexp_conv" {with-test & >= "v0.9.0"} - "re" {>= "1.9.0"} "stringext" {>= "1.4.0"} + "angstrom" ] build: [ ["dune" "subst"] {pinned}