PPX for Working with Regular Expressions

Made possible by forking ppx_regexp.
Our upstream contributions to ppx_regexp come from another repo.

PPX for Working with Regular Expressions

This repository provides a PPX for regular expression-based routing:

ppx_mikmatch maps to the re library, with the conventional last-match extraction into string and string option.

This syntax extension turns:

function%mikmatch
| {| re1 |} -> e1
...
| {| reN |} -> eN
| _ -> e0

into suitable invocations of the Re library, and similarly for match%mikmatch.

It also accepts:

let%mikmatch var = {| some regex |}

to define reusable patterns, and much more.

Full usage guide

ppx_mikmatch guide.

Quick Links

Motivational Examples

URL parsing:

(* Hand-written URL parsing, shown for comparison with the pattern-based
   version. The scheme is decided from the character at index 4, and [first]
   is the matching offset: 7 is the length of "http://", 8 of "https://". *)
let parse s =
  let (scheme, first) =
    (* index 4 is ':' in "http:" and 's' in "https" *)
    match s.[4] with
    | ':' -> `Http, 7
    | 's' -> `Https, 8
    | _ -> failwith "parse"
  in
  (* the host[:port] part runs from [first] up to the next '/' *)
  let last = String.index_from s first '/' in
  let host = String.slice s ~first ~last in
  let (host,port) =
    (* NOTE(review): presumably Stre.splitc raises when ':' is absent, in
       which case the scheme's default port is used — confirm *)
    match Stre.splitc host ':' with
    | exception _ -> host, default_port scheme
    | (host,port) -> host, int_of_string port
  in
  (* remainder of the function is elided in this example *)
  ...

(* in mikmatch: *)

(* The same parser written as a single pattern: [https], [host], [port] and
   [rest] are bound directly by the regular expression. *)
let parse s =
  match%mikmatch s with
  | {|/ "http" ('s' as https)? "://" ([^ '/' ':']+ as host) (":" (digit+ as port : int))? '/'? (_* as rest) /|} ->
      (* [https] comes from an optional group, so it is matched as an option *)
      let scheme = match https with Some _ -> `Https | None -> `Http in
      (* fall back to the scheme's default port when no ":port" was captured *)
      let port = match port with Some p -> p | None -> default_port scheme in
      (* remainder of the branch is elided in this example *)
      ...
  | _ -> failwith "parse"

(* Compiled Re2 pattern with five capture groups: timestamp, optional shard,
   origin, partition and worker. The accepted origin names are spliced into
   the pattern as an alternation. *)
let rex =
  (* the local binding needs [in] before the expression that uses it *)
  let origins = "csv|pdf|html|xlsv|xml" in
  Re2.create_exn (sprintf {|^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)(?:\.(\d+))?\.(%s)\.(\d+)\.(\d+)$|} origins)

(* Parses [s] against [rex] and builds the record from its capture groups.
   Any failure (no match, non-numeric field, unknown origin, bad timestamp)
   is reported as [Invalid_argument]. *)
let of_string s =
  try
    let m = Re2.first_match_exn rex s in
    (* mandatory capture groups: absence is an error *)
    let group i = Re2.Match.get_exn ~sub:(`Index i) m in
    let start = group 1 |> U.strptime "%Y-%m-%dT%H:%M:%S%z" |> U.timegm in
    (* group 2 is optional in [rex]; default to shard 0 when it is absent,
       as the mikmatch version of this function does, instead of rejecting
       the input *)
    let shard =
      match Re2.Match.get ~sub:(`Index 2) m with
      | Some digits -> int_of_string digits
      | None -> 0
    in
    let origin = origin_of_string (group 3) in
    let partition = int_of_string (group 4) in
    let worker = int_of_string (group 5) in
    { start; shard; origin; partition; worker }
  with _ -> invalid_arg (sprintf "error: %s" s)

(* in mikmatch: *)

(* Reusable named sub-pattern: an alternation of the accepted origin names. *)
let%mikmatch origins = {| "csv" | "pdf" | "html" | "xlsv" | "xml" |}

(* Parses [s] with a single pattern. Captures are converted in the pattern
   itself ([: int], [:= origin_of_string]); only the timestamp and the
   optional shard need post-processing. Input that does not match raises
   [Invalid_argument]. *)
let of_string s =
  match%mikmatch s with
  | {|/ (digit{4} '-' digit{2} '-' digit{2} 'T' digit{2} ':' digit{2} ':' digit{2} 'Z' as timestamp)
      ('.' (digit+ as shard : int))? 
      '.' (origins as origin := origin_of_string)
      '.' (digit+ as partition : int)
      '.' (digit+ as worker : int) /|} ->
      (* the shard group is optional; a missing shard counts as 0 *)
      let shard =
        match shard with
        | None -> 0
        | Some n -> n
      in
      let start = timestamp |> U.strptime "%Y-%m-%dT%H:%M:%S%z" |> U.timegm in
      { start; shard; origin; partition; worker }
  | _ -> invalid_arg (sprintf "error: %s" s)

Performance Considerations

The different syntax extensions behave differently:

match%mikmatch will compile all branches into suitable groups. The group creation follows the invariant:

If the regexes in the current group:
1. do not have pattern guards, then if the current regex:
  1. is pattern guardless as well, it can belong to the same group
  2. has a pattern guard, it starts a new group
2. have pattern guards, then if the current regex:
  1. has the same RE and flags, it can belong to the same group
  2. doesn't have the same RE and flags, it starts a new group
Each group is compiled as a single regex using alternations and tried against the input string.
The general extension defined here compiles each branch into a separate regex, so it is less efficient than match%mikmatch.

When compared to mikmatch or Re2 using Match.get_exn, ppx_mikmatch is considerably faster, as the other tools take the same approach as the general extension.

A comparison:

(* the regular expressions used here are direct equivalents of the branches in the mikmatch-like functions below *)
(* Re2 version: each header pattern is tried in turn, moving on to the next
   one when [Re2.first_match] returns [Error _]. *)
let extract_httpheader_re2 s =
  match Re2.first_match content_encoding_re s with
  | Ok mtch ->
    (* capture group 1 is passed through [strip] and lowercased *)
    let v = Re2.Match.get_exn ~sub:(`Index 1) mtch in
    `ContentEncoding (String.lowercase_ascii @@ strip v)
  | Error _ ->
  (* intermediate header patterns are elided in this example *)
  ...
  match Re2.first_match link_re s with
  | Ok mtch ->
    let url = Re2.Match.get_exn ~sub:(`Index 1) mtch in
    let rest = Re2.Match.get_exn ~sub:(`Index 2) mtch in
    `Link (url, String.lowercase_ascii @@ strip rest)
  | Error _ -> `Other
(* NOTE(review): this [end] presumably closes a [begin] in the elided part — confirm *)
end

(* mikmatch version: classifies one HTTP header line [s] by the first
   pattern that matches and returns a polymorphic variant carrying the
   stripped header value; anything unrecognised is [`Other]. *)
let extract_httpheader_mikmatch s =
  match s with
  | / "content-encoding:"~ ' '* (_* as v) "\r\n"? eos / -> `ContentEncoding (String.lowercase_ascii @@ strip v)
  | / "content-type:"~ ' '* (_* as v) "\r\n"? eos / -> `ContentType (String.lowercase_ascii @@ strip v)
  | / "last-modified:"~ ' '* (_* as v) "\r\n"? eos / -> `LastModified (strip v)
  | / "content-length:"~ ' '* (_* as v) "\r\n"? eos / -> `ContentLength (strip v)
  | / "etag:"~ ' '* (_* as v) "\r\n"? eos / -> `ETag (strip v)
  | / "server:"~ ' '* (_* as v) "\r\n"? eos / -> `Server (strip v)
  | / "x-robots-tag:"~ ' '* (_* as v) "\r\n"? eos / -> `XRobotsTag (strip v)
  | / "location:"~ ' '* (_* as v) "\r\n"? eos / -> `Location (strip v)
  | / "link:"~ ' '* '<' (re_link_url as url) '>' ' '* ';' (_* as rest) "\r\n"? eos / -> `Link (url, String.lowercase_ascii @@ strip rest)
  (* must be a polymorphic variant like every other branch; a string literal
     here would not type-check *)
  | _ -> `Other

(* ppx_mikmatch version: classifies one HTTP header line [s] by the first
   pattern that matches. Captured values are passed through [String.strip]
   inside the pattern via [:=]; anything unrecognised is [`Other]. *)
let extract_httpheader_ppx_mikmatch s =
  match%mikmatch s with
  | {| "content-encoding:"~ ' '* (_* as v := String.strip) "\r\n"? |} -> `ContentEncoding (String.lowercase_ascii v)
  | {| "content-type:"~ ' '* (_* as v := String.strip) "\r\n"? |} -> `ContentType (String.lowercase_ascii v)
  | {| "last-modified:"~ ' '* (_* as v := String.strip) "\r\n"? |} -> `LastModified v
  | {| "content-length:"~ ' '* (_* as v := String.strip) "\r\n"? |} -> `ContentLength v
  | {| "etag:"~ ' '* (_* as v := String.strip) "\r\n"? |} -> `ETag v
  | {| "server:"~ ' '* (_* as v := String.strip) "\r\n"? |} -> `Server v
  | {| "x-robots-tag:"~ ' '* (_* as v := String.strip) "\r\n"? |} -> `XRobotsTag v
  | {| "location:"~ ' '* (_* as v := String.strip) "\r\n"? |} -> `Location v
  | {| "link:"~ ' '* '<' (re_link_url as url) '>' ' '* ';' (_* as rest := String.strip) "\r\n"? |} ->
    `Link (url, String.lowercase_ascii rest)
  (* must be a polymorphic variant like every other branch; a string literal
     here would not type-check *)
  | _ -> `Other

Benchmarking these three yields:

run_bench 3 cases (count 10000)
         re2 : allocated    496.4MB, heap         0B, collection 0 0 248, elapsed 1.01 seconds, 9887.97/sec : ok
    mikmatch : allocated    147.9MB, heap         0B, collection 0 0 73, elapsed 0.2817 seconds, 35504.18/sec : ok
ppx_mikmatch : allocated    155.5MB, heap         0B, collection 0 0 77, elapsed 0.0669 seconds, 149445.91/sec : ok

Limitations

No Exhaustiveness Check

The syntax extension will always warn if no catch-all case is provided. No exhaustiveness check is attempted. Doing it right would require reimplementing full regular expression parsing and an algorithm which would ideally produce a counter-example.

Bug Reports

The processor is currently new and not well tested. Please break it and file bug reports in the GitHub issue tracker. Any exception raised by generated code except for Match_failure is a bug.

Name		Name	Last commit message	Last commit date
Latest commit History 4 Commits
lib		lib
src		src
tests		tests
.gitignore		.gitignore
.ocamlformat		.ocamlformat
COPYING		COPYING
COPYING.LESSER		COPYING.LESSER
COPYING.LINKING		COPYING.LINKING
MIKMATCH.md		MIKMATCH.md
README.md		README.md
dune-project		dune-project
dune-workspace.dev		dune-workspace.dev
ppx_mikmatch.opam		ppx_mikmatch.opam

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Licenses found

Uh oh!

Repository files navigation

PPX for Working with Regular Expressions

Full usage guide

Quick Links

Motivational Examples

Performance Considerations

Limitations

No Exhaustiveness Check

Bug Reports

About

Licenses found

Uh oh!

Releases 1

Packages

Languages

License

Licenses found

ahrefs/ppx_mikmatch

Folders and files

Latest commit

History

Repository files navigation

PPX for Working with Regular Expressions

Full usage guide

Quick Links

Motivational Examples

Performance Considerations

Limitations

No Exhaustiveness Check

Bug Reports

About

Resources

License

Licenses found

Uh oh!

Stars

Watchers

Forks

Releases 1

Packages 0

Languages

Packages