Skip to content

Commit

Permalink
Add RegEx support using RE2
Browse files Browse the repository at this point in the history
Introduces 5 new built-in methods to the stdlib:
- `regexFullMatch(pattern, str)` -- Full match regex
- `regexPartialMatch(pattern, str)` -- Partial match regex
- `regexQuoteMeta(str)` -- Escape regex metacharacters
- `regexReplace(str, pattern, to)` -- Replace single occurrence using regex
- `regexGlobalReplace(str, pattern, to)` -- Replace globally using regex

Since both `regexFullMatch` and `regexPartialMatch` can perform captures
these functions return a "match" object upon match or `null` otherwise.
For example:

```
$ ./jsonnet -e 'std.regexFullMatch("h(?P<mid>.*)o", "hello")'
{
   "captures": [
      "ell"
   ],
   "namedCaptures": {
      "mid": "ell"
   },
   "string": "hello"
}
```

Introduces a dependency on RE2 2019-06-01.
Builds tested using make, CMake and Bazel on Ubuntu 18.04.
  • Loading branch information
dcoles committed Jun 2, 2019
1 parent 0134fd6 commit 543f422
Show file tree
Hide file tree
Showing 11 changed files with 291 additions and 12 deletions.
4 changes: 3 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ matrix:
apt:
packages:
- g++-4.9
- libre2-dev
sources: &sources
- llvm-toolchain-precise-3.8
- ubuntu-toolchain-r-test
Expand All @@ -16,6 +17,7 @@ matrix:
apt:
packages:
- clang-3.8
- libre2-dev
sources: *sources
- os: osx
osx_image: xcode8
Expand Down Expand Up @@ -49,4 +51,4 @@ notifications:
channels:
- "chat.freenode.net#jsonnet"
template:
- "%{repository}/%{branch} (%{commit} - %{author}): %{message}"
- "%{repository}/%{branch} (%{commit} - %{author}): %{message}"
46 changes: 44 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,50 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${GLOBAL_OUTPUT_PATH})
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${GLOBAL_OUTPUT_PATH})
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${GLOBAL_OUTPUT_PATH})

# Include external RE2 project. This runs a CMake sub-script
# (RE2CMakeLists.txt.in) that downloads the RE2 source. It's then built as part
# of the jsonnet project. The conventional way of handling CMake dependencies is
# to use a find_package script, which finds and installs the library from
# known locations on the local machine. Downloading the library ourselves
# allows us to pin to a specific version and makes things easier for users
# who don't have package managers.

# Generate the RE2 download project from its template and configure it.
set(RE2_DIR ${GLOBAL_OUTPUT_PATH}/re2-download)
configure_file(RE2CMakeLists.txt.in ${RE2_DIR}/CMakeLists.txt)
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
    RESULT_VARIABLE result
    WORKING_DIRECTORY ${RE2_DIR}
)
if(result)
    message(FATAL_ERROR "RE2 download failed: ${result}")
endif()

# Build the download project. Its ExternalProject step is what actually
# fetches the RE2 sources into ${GLOBAL_OUTPUT_PATH}/re2-src.
execute_process(COMMAND ${CMAKE_COMMAND} --build .
    RESULT_VARIABLE result
    WORKING_DIRECTORY ${RE2_DIR})
if(result)
    message(FATAL_ERROR "Build step for re2 failed: ${result}")
endif()

# Add RE2 directly to our build. This defines
# the re2 target.
add_subdirectory(${GLOBAL_OUTPUT_PATH}/re2-src
                 ${GLOBAL_OUTPUT_PATH}/re2-build)

# Include RE2 headers. RE2 keeps its public headers in the re2/ directory at
# the root of its source tree (there is no include/ directory), and jsonnet
# includes them as "re2/re2.h", so the source root itself must be on the
# include path.
include_directories("${RE2_SOURCE_DIR}")

# Allow linking into a shared library.
set_property(TARGET re2 PROPERTY POSITION_INDEPENDENT_CODE ON)

# RE2 requires pthreads. The condition is wrapped in $<BOOL:...> so that the
# generator expression stays valid (and evaluates to nothing) on platforms
# where UNIX is not defined; a bare $<${UNIX}:...> would expand to $<:...>.
set_property(TARGET re2 PROPERTY INTERFACE_COMPILE_OPTIONS $<$<BOOL:${UNIX}>:-pthread>)
set_property(TARGET re2 PROPERTY INTERFACE_LINK_LIBRARIES $<$<BOOL:${UNIX}>:-pthread>)

# Include external googletest project. This runs a CMake sub-script
# (CMakeLists.txt.in) that downloads googletest source. It's then built as part
# (GoogleTestCMakeLists.txt.in) that downloads googletest source. It's then built as part
# of the jsonnet project. The conventional way of handling CMake dependencies is
# to use a find_package script, which finds and installs the library from
# known locations on the local machine. Downloading the library ourselves
Expand All @@ -41,7 +83,7 @@ if (BUILD_TESTS AND NOT USE_SYSTEM_GTEST)

# Generate and download googletest project.
set(GOOGLETEST_DIR ${GLOBAL_OUTPUT_PATH}/googletest-download)
configure_file(CMakeLists.txt.in ${GOOGLETEST_DIR}/CMakeLists.txt)
configure_file(GoogleTestCMakeLists.txt.in ${GOOGLETEST_DIR}/CMakeLists.txt)
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
RESULT_VARIABLE result
WORKING_DIRECTORY ${GOOGLETEST_DIR}
Expand Down
File renamed without changes.
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ CFLAGS ?= -g $(OPT) -Wall -Wextra -pedantic -std=c99 -fPIC -Iinclude
MAKEDEPENDFLAGS ?= -Iinclude -Ithird_party/md5 -Ithird_party/json
EMCXXFLAGS = $(CXXFLAGS) -g0 -Os --memory-init-file 0 -s DISABLE_EXCEPTION_CATCHING=0 -s OUTLINING_LIMIT=10000 -s RESERVED_FUNCTION_POINTERS=20 -s ASSERTIONS=1 -s ALLOW_MEMORY_GROWTH=1
EMCFLAGS = $(CFLAGS) --memory-init-file 0 -s DISABLE_EXCEPTION_CATCHING=0 -s ASSERTIONS=1 -s ALLOW_MEMORY_GROWTH=1
LDFLAGS ?=
LDFLAGS ?= -lre2

SHARED_LDFLAGS ?= -shared

Expand Down Expand Up @@ -121,11 +121,11 @@ core/desugarer.cpp: core/std.jsonnet.h

# Commandline executable.
jsonnet: cmd/jsonnet.cpp cmd/utils.cpp $(LIB_OBJ)
$(CXX) $(CXXFLAGS) $(LDFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@
$(CXX) $(CXXFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@ $(LDFLAGS)

# Commandline executable (reformatter).
jsonnetfmt: cmd/jsonnetfmt.cpp cmd/utils.cpp $(LIB_OBJ)
$(CXX) $(CXXFLAGS) $(LDFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@
$(CXX) $(CXXFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@ $(LDFLAGS)

# C binding.
libjsonnet.so: $(LIB_OBJ)
Expand Down
18 changes: 18 additions & 0 deletions RE2CMakeLists.txt.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# CMake script run at generation time. This must be separate from the main
# CMakeLists.txt file to allow downloading the RE2 source at generation
# time.
cmake_minimum_required(VERSION 2.8.2)

# No languages are enabled (NONE): this project only fetches sources.
project(re2-download NONE)

include(ExternalProject)
# Check out the pinned RE2 tag into re2-src. Every step command below is left
# empty, so this project does not configure, build, install or test RE2 itself.
# If updating the RE2 version, also update the commit pinned in WORKSPACE.
ExternalProject_Add(re2
GIT_REPOSITORY https://github.com/google/re2.git
GIT_TAG 2019-06-01
SOURCE_DIR "${GLOBAL_OUTPUT_PATH}/re2-src"
BINARY_DIR "${GLOBAL_OUTPUT_PATH}/re2-build"
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
10 changes: 9 additions & 1 deletion WORKSPACE
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,19 @@ git_repository(
git_repository(
name = "com_google_googletest",
remote = "https://github.com/google/googletest.git",
# If updating googletest version, also update CMakeLists.txt.in.
# If updating googletest version, also update GoogleTestCMakeLists.txt.in.
commit = "2fe3bd994b3189899d93f1d5a881e725e046fdc2", # release: release-1.8.1
shallow_since = "1535728917 -0400",
)

git_repository(
name = "com_googlesource_code_re2",
remote = "https://github.com/google/re2.git",
# If updating RE2 version, also update RE2CMakeLists.txt.in.
commit = "0c95bcce2f1f0f071a786ca2c42384b211b8caba", # release: 2019-06-01
shallow_since = "1558525654 +0000",
)

load("//tools/build_defs:python_repo.bzl", "python_interpreter")

python_interpreter(name = "default_python")
1 change: 1 addition & 0 deletions core/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ cc_library(
"//stdlib:std",
"//third_party/json",
"//third_party/md5:libmd5",
"@com_googlesource_code_re2//:re2",
],
)

Expand Down
8 changes: 4 additions & 4 deletions core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ set(LIBJSONNET_SOURCE
vm.cpp)

add_library(libjsonnet SHARED ${LIBJSONNET_HEADERS} ${LIBJSONNET_SOURCE})
add_dependencies(libjsonnet md5 stdlib)
target_link_libraries(libjsonnet md5)
add_dependencies(libjsonnet md5 re2 stdlib)
target_link_libraries(libjsonnet md5 re2)

# CMake prepends CMAKE_SHARED_LIBRARY_PREFIX to shared libraries, so without
# this step the output would be |liblibjsonnet|.
Expand All @@ -45,8 +45,8 @@ install(TARGETS libjsonnet

# Static library for jsonnet command-line tool.
add_library(libjsonnet_static STATIC ${LIBJSONNET_SOURCE})
add_dependencies(libjsonnet_static md5 stdlib)
target_link_libraries(libjsonnet_static md5)
add_dependencies(libjsonnet_static md5 re2 stdlib)
target_link_libraries(libjsonnet_static md5 re2)
set_target_properties(libjsonnet_static PROPERTIES OUTPUT_NAME jsonnet)
install(TARGETS libjsonnet_static DESTINATION "${CMAKE_INSTALL_LIBDIR}")

Expand Down
7 changes: 6 additions & 1 deletion core/desugarer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ struct BuiltinDecl {
std::vector<UString> params;
};

static unsigned long max_builtin = 37;
static unsigned long max_builtin = 42;
BuiltinDecl jsonnet_builtin_decl(unsigned long builtin)
{
switch (builtin) {
Expand Down Expand Up @@ -76,6 +76,11 @@ BuiltinDecl jsonnet_builtin_decl(unsigned long builtin)
case 35: return {U"parseJson", {U"str"}};
case 36: return {U"encodeUTF8", {U"str"}};
case 37: return {U"decodeUTF8", {U"arr"}};
case 38: return {U"regexFullMatch", {U"pattern", U"str"}};
case 39: return {U"regexPartialMatch", {U"pattern", U"str"}};
case 40: return {U"regexQuoteMeta", {U"str"}};
case 41: return {U"regexReplace", {U"str", U"pattern", U"to"}};
case 42: return {U"regexGlobalReplace", {U"str", U"pattern", U"to"}};
default:
std::cerr << "INTERNAL ERROR: Unrecognized builtin function: " << builtin << std::endl;
std::abort();
Expand Down
133 changes: 133 additions & 0 deletions core/vm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ limitations under the License.
#include "json.hpp"
#include "md5.h"
#include "parser.h"
#include "re2/re2.h"
#include "state.h"
#include "static_analysis.h"
#include "string_utils.h"
Expand All @@ -35,6 +36,10 @@ using json = nlohmann::json;

namespace {

static const Fodder EF; // Empty fodder.

static const LocationRange E; // Empty.

/** Turn a path e.g. "/a/b/c" into a dir, e.g. "/a/b/". If there is no path returns "".
*/
std::string dir_name(const std::string &path)
Expand Down Expand Up @@ -881,6 +886,11 @@ class Interpreter {
builtins["parseJson"] = &Interpreter::builtinParseJson;
builtins["encodeUTF8"] = &Interpreter::builtinEncodeUTF8;
builtins["decodeUTF8"] = &Interpreter::builtinDecodeUTF8;
builtins["regexFullMatch"] = &Interpreter::builtinRegexFullMatch;
builtins["regexPartialMatch"] = &Interpreter::builtinRegexPartialMatch;
builtins["regexQuoteMeta"] = &Interpreter::builtinRegexQuoteMeta;
builtins["regexReplace"] = &Interpreter::builtinRegexReplace;
builtins["regexGlobalReplace"] = &Interpreter::builtinRegexGlobalReplace;
}

/** Clean up the heap, stack, stash, and builtin function ASTs. */
Expand Down Expand Up @@ -1373,6 +1383,129 @@ class Interpreter {
return decodeUTF8();
}

/** Match a string against a regular expression using RE2.
 *
 * On a match, scratch is set to an object with three visible fields:
 *   - "string": the subject string,
 *   - "captures": an array with one string per capturing group, in group order,
 *   - "namedCaptures": an object with one string field per named capturing group.
 * When there is no match, scratch is set to null.
 *
 * \param pattern The regular expression, UTF-8 encoded.
 * \param string The subject string, UTF-8 encoded.
 * \param full If true use RE2::FullMatchN, otherwise RE2::PartialMatchN.
 * \returns Always nullptr; the result is left in scratch.
 * \throws A runtime error (built with makeError) if the pattern does not compile.
 */
const AST *regexMatch(const std::string &pattern, const std::string &string, bool full)
{
    RE2 regex(pattern, RE2::CannedOptions::Quiet);
    if (!regex.ok()) {
        std::stringstream msg;
        msg << "Invalid regex '" << regex.pattern() << "': " << regex.error();
        throw makeError(stack.top().location, msg.str());
    }

    const int group_count = regex.NumberOfCapturingGroups();

    // RE2's N-ary match API takes an array of pointers to RE2::Arg, each of which
    // in turn points at the std::string that receives the group's text.
    std::vector<std::string> group_text(group_count);
    std::vector<RE2::Arg> arg_storage(group_count);
    std::vector<const RE2::Arg *> arg_ptrs(group_count);
    for (int g = 0; g < group_count; ++g) {
        arg_storage[g] = &group_text[g];
        arg_ptrs[g] = &arg_storage[g];
    }

    bool matched;
    if (full) {
        matched = RE2::FullMatchN(string, regex, arg_ptrs.data(), group_count);
    } else {
        matched = RE2::PartialMatchN(string, regex, arg_ptrs.data(), group_count);
    }

    if (!matched) {
        scratch = makeNull();
        return nullptr;
    }

    std::map<const Identifier *, HeapSimpleObject::Field> fields;

    // "string": the subject that was matched.
    {
        HeapSimpleObject::Field &field = fields[alloc->makeIdentifier(U"string")];
        field.hide = ObjectField::VISIBLE;
        field.body = alloc->make<LiteralString>(
            E, EF, decode_utf8(string), LiteralString::DOUBLE, "", "");
    }

    // "captures": positional groups, in order.
    {
        HeapSimpleObject::Field &field = fields[alloc->makeIdentifier(U"captures")];
        field.hide = ObjectField::VISIBLE;
        std::vector<Array::Element> elements;
        for (const std::string &text : group_text) {
            AST *literal = alloc->make<LiteralString>(
                E, EF, decode_utf8(text), LiteralString::DOUBLE, "", "");
            elements.push_back(Array::Element(literal, EF));
        }
        field.body = alloc->make<Array>(E, EF, elements, false, EF);
    }

    // "namedCaptures": group name -> captured text.
    {
        HeapSimpleObject::Field &field = fields[alloc->makeIdentifier(U"namedCaptures")];
        field.hide = ObjectField::VISIBLE;
        DesugaredObject::Fields named_fields;
        const std::map<std::string, int> &groups_by_name = regex.NamedCapturingGroups();
        for (const auto &group : groups_by_name) {
            // Group indices reported by RE2 are 1-based; group_text is 0-based.
            const std::string &text = group_text[group.second - 1];
            named_fields.push_back(DesugaredObject::Field(
                ObjectField::VISIBLE,
                alloc->make<LiteralString>(
                    E, EF, decode_utf8(group.first), LiteralString::DOUBLE, "", ""),
                alloc->make<LiteralString>(
                    E, EF, decode_utf8(text), LiteralString::DOUBLE, "", "")));
        }
        field.body = alloc->make<DesugaredObject>(E, ASTs{}, named_fields);
    }

    scratch = makeObject<HeapSimpleObject>(BindingFrame{}, fields, ASTs{});
    return nullptr;
}

/** Builtin std.regexFullMatch(pattern, str).
 *
 * Both arguments are passed to validateBuiltinArgs as expected strings, converted
 * to UTF-8, and handed to regexMatch in full-match mode.
 *
 * \param loc Location of the builtin call, used for argument validation.
 * \param args args[0] is the pattern, args[1] is the subject string.
 * \returns Whatever regexMatch returns; the result value is left in scratch.
 */
const AST *builtinRegexFullMatch(const LocationRange &loc, const std::vector<Value> &args)
{
    validateBuiltinArgs(loc, "regexFullMatch", args, {Value::STRING, Value::STRING});

    const auto *pattern_arg = static_cast<HeapString *>(args[0].v.h);
    const auto *subject_arg = static_cast<HeapString *>(args[1].v.h);

    const std::string regex_utf8 = encode_utf8(pattern_arg->value);
    const std::string subject_utf8 = encode_utf8(subject_arg->value);

    const bool full_match = true;
    return regexMatch(regex_utf8, subject_utf8, full_match);
}

/** Builtin std.regexPartialMatch(pattern, str).
 *
 * Both arguments are passed to validateBuiltinArgs as expected strings, converted
 * to UTF-8, and handed to regexMatch in partial-match mode.
 *
 * \param loc Location of the builtin call, used for argument validation.
 * \param args args[0] is the pattern, args[1] is the subject string.
 * \returns Whatever regexMatch returns; the result value is left in scratch.
 */
const AST *builtinRegexPartialMatch(const LocationRange &loc, const std::vector<Value> &args)
{
    validateBuiltinArgs(loc, "regexPartialMatch", args, {Value::STRING, Value::STRING});

    const auto *pattern_arg = static_cast<HeapString *>(args[0].v.h);
    const auto *subject_arg = static_cast<HeapString *>(args[1].v.h);

    const std::string regex_utf8 = encode_utf8(pattern_arg->value);
    const std::string subject_utf8 = encode_utf8(subject_arg->value);

    const bool full_match = false;
    return regexMatch(regex_utf8, subject_utf8, full_match);
}

/** Builtin std.regexQuoteMeta(str).
 *
 * Passes the argument through RE2::QuoteMeta, which escapes regex metacharacters,
 * and leaves the resulting string in scratch.
 *
 * \param loc Location of the builtin call, used for argument validation.
 * \param args args[0] is the string to escape.
 * \returns Always nullptr; the result is left in scratch.
 */
const AST *builtinRegexQuoteMeta(const LocationRange &loc, const std::vector<Value> &args)
{
    validateBuiltinArgs(loc, "regexQuoteMeta", args, {Value::STRING});

    const auto *input = static_cast<HeapString *>(args[0].v.h);
    const std::string raw_utf8 = encode_utf8(input->value);
    const std::string quoted_utf8 = RE2::QuoteMeta(raw_utf8);

    scratch = makeString(decode_utf8(quoted_utf8));
    return nullptr;
}

/** Builtin std.regexReplace(str, pattern, to).
 *
 * Replaces the first match of `pattern` in `str` with the rewrite string `to`
 * (RE2::Replace) and leaves the resulting string in scratch. If the pattern does
 * not match, the string is returned unchanged.
 *
 * \param loc Location of the builtin call, used for argument validation.
 * \param args args[0] is the subject string, args[1] the pattern, args[2] the rewrite.
 * \returns Always nullptr; the result is left in scratch.
 * \throws A runtime error (built with makeError) if the pattern does not compile, or
 *         if the rewrite string is malformed or refers to a group the pattern lacks.
 */
const AST *builtinRegexReplace(const LocationRange &loc, const std::vector<Value> &args)
{
    validateBuiltinArgs(loc, "regexReplace", args, {Value::STRING, Value::STRING, Value::STRING});

    std::string string = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
    const std::string pattern = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);
    const std::string replace = encode_utf8(static_cast<HeapString *>(args[2].v.h)->value);

    RE2 re(pattern, RE2::CannedOptions::Quiet);
    if (!re.ok()) {
        std::stringstream ss;
        ss << "Invalid regex '" << re.pattern() << "': " << re.error();
        throw makeError(stack.top().location, ss.str());
    }

    // RE2::Replace reports a bad rewrite string only by returning false, which is
    // indistinguishable from "no match". Validate it up front so that a bad escape
    // or an out-of-range \N is surfaced instead of silently returning the input.
    std::string rewrite_error;
    if (!re.CheckRewriteString(replace, &rewrite_error)) {
        std::stringstream ss;
        ss << "Invalid regex rewrite '" << replace << "': " << rewrite_error;
        throw makeError(stack.top().location, ss.str());
    }

    // The return value only says whether a replacement happened; either way
    // `string` holds the desired result.
    RE2::Replace(&string, re, replace);
    scratch = makeString(decode_utf8(string));
    return nullptr;
}

/** Builtin std.regexGlobalReplace(str, pattern, to).
 *
 * Replaces every match of `pattern` in `str` with the rewrite string `to`
 * (RE2::GlobalReplace) and leaves the resulting string in scratch. If the pattern
 * does not match, the string is returned unchanged.
 *
 * \param loc Location of the builtin call, used for argument validation.
 * \param args args[0] is the subject string, args[1] the pattern, args[2] the rewrite.
 * \returns Always nullptr; the result is left in scratch.
 * \throws A runtime error (built with makeError) if the pattern does not compile, or
 *         if the rewrite string is malformed or refers to a group the pattern lacks.
 */
const AST *builtinRegexGlobalReplace(const LocationRange &loc, const std::vector<Value> &args)
{
    validateBuiltinArgs(loc, "regexGlobalReplace", args, {Value::STRING, Value::STRING, Value::STRING});

    std::string string = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
    const std::string pattern = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);
    const std::string replace = encode_utf8(static_cast<HeapString *>(args[2].v.h)->value);

    RE2 re(pattern, RE2::CannedOptions::Quiet);
    if (!re.ok()) {
        std::stringstream ss;
        ss << "Invalid regex '" << re.pattern() << "': " << re.error();
        throw makeError(stack.top().location, ss.str());
    }

    // Validate the rewrite string up front: RE2::GlobalReplace has no way to report
    // a bad escape or an out-of-range \N, so without this check a malformed rewrite
    // would silently produce a wrong result.
    std::string rewrite_error;
    if (!re.CheckRewriteString(replace, &rewrite_error)) {
        std::stringstream ss;
        ss << "Invalid regex rewrite '" << replace << "': " << rewrite_error;
        throw makeError(stack.top().location, ss.str());
    }

    // The return value is the number of replacements made; it is not needed here.
    RE2::GlobalReplace(&string, re, replace);
    scratch = makeString(decode_utf8(string));
    return nullptr;
}

const AST *builtinTrace(const LocationRange &loc, const std::vector<Value> &args)
{
if(args[0].t != Value::STRING) {
Expand Down
Loading

0 comments on commit 543f422

Please sign in to comment.