From a185be5a4f684c9bdbd90a2b9716ca02dde9e7b2 Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Mon, 19 Jun 2017 11:03:05 +1000 Subject: [PATCH 1/6] Treat characters between \Q \E as codepoints in UTF8 mode. fixes github issue #57 --- src/parser/Parser.rl | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl index 52b3340c6..05a084bb0 100644 --- a/src/parser/Parser.rl +++ b/src/parser/Parser.rl @@ -1155,6 +1155,35 @@ unichar readUtf8CodePoint4c(const char *s) { '\\E' => { fgoto main; }; + + #unicode chars + utf8_2c when is_utf8 => { + assert(mode.utf8); + /* leverage ComponentClass to generate the vertices */ + auto cc = getComponentClass(mode); + cc->add(readUtf8CodePoint2c(ts)); + cc->finalize(); + currentSeq->addComponent(move(cc)); + }; + + utf8_3c when is_utf8 => { + assert(mode.utf8); + /* leverage ComponentClass to generate the vertices */ + auto cc = getComponentClass(mode); + cc->add(readUtf8CodePoint3c(ts)); + cc->finalize(); + currentSeq->addComponent(move(cc)); + }; + + utf8_4c when is_utf8 => { + assert(mode.utf8); + /* leverage ComponentClass to generate the vertices */ + auto cc = getComponentClass(mode); + cc->add(readUtf8CodePoint4c(ts)); + cc->finalize(); + currentSeq->addComponent(move(cc)); + }; + # Literal character any => { addLiteral(currentSeq, *ts, mode); From d317d75615cdc6d0533e290de21a9bd205ffd12e Mon Sep 17 00:00:00 2001 From: Alex Coyte Date: Tue, 20 Jun 2017 10:19:32 +1000 Subject: [PATCH 2/6] character classes: handle \Q\E and utf8 --- src/parser/Parser.rl | 30 ++++++++++++++++++++++++++++++ unit/hyperscan/bad_patterns.txt | 2 ++ 2 files changed, 32 insertions(+) diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl index 05a084bb0..ce9ca865b 100644 --- a/src/parser/Parser.rl +++ b/src/parser/Parser.rl @@ -1184,6 +1184,11 @@ unichar readUtf8CodePoint4c(const char *s) { currentSeq->addComponent(move(cc)); }; + hi_byte when is_utf8 => { + assert(mode.utf8); + throwInvalidUtf8(); + }; + # Literal character any => { addLiteral(currentSeq, *ts, mode); @@ -1198,6 +1203,31 @@ unichar readUtf8CodePoint4c(const char *s) { '\\E' => { fret; }; + + #unicode chars + utf8_2c when is_utf8 => { + assert(mode.utf8); + currentCls->add(readUtf8CodePoint2c(ts)); + inCharClassEarly = false; + }; + + utf8_3c when is_utf8 => { + assert(mode.utf8); + currentCls->add(readUtf8CodePoint3c(ts)); + inCharClassEarly = false; + }; + + utf8_4c when is_utf8 => { + assert(mode.utf8); + currentCls->add(readUtf8CodePoint4c(ts)); + inCharClassEarly = false; + }; + + hi_byte when is_utf8 => { + assert(mode.utf8); + throwInvalidUtf8(); + }; + # Literal character any => { currentCls->add(*ts); diff --git a/unit/hyperscan/bad_patterns.txt b/unit/hyperscan/bad_patterns.txt index 3d6d9db90..3042dc829 100644 --- a/unit/hyperscan/bad_patterns.txt +++ b/unit/hyperscan/bad_patterns.txt @@ -142,3 +142,5 @@ 145:/abc/8{edit_distance=1} #UTF-8 is disallowed for approximate matching. 146:/(*UTF8)abc/{edit_distance=1} #UTF-8 is disallowed for approximate matching. 147:/\b\BMYBt/s{edit_distance=1} #Pattern can never match. +148:/\QÀ\Eaaaa/8 #Expression is not valid UTF-8. +149:/[\QÀ\Eaaaa]/8 #Expression is not valid UTF-8. From 345897f096b022f905e7b15661222e567116cc5b Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Thu, 29 Jun 2017 11:05:21 +1000 Subject: [PATCH 3/6] cpuid: exclude AVX512 flag when target not enabled If we have a fat runtime build without AVX512 enabled, we should not pick up the AVX512 flag from cpuid. --- cmake/config.h.in | 3 +++ src/util/cpuid_flags.c | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/cmake/config.h.in b/cmake/config.h.in index 9c250b4c7..203f0afde 100644 --- a/cmake/config.h.in +++ b/cmake/config.h.in @@ -21,6 +21,9 @@ /* Define if building "fat" runtime. */ #cmakedefine FAT_RUNTIME +/* Define if building AVX-512 in the fat runtime. */ +#cmakedefine BUILD_AVX512 + /* Define to 1 if `backtrace' works. */ #cmakedefine HAVE_BACKTRACE diff --git a/src/util/cpuid_flags.c b/src/util/cpuid_flags.c index c0ab09afb..3c62c07bf 100644 --- a/src/util/cpuid_flags.c +++ b/src/util/cpuid_flags.c @@ -192,7 +192,8 @@ u64a cpuid_flags(void) { cap &= ~HS_CPU_FEATURES_AVX2; #endif -#if !defined(FAT_RUNTIME) && !defined(HAVE_AVX512) +#if (!defined(FAT_RUNTIME) && !defined(HAVE_AVX512)) || \ + (defined(FAT_RUNTIME) && !defined(BUILD_AVX512)) cap &= ~HS_CPU_FEATURES_AVX512; #endif From 8337d99574f98792c75d1053fe1c050f73533b5e Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Wed, 26 Jul 2017 14:01:57 +1000 Subject: [PATCH 4/6] build: use more portable flag for mktemp Fixes github issue #60 --- cmake/build_wrapper.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/build_wrapper.sh b/cmake/build_wrapper.sh index 70392229c..a6ee3b269 100755 --- a/cmake/build_wrapper.sh +++ b/cmake/build_wrapper.sh @@ -11,8 +11,8 @@ shift 2 # $@ contains the actual build command OUT=$(echo "$@" | sed 's/.* -o \(.*\.o\).*/\1/') trap cleanup INT QUIT EXIT -SYMSFILE=$(mktemp --tmpdir ${PREFIX}_rename.syms.XXXXX) -KEEPSYMS=$(mktemp --tmpdir keep.syms.XXXXX) +SYMSFILE=$(mktemp -p /tmp ${PREFIX}_rename.syms.XXXXX) +KEEPSYMS=$(mktemp -p /tmp keep.syms.XXXXX) # find the libc used by gcc LIBC_SO=$("$@" --print-file-name=libc.so.6) cp ${KEEPSYMS_IN} ${KEEPSYMS} From 4650a59ce0f5edd7eada264cb3e43593dd100f56 Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Wed, 26 Jul 2017 14:27:07 +1000 Subject: [PATCH 5/6] changelog: updates for 4.5.2 release --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 73cc2f3d6..2e28e3b15 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,13 @@ This is a list of notable changes to Hyperscan, in reverse chronological order. +## [4.5.2] 2017-07-26 +- Bugfix for issue #57: Treat characters between `\Q.\E` as codepoints in + UTF8 mode. +- Bugfix for issue #60: Use a portable flag for mktemp for fat runtime builds. +- Bugfix for fat runtime builds on AVX-512 capable machines with Hyperscan's + AVX-512 support disabled. + ## [4.5.1] 2017-06-16 - Bugfix for issue #56: workaround for gcc-4.8 C++11 defect. - Bugfix for literal matching table generation, reversing a regression in From 44e45f727e734ca98656955743c2ab45de4835ef Mon Sep 17 00:00:00 2001 From: Matthew Barr Date: Wed, 26 Jul 2017 14:44:28 +1000 Subject: [PATCH 6/6] Bump version number for release --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a02584dea..2c2e298ae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ project (hyperscan C CXX) set (HS_MAJOR_VERSION 4) set (HS_MINOR_VERSION 5) -set (HS_PATCH_VERSION 1) +set (HS_PATCH_VERSION 2) set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)