forked from intel/hyperscan
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathch_compile.h
394 lines (370 loc) · 15.9 KB
/
ch_compile.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
/*
* Copyright (c) 2018, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CH_COMPILE_H_
#define CH_COMPILE_H_
/**
* @file
* @brief The Chimera compiler API definition.
*
* Chimera is a hybrid solution of Hyperscan and PCRE.
*
* This header contains functions for compiling regular expressions into
* Chimera databases that can be used by the Chimera runtime.
*/
#include "ch_common.h"
#include "hs_compile.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* A type containing error details that is returned by the compile calls (@ref
* ch_compile(), @ref ch_compile_multi() and @ref ch_compile_ext_multi()) on
* failure. The caller may inspect the values returned in this type to
* determine the cause of failure.
*
* The caller is responsible for releasing the structure with @ref
* ch_free_compile_error().
*/
typedef struct ch_compile_error {
/**
* A human-readable error message describing the error.
*/
char *message;
/**
* The zero-based number of the expression that caused the error (if this
* can be determined). If the error is not specific to an expression, then
* this value will be less than zero.
*/
int expression;
} ch_compile_error_t;
/**
* The basic regular expression compiler.
*
* This is the function call with which an expression is compiled into a
* Chimera database which can be passed to the runtime function (@ref
* ch_scan()).
*
* @param expression
* The NULL-terminated expression to parse. Note that this string must
* represent ONLY the pattern to be matched, with no delimiters or flags;
* any global flags should be specified with the @a flags argument. For
* example, the expression `/abc?def/i` should be compiled by providing
* `abc?def` as the @a expression, and @ref CH_FLAG_CASELESS as the @a
* flags.
*
* @param flags
* Flags which modify the behaviour of the expression. Multiple flags may
* be used by ORing them together. Valid values are:
* - CH_FLAG_CASELESS - Matching will be performed case-insensitively.
* - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines.
* - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
* - CH_FLAG_SINGLEMATCH - Only one match will be generated for the
* expression per invocation of @ref ch_scan().
* - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
* - CH_FLAG_UCP - Use Unicode properties for character classes.
*
* @param mode
* Compiler mode flag that affects the database as a whole for capturing
* groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied.
* See @ref CH_MODE_FLAG for more details.
*
* @param platform
* If not NULL, the platform structure is used to determine the target
* platform for the database. If NULL, a database suitable for running
* on the current host platform is produced.
*
* @param db
* On success, a pointer to the generated database will be returned in
* this parameter, or NULL on failure. The caller is responsible for
* deallocating the buffer using the @ref ch_free_database() function.
*
* @param compile_error
* If the compile fails, a pointer to a @ref ch_compile_error_t will be
* returned, providing details of the error condition. The caller is
* responsible for deallocating the buffer using the @ref
* ch_free_compile_error() function.
*
* @return
* @ref CH_SUCCESS is returned on successful compilation; @ref
* CH_COMPILER_ERROR on failure, with details provided in the @a
* compile_error parameter.
*/
ch_error_t HS_CDECL ch_compile(const char *expression, unsigned int flags,
unsigned int mode,
const hs_platform_info_t *platform,
ch_database_t **db,
ch_compile_error_t **compile_error);
/**
* The multiple regular expression compiler.
*
* This is the function call with which a set of expressions is compiled into a
* database which can be passed to the runtime function (@ref ch_scan()).
* Each expression can be labelled with a unique integer which is passed into
* the match callback to identify the pattern that has matched.
*
* @param expressions
* Array of NULL-terminated expressions to compile. Note that (as for @ref
* ch_compile()) these strings must contain only the pattern to be
* matched, with no delimiters or flags. For example, the expression
* `/abc?def/i` should be compiled by providing `abc?def` as the first
* string in the @a expressions array, and @ref CH_FLAG_CASELESS as the
* first value in the @a flags array.
*
* @param flags
* Array of flags which modify the behaviour of each expression. Multiple
* flags may be used by ORing them together. Specifying the NULL pointer
* in place of an array will set the flags value for all patterns to zero.
* Valid values are:
* - CH_FLAG_CASELESS - Matching will be performed case-insensitively.
* - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines.
* - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
* - CH_FLAG_SINGLEMATCH - Only one match will be generated by patterns
* with this match id per invocation of @ref ch_scan().
* - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
* - CH_FLAG_UCP - Use Unicode properties for character classes.
*
* @param ids
* An array of integers specifying the ID number to be associated with the
* corresponding pattern in the expressions array. Specifying the NULL
* pointer in place of an array will set the ID value for all patterns to
* zero.
*
* @param elements
* The number of elements in the input arrays.
*
* @param mode
* Compiler mode flag that affects the database as a whole for capturing
* groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied.
* See @ref CH_MODE_FLAG for more details.
*
* @param platform
* If not NULL, the platform structure is used to determine the target
* platform for the database. If NULL, a database suitable for running
* on the current host platform is produced.
*
* @param db
* On success, a pointer to the generated database will be returned in
* this parameter, or NULL on failure. The caller is responsible for
* deallocating the buffer using the @ref ch_free_database() function.
*
* @param compile_error
* If the compile fails, a pointer to a @ref ch_compile_error_t will be
* returned, providing details of the error condition. The caller is
* responsible for deallocating the buffer using the @ref
* ch_free_compile_error() function.
*
* @return
* @ref CH_SUCCESS is returned on successful compilation; @ref
* CH_COMPILER_ERROR on failure, with details provided in the @a
* compile_error parameter.
*/
ch_error_t HS_CDECL ch_compile_multi(const char *const *expressions,
const unsigned int *flags,
const unsigned int *ids,
unsigned int elements, unsigned int mode,
const hs_platform_info_t *platform,
ch_database_t **db,
ch_compile_error_t **compile_error);
/**
* The multiple regular expression compiler with extended match limits support.
*
* This is the function call with which a set of expressions is compiled into a
* database in the same way as @ref ch_compile_multi(), but allows additional
* parameters to be specified via @a match_limit and @a match_limit_recursion
* to define match limits for the PCRE runtime.
*
* @param expressions
* Array of NULL-terminated expressions to compile. Note that (as for @ref
* ch_compile()) these strings must contain only the pattern to be
* matched, with no delimiters or flags. For example, the expression
* `/abc?def/i` should be compiled by providing `abc?def` as the first
* string in the @a expressions array, and @ref CH_FLAG_CASELESS as the
* first value in the @a flags array.
*
* @param flags
* Array of flags which modify the behaviour of each expression. Multiple
* flags may be used by ORing them together. Specifying the NULL pointer
* in place of an array will set the flags value for all patterns to zero.
* Valid values are:
* - CH_FLAG_CASELESS - Matching will be performed case-insensitively.
* - CH_FLAG_DOTALL - Matching a `.` will not exclude newlines.
* - CH_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data.
* - CH_FLAG_SINGLEMATCH - Only one match will be generated by patterns
* with this match id per invocation of @ref ch_scan().
* - CH_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters.
* - CH_FLAG_UCP - Use Unicode properties for character classes.
*
* @param ids
* An array of integers specifying the ID number to be associated with the
* corresponding pattern in the expressions array. Specifying the NULL
* pointer in place of an array will set the ID value for all patterns to
* zero.
*
* @param elements
* The number of elements in the input arrays.
*
* @param mode
* Compiler mode flag that affects the database as a whole for capturing
* groups. One of CH_MODE_NOGROUPS or CH_MODE_GROUPS must be supplied.
* See @ref CH_MODE_FLAG for more details.
*
* @param match_limit
* A limit from pcre_extra on the number of times the match function is
* called in PCRE, which limits the amount of backtracking that can take
* place.
*
* @param match_limit_recursion
* A limit from pcre_extra on the recursion depth of the match function
* in PCRE.
*
* @param platform
* If not NULL, the platform structure is used to determine the target
* platform for the database. If NULL, a database suitable for running
* on the current host platform is produced.
*
* @param db
* On success, a pointer to the generated database will be returned in
* this parameter, or NULL on failure. The caller is responsible for
* deallocating the buffer using the @ref ch_free_database() function.
*
* @param compile_error
* If the compile fails, a pointer to a @ref ch_compile_error_t will be
* returned, providing details of the error condition. The caller is
* responsible for deallocating the buffer using the @ref
* ch_free_compile_error() function.
*
* @return
* @ref CH_SUCCESS is returned on successful compilation; @ref
* CH_COMPILER_ERROR on failure, with details provided in the @a
* compile_error parameter.
*/
ch_error_t HS_CDECL ch_compile_ext_multi(const char *const *expressions,
const unsigned int *flags,
const unsigned int *ids,
unsigned int elements,
unsigned int mode,
unsigned long int match_limit,
unsigned long int match_limit_recursion,
const hs_platform_info_t *platform,
ch_database_t **db,
ch_compile_error_t **compile_error);
/**
* Free an error structure generated by @ref ch_compile(), @ref
* ch_compile_multi() or @ref ch_compile_ext_multi().
*
* @param error
* The @ref ch_compile_error_t to be freed. NULL may also be safely
* provided.
*
* @return
* @ref CH_SUCCESS on success, other values on failure.
*/
ch_error_t HS_CDECL ch_free_compile_error(ch_compile_error_t *error);
/**
* @defgroup CH_PATTERN_FLAG Pattern flags
*
* Per-expression flags accepted by the @a flags argument of the compile
* calls. Each flag is a distinct single-bit value, so several flags can be
* combined by ORing them together.
*
* @{
*/
/* NOTE(review): the value 16 is not assigned to any flag in this header --
* presumably reserved; confirm before reusing it for a new flag. */
/**
* Compile flag: Set case-insensitive matching.
*
* This flag sets the expression to be matched case-insensitively by default.
* The expression may still use PCRE tokens (notably `(?i)` and
* `(?-i)`) to switch case-insensitive matching on and off.
*/
#define CH_FLAG_CASELESS 1
/**
* Compile flag: Matching a `.` will not exclude newlines.
*
* This flag sets any instances of the `.` token to match newline characters as
* well as all other characters. The PCRE specification states that the `.`
* token does not match newline characters by default, so without this flag the
* `.` token will not cross line boundaries.
*/
#define CH_FLAG_DOTALL 2
/**
* Compile flag: Set multi-line anchoring.
*
* This flag instructs the expression to make the `^` and `$` tokens match
* newline characters as well as the start and end of the stream. If this flag
* is not specified, the `^` token will only ever match at the start of a
* stream, and the `$` token will only ever match at the end of a stream within
* the guidelines of the PCRE specification.
*/
#define CH_FLAG_MULTILINE 4
/**
* Compile flag: Set single-match only mode.
*
* This flag sets the expression's match ID to match at most once; only the
* first match for each invocation of @ref ch_scan() will be returned.
*/
#define CH_FLAG_SINGLEMATCH 8
/**
* Compile flag: Enable UTF-8 mode for this expression.
*
* This flag instructs Chimera to treat the pattern as a sequence of UTF-8
* characters. The results of scanning invalid UTF-8 sequences with a Chimera
* library that has been compiled with one or more patterns using this flag are
* undefined.
*/
#define CH_FLAG_UTF8 32
/**
* Compile flag: Enable Unicode property support for this expression.
*
* This flag instructs Chimera to use Unicode properties, rather than the
* default ASCII interpretations, for character mnemonics like `\w` and `\s` as
* well as the POSIX character classes. It is only meaningful in conjunction
* with @ref CH_FLAG_UTF8.
*/
#define CH_FLAG_UCP 64
/** @} */
/**
* @defgroup CH_MODE_FLAG Compile mode flags
*
* The mode flags are used as values for the mode parameter of the various
* compile calls (@ref ch_compile(), @ref ch_compile_multi() and @ref
* ch_compile_ext_multi()).
*
* By default, the matcher will only supply the start and end offsets of the
* match when the match callback is called. Using mode flag @ref CH_MODE_GROUPS
* will also fill the `captured` array with the start and end offsets of all
* the capturing groups specified by the pattern that has matched.
*
* @{
*/
/**
* Compiler mode flag: Disable capturing groups.
*/
#define CH_MODE_NOGROUPS 0
/**
* Compiler mode flag: Enable capturing groups.
*
* The value is a single bit, equal to `1 << 20`.
*/
#define CH_MODE_GROUPS 1048576
/** @} */
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* CH_COMPILE_H_ */