Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

extmod/ure: Extend functionality #1544

Merged
merged 8 commits into from
Feb 14, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
232 changes: 232 additions & 0 deletions extmod/modure.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,83 @@ STATIC mp_obj_t match_group(mp_obj_t self_in, mp_obj_t no_in) {
}
MP_DEFINE_CONST_FUN_OBJ_2(match_group_obj, match_group);

#if MICROPY_PY_URE_MATCH_GROUPS

// Implements the groups() method of a match object: builds a tuple with one
// entry per group, starting at group 1 (group 0 is not included).  Each entry
// is obtained through match_group().  When there is nothing beyond group 0
// the shared empty tuple is returned.
STATIC mp_obj_t match_groups(mp_obj_t self_in) {
    mp_obj_match_t *match = MP_OBJ_TO_PTR(self_in);
    if (match->num_matches > 1) {
        mp_obj_tuple_t *result = MP_OBJ_TO_PTR(mp_obj_new_tuple(match->num_matches - 1, NULL));
        for (int group_no = 1; group_no < match->num_matches; ++group_no) {
            // Tuple slot 0 holds group 1, slot 1 holds group 2, and so on
            result->items[group_no - 1] = match_group(self_in, MP_OBJ_NEW_SMALL_INT(group_no));
        }
        return MP_OBJ_FROM_PTR(result);
    }
    return mp_const_empty_tuple;
}
MP_DEFINE_CONST_FUN_OBJ_1(match_groups_obj, match_groups);

#endif

#if MICROPY_PY_URE_MATCH_SPAN_START_END

// Shared worker for span(), start() and end().
//   args[0]  the match object
//   args[1]  optional group number; group 0 is used when it is omitted
//   span     output: span[0] receives the start offset and span[1] the end
//            offset, both as int objects
// Offsets are pointer differences measured from the start of the subject's
// string data.  A group whose start pointer is NULL yields -1 for both values.
// Raises IndexError (carrying args[1]) when the group number is negative or
// not below num_matches.
STATIC void match_span_helper(size_t n_args, const mp_obj_t *args, mp_obj_t span[2]) {
    mp_obj_match_t *match = MP_OBJ_TO_PTR(args[0]);

    mp_int_t group_no = 0;
    if (n_args == 2) {
        group_no = mp_obj_get_int(args[1]);
        if (!(0 <= group_no && group_no < match->num_matches)) {
            nlr_raise(mp_obj_new_exception_arg1(&mp_type_IndexError, args[1]));
        }
    }

    // Defaults used when the group has no recorded start pointer
    mp_int_t start_off = -1;
    mp_int_t end_off = -1;

    const char *group_start = match->caps[group_no * 2];
    if (group_start != NULL) {
        // caps[] stores (start, end) pointer pairs; turn them into offsets
        const char *subject = mp_obj_str_get_str(match->str);
        start_off = group_start - subject;
        end_off = match->caps[group_no * 2 + 1] - subject;
    }

    span[0] = mp_obj_new_int(start_off);
    span[1] = mp_obj_new_int(end_off);
}

// Implements span([group]) on a match object: returns a new 2-tuple made of
// the start and end values computed by match_span_helper().
STATIC mp_obj_t match_span(size_t n_args, const mp_obj_t *args) {
    mp_obj_t bounds[2];
    match_span_helper(n_args, args, bounds);
    return mp_obj_new_tuple(2, bounds);
}
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(match_span_obj, 1, 2, match_span);

// Implements start([group]) on a match object: returns only the first value
// (the start) computed by match_span_helper().
STATIC mp_obj_t match_start(size_t n_args, const mp_obj_t *args) {
    mp_obj_t bounds[2];
    match_span_helper(n_args, args, bounds);
    return bounds[0];
}
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(match_start_obj, 1, 2, match_start);

// Implements end([group]) on a match object: returns only the second value
// (the end) computed by match_span_helper().
STATIC mp_obj_t match_end(size_t n_args, const mp_obj_t *args) {
    mp_obj_t bounds[2];
    match_span_helper(n_args, args, bounds);
    return bounds[1];
}
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(match_end_obj, 1, 2, match_end);

#endif

// Name -> function-object table for match objects (presumably installed as the
// locals dict of match_type -- confirm against the type definition).
// group() is always present; the other entries are compiled in only when the
// corresponding MICROPY_PY_URE_* option is enabled.
STATIC const mp_rom_map_elem_t match_locals_dict_table[] = {
{ MP_ROM_QSTR(MP_QSTR_group), MP_ROM_PTR(&match_group_obj) },
#if MICROPY_PY_URE_MATCH_GROUPS
{ MP_ROM_QSTR(MP_QSTR_groups), MP_ROM_PTR(&match_groups_obj) },
#endif
#if MICROPY_PY_URE_MATCH_SPAN_START_END
{ MP_ROM_QSTR(MP_QSTR_span), MP_ROM_PTR(&match_span_obj) },
{ MP_ROM_QSTR(MP_QSTR_start), MP_ROM_PTR(&match_start_obj) },
{ MP_ROM_QSTR(MP_QSTR_end), MP_ROM_PTR(&match_end_obj) },
#endif
};

// Wrap the table above as the constant dict object match_locals_dict
STATIC MP_DEFINE_CONST_DICT(match_locals_dict, match_locals_dict_table);
Expand All @@ -103,6 +178,35 @@ STATIC mp_obj_t ure_exec(bool is_anchored, uint n_args, const mp_obj_t *args) {
size_t len;
subj.begin = mp_obj_str_get_data(args[1], &len);
subj.end = subj.begin + len;
#if MICROPY_PY_URE_MATCH_SPAN_START_END
if (n_args > 2) {
const mp_obj_type_t *self_type = mp_obj_get_type(args[1]);
mp_int_t str_len = MP_OBJ_SMALL_INT_VALUE(mp_obj_len_maybe(args[1]));
const byte *begin = (const byte *)subj.begin;

int pos = mp_obj_get_int(args[2]);
if (pos >= str_len) {
return mp_const_none;
}
if (pos < 0) {
pos = 0;
}
const byte *pos_ptr = str_index_to_ptr(self_type, begin, len, MP_OBJ_NEW_SMALL_INT(pos), true);

const byte *endpos_ptr = (const byte *)subj.end;
if (n_args > 3) {
int endpos = mp_obj_get_int(args[3]);
if (endpos <= pos) {
return mp_const_none;
}
// Will cap to length
endpos_ptr = str_index_to_ptr(self_type, begin, len, args[3], true);
}

subj.begin = (const char *)pos_ptr;
subj.end = (const char *)endpos_ptr;
}
#endif
int caps_num = (self->re.sub + 1) * 2;
mp_obj_match_t *match = m_new_obj_var(mp_obj_match_t, char*, caps_num);
// cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char
Expand Down Expand Up @@ -174,10 +278,127 @@ STATIC mp_obj_t re_split(size_t n_args, const mp_obj_t *args) {
}
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_split_obj, 2, 3, re_split);

#if MICROPY_PY_URE_SUB

// Shared implementation of sub(), used by both the compiled-regex method and
// the module-level function.
//   self_in  compiled regex object to match with
//   args[1]  replacement: either a string, or a callable that is called with
//            the match object and whose result is used as the replacement
//   args[2]  subject string to search
//   args[3]  optional count (read only when n_args > 3); substitution stops
//            once this many replacements were made, and a value of 0 or less
//            never stops the loop
// args[0] is not read here; any argument after args[3] is ignored.
// In the replacement, "\<number>" and "\g<number>" insert the text of that
// group.
// Returns `where` itself when no substitution was made, otherwise a new
// object of the same type as `where` built from the accumulated buffer.
// Raises IndexError when the replacement names a group number that is not
// below num_matches.
STATIC mp_obj_t re_sub_helper(mp_obj_t self_in, size_t n_args, const mp_obj_t *args) {
mp_obj_re_t *self = MP_OBJ_TO_PTR(self_in);
mp_obj_t replace = args[1];
mp_obj_t where = args[2];
mp_int_t count = 0;
if (n_args > 3) {
count = mp_obj_get_int(args[3]);
// Note: flags are currently ignored
}

size_t where_len;
const char *where_str = mp_obj_str_get_data(where, &where_len);
Subject subj;
subj.begin = where_str;
subj.end = subj.begin + where_len;
// Two capture slots (start and end pointer) per group, including group 0
int caps_num = (self->re.sub + 1) * 2;

vstr_t vstr_return;
vstr_return.buf = NULL; // We'll init the vstr after the first match
// A single match object is reused for every iteration of the loop below.
// NOTE(review): it is released with mp_local_free() only on the normal exit
// path; an exception raised inside the loop (IndexError below, or one from the
// replacement callable) bypasses that call -- confirm this is acceptable.
mp_obj_match_t *match = mp_local_alloc(sizeof(mp_obj_match_t) + caps_num * sizeof(char*));
match->base.type = &match_type;
match->num_matches = caps_num / 2; // caps_num counts start and end pointers
match->str = where;

for (;;) {
// cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char
memset((char*)match->caps, 0, caps_num * sizeof(char*));
int res = re1_5_recursiveloopprog(&self->re, &subj, match->caps, caps_num, false);

// If we didn't have a match, or had an empty match, it's time to stop
if (!res || match->caps[0] == match->caps[1]) {
break;
}

// Initialise the vstr if it's not already
if (vstr_return.buf == NULL) {
vstr_init(&vstr_return, match->caps[0] - subj.begin);
}

// Add pre-match string
vstr_add_strn(&vstr_return, subj.begin, match->caps[0] - subj.begin);

// Get replacement string
// The replacement is scanned below as a NUL-terminated C string
const char* repl = mp_obj_str_get_str((mp_obj_is_callable(replace) ? mp_call_function_1(replace, MP_OBJ_FROM_PTR(match)) : replace));

// Append replacement string to result, substituting any regex groups
while (*repl != '\0') {
if (*repl == '\\') {
// Skip the backslash.  If no group number follows, nothing is appended for
// it and the next character is handled by the next loop iteration.
++repl;
bool is_g_format = false;
if (*repl == 'g' && repl[1] == '<') {
// Group specified with syntax "\g<number>"
repl += 2;
is_g_format = true;
}

if ('0' <= *repl && *repl <= '9') {
// Group specified with syntax "\g<number>" or "\number"
unsigned int match_no = 0;
do {
match_no = match_no * 10 + (*repl++ - '0');
} while ('0' <= *repl && *repl <= '9');
// Consume the closing '>' of the "\g<number>" form when it is present
if (is_g_format && *repl == '>') {
++repl;
}

if (match_no >= (unsigned int)match->num_matches) {
nlr_raise(mp_obj_new_exception_arg1(&mp_type_IndexError, MP_OBJ_NEW_SMALL_INT(match_no)));
}

// A NULL start pointer means there is no text for this group: add nothing
const char *start_match = match->caps[match_no * 2];
if (start_match != NULL) {
// Add the substring matched by group
const char *end_match = match->caps[match_no * 2 + 1];
vstr_add_strn(&vstr_return, start_match, end_match - start_match);
}
}
} else {
// Just add the current byte from the replacement string
vstr_add_byte(&vstr_return, *repl++);
}
}

// Move start pointer to end of last match
subj.begin = match->caps[1];

// Stop substitutions if count was given and gets to 0
if (count > 0 && --count == 0) {
break;
}
}

mp_local_free(match);

if (vstr_return.buf == NULL) {
// Optimisation for case of no substitutions
return where;
}

// Add post-match string
vstr_add_strn(&vstr_return, subj.begin, subj.end - subj.begin);

return mp_obj_new_str_from_vstr(mp_obj_get_type(where), &vstr_return);
}

// sub() method of a compiled regex object: args[0] is the regex itself, and
// the full argument list is forwarded unchanged to re_sub_helper().
STATIC mp_obj_t re_sub(size_t n_args, const mp_obj_t *args) {
    mp_obj_t compiled_re = args[0];
    return re_sub_helper(compiled_re, n_args, args);
}
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_sub_obj, 3, 5, re_sub);

#endif

// Name -> function-object table for compiled regex objects (presumably
// installed as the locals dict of the regex type -- confirm against the type
// definition).  sub() is compiled in only when MICROPY_PY_URE_SUB is enabled.
STATIC const mp_rom_map_elem_t re_locals_dict_table[] = {
{ MP_ROM_QSTR(MP_QSTR_match), MP_ROM_PTR(&re_match_obj) },
{ MP_ROM_QSTR(MP_QSTR_search), MP_ROM_PTR(&re_search_obj) },
{ MP_ROM_QSTR(MP_QSTR_split), MP_ROM_PTR(&re_split_obj) },
#if MICROPY_PY_URE_SUB
{ MP_ROM_QSTR(MP_QSTR_sub), MP_ROM_PTR(&re_sub_obj) },
#endif
};

// Wrap the table above as the constant dict object re_locals_dict
STATIC MP_DEFINE_CONST_DICT(re_locals_dict, re_locals_dict_table);
Expand Down Expand Up @@ -232,11 +453,22 @@ STATIC mp_obj_t mod_re_search(size_t n_args, const mp_obj_t *args) {
}
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_search_obj, 2, 4, mod_re_search);

#if MICROPY_PY_URE_SUB
// Module-level sub(): args[0] is the pattern.  It is compiled first (only that
// one argument is handed to mod_re_compile), then the full argument list is
// forwarded to re_sub_helper() together with the compiled regex.
STATIC mp_obj_t mod_re_sub(size_t n_args, const mp_obj_t *args) {
    mp_obj_t compiled_re = mod_re_compile(1, args);
    return re_sub_helper(compiled_re, n_args, args);
}
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_sub_obj, 3, 5, mod_re_sub);
#endif

// Global names exported by the module: its __name__ ("ure"), the module-level
// functions and the DEBUG flag constant.  sub() is compiled in only when
// MICROPY_PY_URE_SUB is enabled.
STATIC const mp_rom_map_elem_t mp_module_re_globals_table[] = {
{ MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_ure) },
{ MP_ROM_QSTR(MP_QSTR_compile), MP_ROM_PTR(&mod_re_compile_obj) },
{ MP_ROM_QSTR(MP_QSTR_match), MP_ROM_PTR(&mod_re_match_obj) },
{ MP_ROM_QSTR(MP_QSTR_search), MP_ROM_PTR(&mod_re_search_obj) },
#if MICROPY_PY_URE_SUB
{ MP_ROM_QSTR(MP_QSTR_sub), MP_ROM_PTR(&mod_re_sub_obj) },
#endif
{ MP_ROM_QSTR(MP_QSTR_DEBUG), MP_ROM_INT(FLAG_DEBUG) },
};

Expand Down
46 changes: 41 additions & 5 deletions extmod/re1.5/compilecode.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,29 @@
#define EMIT(at, byte) (code ? (code[at] = byte) : (at))
#define PC (prog->bytelen)


// Translate the character that follows a backslash in a regex into the byte
// it stands for.
//   c  the character immediately after the backslash
// The C-style escapes \a \b \f \n \r \t \v map to their control characters.
// Any other character is returned unchanged, so an escaped metacharacter
// such as "\." stands for itself.
static char unescape(char c) {
    switch (c) {
        case 'a':
            return '\a';
        case 'b':
            return '\b';
        case 'f':
            return '\f';
        case 'n':
            return '\n';
        case 'r':
            return '\r';
        case 't':
            // Previously missing: "\t" fell through to the default case and
            // matched the letter 't' instead of a tab
            return '\t';
        case 'v':
            return '\v';
        case 'x':
            // NOTE(review): "\x" yields a literal backslash rather than being
            // parsed as a hex escape; kept unchanged here -- confirm intent.
            return '\\';
        default:
            return c;
    }
}


static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
{
char *code = sizecode ? NULL : prog->insts;
Expand All @@ -22,13 +45,16 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
case '\\':
re++;
if (!*re) return NULL; // Trailing backslash
term = PC;
if ((*re | 0x20) == 'd' || (*re | 0x20) == 's' || (*re | 0x20) == 'w') {
term = PC;
EMIT(PC++, NamedClass);
EMIT(PC++, *re);
prog->len++;
break;
} else {
EMIT(PC++, Char);
EMIT(PC++, unescape(*re));
}
prog->len++;
break;
default:
term = PC;
EMIT(PC++, Char);
Expand All @@ -54,11 +80,21 @@ static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
prog->len++;
for (cnt = 0; *re != ']'; re++, cnt++) {
if (!*re) return NULL;
EMIT(PC++, *re);
if (*re == '\\') {
re += 1;
EMIT(PC++, unescape(*re));
} else {
EMIT(PC++, *re);
}
if (re[1] == '-' && re[2] != ']') {
re += 2;
}
EMIT(PC++, *re);
if (*re == '\\') {
re += 1;
EMIT(PC++, unescape(*re));
} else {
EMIT(PC++, *re);
}
}
EMIT(term + 1, cnt);
break;
Expand Down
Loading