Skip to content

Commit b4db22f

Browse files
committed
add a new (split) PEG special
This works similarly to string/split, but the separator is a PEG.
1 parent ea75086 commit b4db22f

File tree

3 files changed

+92
-2
lines changed

3 files changed

+92
-2
lines changed

src/core/peg.c

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
typedef struct {
4040
const uint8_t *text_start;
4141
const uint8_t *text_end;
42-
/* text_end will be restricted in a (sub) rule, but
42+
/* text_end can be restricted by some rules, but
4343
outer_text_end will always contain the real end of
4444
input, which we need to generate a line mapping */
4545
const uint8_t *outer_text_end;
@@ -510,6 +510,44 @@ static const uint8_t *peg_rule(
510510
return window_end;
511511
}
512512

513+
case RULE_SPLIT: {
514+
const uint8_t *saved_end = s->text_end;
515+
const uint32_t *rule_separator = s->bytecode + rule[1];
516+
const uint32_t *rule_subpattern = s->bytecode + rule[2];
517+
518+
const uint8_t *separator_end = NULL;
519+
do {
520+
const uint8_t *text_start = text;
521+
CapState cs = cap_save(s);
522+
down1(s);
523+
while (text <= s->text_end) {
524+
separator_end = peg_rule(s, rule_separator, text);
525+
cap_load(s, cs);
526+
if (separator_end) {
527+
break;
528+
}
529+
text++;
530+
}
531+
up1(s);
532+
533+
if (separator_end) {
534+
s->text_end = text;
535+
text = separator_end;
536+
}
537+
538+
down1(s);
539+
const uint8_t *subpattern_end = peg_rule(s, rule_subpattern, text_start);
540+
up1(s);
541+
s->text_end = saved_end;
542+
543+
if (!subpattern_end) {
544+
return NULL;
545+
}
546+
} while (separator_end);
547+
548+
return text;
549+
}
550+
513551
case RULE_REPLACE:
514552
case RULE_MATCHTIME: {
515553
uint32_t tag = rule[3];
@@ -1143,6 +1181,14 @@ static void spec_sub(Builder *b, int32_t argc, const Janet *argv) {
11431181
emit_2(r, RULE_SUB, subrule1, subrule2);
11441182
}
11451183

1184+
/* Compiler for the (split separator subpattern) PEG special; registered
 * under the name "split" in peg_specials.
 *
 * b    - builder that receives the compiled rule.
 * argc - number of arguments given to the special.
 * argv - argv[0] is the separator pattern, argv[1] is the pattern applied
 *        to each piece of text between separators.
 *
 * Emits a RULE_SPLIT rule whose two operands are the compiled separator
 * rule followed by the compiled sub-pattern rule, matching the
 * [rule, rule] layout that the RULE_SPLIT matcher reads as rule[1] and
 * rule[2]. */
static void spec_split(Builder *b, int32_t argc, const Janet *argv) {
1185+
/* Presumably rejects any call that does not pass exactly two arguments
 * (helper defined elsewhere) -- TODO confirm. */
peg_fixarity(b, argc, 2);
1186+
/* The slot for this rule is reserved before the operands are compiled;
 * 3 presumably covers the opcode plus the two rule operands. */
Reserve r = reserve(b, 3);
1187+
uint32_t subrule1 = peg_compile1(b, argv[0]);
1188+
uint32_t subrule2 = peg_compile1(b, argv[1]);
1189+
emit_2(r, RULE_SPLIT, subrule1, subrule2);
1190+
}
1191+
11461192
#ifdef JANET_INT_TYPES
11471193
#define JANET_MAX_READINT_WIDTH 8
11481194
#else
@@ -1226,6 +1272,7 @@ static const SpecialPair peg_specials[] = {
12261272
{"sequence", spec_sequence},
12271273
{"set", spec_set},
12281274
{"some", spec_some},
1275+
{"split", spec_split},
12291276
{"sub", spec_sub},
12301277
{"thru", spec_thru},
12311278
{"to", spec_to},
@@ -1562,6 +1609,7 @@ static void *peg_unmarshal(JanetMarshalContext *ctx) {
15621609
i += 4;
15631610
break;
15641611
case RULE_SUB:
1612+
case RULE_SPLIT:
15651613
/* [rule, rule] */
15661614
if (rule[1] >= blen) goto bad;
15671615
if (rule[2] >= blen) goto bad;

src/include/janet.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2141,7 +2141,8 @@ typedef enum {
21412141
RULE_COLUMN, /* [tag] */
21422142
RULE_UNREF, /* [rule, tag] */
21432143
RULE_CAPTURE_NUM, /* [rule, tag] */
2144-
RULE_SUB /* [rule, rule] */
2144+
RULE_SUB, /* [rule, rule] */
2145+
RULE_SPLIT /* [rule, rule] */
21452146
} JanetPegOpcod;
21462147

21472148
typedef struct {

test/suite-peg.janet

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@
265265
(marshpeg '(group "abc"))
266266
(marshpeg '(sub "abcdf" "abc"))
267267
(marshpeg '(* (sub 1 1)))
268+
(marshpeg '(split "," (+ "a" "b" "c")))
268269

269270
# Peg swallowing errors
270271
# 159651117
@@ -710,5 +711,45 @@
710711
"abcdef"
711712
@[])
712713

714+
(test "split: basic functionality"
715+
~(split "," '1)
716+
"a,b,c"
717+
@["a" "b" "c"])
718+
719+
(test "split: drops captures from separator pattern"
720+
~(split '"," '1)
721+
"a,b,c"
722+
@["a" "b" "c"])
723+
724+
(test "split: can match empty subpatterns"
725+
~(split "," ':w*)
726+
",a,,bar,,,c,,"
727+
@["" "a" "" "bar" "" "" "c" "" ""])
728+
729+
(test "split: subpattern is limited to only text before the separator"
730+
~(split "," '(to -1))
731+
"a,,bar,c"
732+
@["a" "" "bar" "c"])
733+
734+
(test "split: fails if any subpattern fails"
735+
~(split "," '"a")
736+
"a,a,b"
737+
nil)
738+
739+
(test "split: separator does not have to match anything"
740+
~(split "x" '(to -1))
741+
"a,a,b"
742+
@["a,a,b"])
743+
744+
(test "split: always consumes entire input"
745+
~(split 1 '"")
746+
"abc"
747+
@["" "" "" ""])
748+
749+
(test "split: separator can be an arbitrary PEG"
750+
~(split :s+ '(to -1))
751+
"a b c"
752+
@["a" "b" "c"])
753+
713754
(end-suite)
714755

0 commit comments

Comments
 (0)