-
Notifications
You must be signed in to change notification settings - Fork 4
/
cqp.ebnf
175 lines (136 loc) · 5.5 KB
/
cqp.ebnf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
// ==========================================
// EBNF for CQP (noSke dialect mostly)
// ==========================================
//
// For details on the dialect, see /svn/intern/incubator/cqp2sparql/docs/
// Maintainer: Max Ionov, max.ionov@gmail.com
// Contributors: Max Ionov, Florian Stein
// Lifehack: set syntax highlighting to C or C++, it's good enough and it makes the grammar much more readable
//
// TODO:
// 1. Quantification for items
// 2. Better support for constraints
// ==========================================
// Entry point
// ==========================================
// The essence of a CQP query is a set of items (the query) plus optional global constraints on those items
// Top-level rule: the item query, optionally followed by `and` / `&` or `::` and the global constraints.
start: query_outside [ ( _AND | "::" ) constraints ]
// First, query:
// Query consists of items, segments and operations `within` and `containing`.
// Each item denotes a word (or several words if there is a quantifier), possibly with some conditions
// `expr` is a single item in items.
// It can be:
// 1. A (named) word ([...])
// 2. A segment name (<powla:sentence/>)
// or, recursively:
// 3. An `expr` (not) within another item or (not) containing another `expr` ([conll:POS="olia:Verb"] within <powla:sentence/>)
// 4. A query enclosed in brackets (recursion one level up)
//
// Note: the sequence [] [] has higher priority than `within` and `containing`:
// [] [word="the"] within [word="obtain"] [] containing [word="the"] evaluates [word="obtain"] + [] before within and containing
// The same goes for <segment/> <segment/> (<s/> <s/> containing [lc="баба"] [lc="катя"])
// OR operation is in the middle (<s/> <s/> containing [lc="тетя"] | [lc="баба"] [lc="катя"])
// (tested on noSketch Engine)
// Wraps the parsed query; the alias names the resulting node `query`.
query_outside: query    -> query
// Left-recursive chain of `within` / `containing` filters (and their `!`-prefixed negations)
// applied to alternations. The leading `?` inlines this rule when it matches a bare `altern`.
?query: altern
      | query "within"      altern    -> within
      | query "!within"     altern    -> not_within
      | query "containing"  altern    -> containing
      | query "!containing" altern    -> not_containing
// One or more `expr` alternatives separated by `or` / `|`; inlined when there is only one.
?altern: expr ( _OR expr )*
// // with redundant word-node after sequence
// word: var_name? "[" word_cond "]"
// ?word_or_seq: word
// | word "*" -> sequence
// | word "+" -> non_empty_sequence
// | word "{" lower "," upper "}" -> limited
// | word "{" exect_number "}" -> exect_num
// | word "?" -> optinal
// // minimal-solution without word-node after sequence
// ?word_or_seq: word
// | hidden_word "*" -> sequence
// | hidden_word "+" -> non_empty_sequence
// | hidden_word "{" lower "," upper "}" -> limited
// | hidden_word "{" exect_number "}" -> exect_num
// | hidden_word "?" -> optinal
// ?hidden_word: var_name? "[" word_cond "]"
// word: hidden_word -> word
// Accepts constructs such as ([] [])* which CQP itself actually does not accept
// A plain word, or a quantified word / parenthesised expression.
// The open-ended bounds `{n,}` and `{,m}` are written with separate "{", "," and "}" literals
// so that ignored whitespace is accepted in every bounded form (`{2, }` as well as `{2, 3}`).
// Alias names are left unchanged so that consumers of the parse tree keep working.
?word_or_seq: word
            | word_or_brack "*"                        -> sequence
            | word_or_brack "+"                        -> non_empty_sequence
            | word_or_brack "{" lower "," upper "}"    -> limited
            | word_or_brack "{" lower "," "}"          -> limited_lower
            | word_or_brack "{" "," upper "}"          -> limited_upper
            | word_or_brack "{" exect_number "}"       -> exect_num
            | word_or_brack "?"                        -> optinal
// Operand of a quantifier: a single word or a parenthesised `expr`.
// (`word` could be swapped for a hidden_word rule to avoid a redundant word node.)
?word_or_brack: word
              | "(" expr ")"
// A single item: an optional `label:` prefix followed by conditions in square brackets.
word: ( var_name ":" )? "[" word_cond "]"    -> word
// ?hidden_word: var_name? "[" word_cond "]"
// word: hidden_word -> word
// Numeric bounds used inside the `{...}` quantifiers; each is renamed by its alias in the tree.
lower:        NUMBER    -> lower_limit
upper:        NUMBER    -> upper_limit
exect_number: NUMBER    -> count
// A run of words, a run of segments, a concatenation of two `expr`s, or a bracketed query.
// NOTE(review): `expr expr` overlaps with `word_or_seq+` / `segment+`, and `"(" query ")"`
// overlaps with `"(" expr ")"` in `word_or_brack`, so some inputs appear derivable in more
// than one way — confirm how the parser in use resolves this before reordering alternatives.
?expr: word_or_seq+ -> words
| segment+ -> segments
| expr expr -> words
| "(" query ")"
// `word_cond` is the set of conditions inside a single (named) word ([...])
// It can be empty or consist of a boolean expression
// The disjunction is optional, so empty brackets (`[]`) are accepted.
word_cond: [disjunction]
// Boolean structure inside `[...]`: `or` / `|` binds looser than `and` / `&`.
// Both rules are inlined when they contain a single operand.
?disjunction: conjunction ( _OR  conjunction )*
?conjunction: condition   ( _AND condition   )*
// An (in)equality test on one parameter, a parenthesised sub-expression,
// or the `!` negation of a single condition.
?condition: param "="  value       -> eq
          | param "!=" value       -> neq
          | "(" disjunction ")"
          | "!" condition          -> neg
// Definitions of elements inside one component of a boolean expression denoting `word`
param: TOKEN             // left-hand side of a condition
value: QUOTED_STRING     // right-hand side: a single- or double-quoted string
// Now, constraints:
// constraints: condition
// Root of the global constraints: a boolean expression over labelled words.
constraints: c_disjunc    -> constraints
// | "(" constraints ")"
// | (_NOT) "(" constraints ")" -> c_neq
// Same precedence layering as for word conditions: `or` / `|` over `and` / `&`.
?c_disjunc: c_conjunc  ( _OR  c_conjunc  )*    // alias `c_or` currently disabled
?c_conjunc: constraint ( _AND constraint )*    // alias `c_and` currently disabled
// An (in)equality between two word parameters, a parenthesised sub-expression,
// or the `!` negation of a single constraint.
// Negation applies to one `constraint` (mirroring `"!" condition` in the word conditions),
// so `!a=b | c=d` negates only `a=b`; use parentheses to negate a larger expression.
?constraint: word_param "="  word_param    -> c_eq
           | word_param "!=" word_param    -> c_neq
           | "(" c_disjunc ")"
           | "!" constraint                -> c_neg
// word_param: var_name "." c_prefix ":" c_param -> term
// A reference to a labelled word, optionally narrowed to one of its parameters (`label.param`).
// Both forms produce a `term` node.
word_param: var_name "." c_param    -> term
          | var_name                -> term
// v_name: CNAME -> v_name
// Parameter name used after the dot in a constraint term.
c_param: TOKEN    -> c_param
// c_prefix: CNAME -> c_prefix
// ==========================================
// Basic definitions
// ==========================================
// -------- CQP-specific definitions --------
// 1. Defining labels for words (not only digits but any alphanumeric with a colon)
// 2. Defining segments
// Word label: either an identifier or a number.
var_name: CNAME
        | NUMBER
// A structural segment, matched as one terminal.
segment: SEGMENT_NAME
// Opening tag, closing tag, self-closing tag, or the anonymous closing tag `</>`.
SEGMENT_NAME: "<"  TOKEN ">"
            | "</" TOKEN ">"
            | "<"  TOKEN "/>"
            | "</>"
// -------- Unifying different styles of logical primitives, to give the user more freedom --------
// Word and symbol spellings of each operator; the leading underscore keeps them out of the parse tree.
_OR:  "or"  | "|"
_AND: "and" | "&"
_NOT: "not" | "!"
// -------- Defining primitives --------
// One or more ASCII letters, digits, underscores or colons (so prefixed names such as `conll:POS` fit in one token).
TOKEN: /[A-Za-z0-9_:]+/
// A string literal delimited by double or single quotes.
QUOTED_STRING: ESCAPED_STRING_DBL | ESCAPED_STRING_SGL
// Inside double quotes: a backslash-escaped double quote, or any single character other than `"`.
STRING_INNER_DBL: ("\\\""|/[^"]/)
ESCAPED_STRING_DBL: "\"" STRING_INNER_DBL* "\""
// Inside single quotes: a backslash-escaped single quote, or any single character other than `'`.
STRING_INNER_SGL: ("\\'"|/[^']/)
ESCAPED_STRING_SGL: "'" STRING_INNER_SGL* "'"
// -------- Importing basic data types and ignoring whitespaces --------
%import common.CNAME
%import common.WS
%ignore WS
%import common.NUMBER