16
16
17
17
package com .microsoft .hyperspace .util
18
18
19
- import org .apache .spark .sql .types .{ArrayType , StructField , StructType }
19
+ import org .apache .spark .sql .types .{ArrayType , MapType , StructField , StructType }
20
20
21
21
object SchemaUtils {
22
22
23
// Pattern tested by backticksEnclosed to recognise a field name wrapped in backticks;
// its capture group holds the text between the two backticks.
+ val BACKTICK_MARKER_REGEX = " ^`(.*)`$"
23
24
// Regex searched for by escapeFieldName and isNestedField; unescapeFieldName passes it
// as the replacement argument of replaceAll.
val NESTED_FIELD_NEEDLE_REGEX = " \\ ."
24
25
// Text that escapeFieldName substitutes for every NESTED_FIELD_NEEDLE_REGEX match;
// unescapeFieldName searches for it to undo that substitution.
val NESTED_FIELD_REPLACEMENT = " __"
25
26
@@ -31,6 +32,7 @@ object SchemaUtils {
31
32
* root
32
33
* |-- id: integer (nullable = true)
33
34
* |-- name: string (nullable = true)
35
+ * |-- nested.nst.field1: string (nullable = true)
34
36
* |-- nested: struct (nullable = true)
35
37
* | |-- field1: string (nullable = true)
36
38
* | |-- nst: struct (nullable = true)
@@ -44,12 +46,16 @@ object SchemaUtils {
44
46
* Seq(
45
47
* "id",
46
48
* "name",
49
+ * "`nested.nst.field1`",
47
50
* "nested.field1",
48
51
* "nested.nst.field1",
49
52
* "nested.nst.field2"
50
53
* )
51
54
* }}}
52
55
*
56
+ * As shown above, a field that is not nested but whose name contains a `.` (dot)
57
+ * is returned enclosed in backticks.
58
+ *
53
59
* @param structFields The struct fields we want to flatten. This can be a [[StructType ]] too.
54
60
* @param prefix Option where you can specify a prefix otherwise None.
55
61
* @return The list for leaf fields flattened.
@@ -60,8 +66,18 @@ object SchemaUtils {
60
66
flatten(fields, Some (prefix.map(o => s " $o. $name" ).getOrElse(name)))
61
67
case StructField (name, ArrayType (StructType (fields), _), _, _) =>
62
68
flatten(fields, Some (prefix.map(o => s " $o. $name" ).getOrElse(name)))
69
+ case StructField (name, MapType (StructType (keys), StructType (values), _), _, _) =>
70
+ flatten(keys, Some (prefix.map(o => s " $o. $name" ).getOrElse(name))) ++
71
+ flatten(values, Some (prefix.map(o => s " $o. $name" ).getOrElse(name)))
63
72
case other =>
64
- Seq (prefix.map(o => s " $o. ${other.name}" ).getOrElse(other.name))
73
+ if (other.name.contains(" ." )) {
74
+ // first clean it, then prefix it, then again enclose it with backticks
75
+ val cleanName = other.name.replaceAll(BACKTICK_MARKER_REGEX , " $1" )
76
+ val prefixed = prefix.map(o => s " $o. $cleanName" ).getOrElse(cleanName)
77
+ Seq (s " ` $prefixed` " )
78
+ } else {
79
+ Seq (prefix.map(o => s " $o. ${other.name}" ).getOrElse(other.name))
80
+ }
65
81
}
66
82
}
67
83
@@ -76,18 +92,27 @@ object SchemaUtils {
76
92
}
77
93
78
94
/**
79
- * The method escapes the flattened field names.
95
+ * The method escapes the flattened field name if not enclosed by backticks.
96
+ *
97
+ * The field names can be escaped by enclosing them in backticks to specify
98
+ * that the `.` (dot) does not mean a nested field.
80
99
*
81
100
* Given {{{nested.nst.field1}}} it will be escaped to {{{nested__nst__field1}}}.
101
+ * Given {{{`nested.nst.field1`}}} it will remain as it is.
82
102
*
83
103
* The values used for search and replaced are defined under
84
- * [[NESTED_FIELD_NEEDLE_REGEX ]] and [[NESTED_FIELD_REPLACEMENT ]].
104
+ * [[NESTED_FIELD_NEEDLE_REGEX ]], [[NESTED_FIELD_REPLACEMENT ]]
105
+ * and [[backticksEnclosed ]] method.
85
106
*
86
107
* @param field The flattened field name to be escaped.
87
108
* @return The escaped field name.
88
109
*/
89
110
def escapeFieldName(field: String): String = {
  // A name wrapped in backticks is treated as a literal column name and returned untouched;
  // any other name has each NESTED_FIELD_NEEDLE_REGEX match rewritten to
  // NESTED_FIELD_REPLACEMENT.
  field match {
    case quoted if backticksEnclosed(quoted) => quoted
    case plain => plain.replaceAll(NESTED_FIELD_NEEDLE_REGEX, NESTED_FIELD_REPLACEMENT)
  }
}
92
117
93
118
/**
@@ -101,10 +126,16 @@ object SchemaUtils {
101
126
}
102
127
103
128
/**
104
- * The method unescapes the field name (returns the original field name).
129
+ * The method unescapes the field name (returns the original field name)
130
+ * if the field name is not enclosed by backticks.
131
+ *
132
+ * The field names can be escaped by enclosing them in backticks to specify
133
+ * that the `.` (dot) does not mean a nested field.
134
+ *
105
135
* The method is the inverse operation of [[escapeFieldName ]] method.
106
136
*
107
137
* Given {{{nested__nst__field1}}} it will be unescaped to {{{nested.nst.field1}}}.
138
+ * Given {{{`nested__nst__field1`}}} it will remain as it is.
108
139
*
109
140
* The values used for search and replaced are defined under
110
141
* [[NESTED_FIELD_NEEDLE_REGEX ]] and [[NESTED_FIELD_REPLACEMENT ]].
@@ -113,7 +144,11 @@ object SchemaUtils {
113
144
* @return The original (unescaped) field name.
114
145
*/
115
146
def unescapeFieldName(field: String): String = {
  // Backtick-enclosed names were never escaped, so they are handed back unchanged.
  val isLiteralName = backticksEnclosed(field)
  if (!isLiteralName) {
    // NOTE(review): the needle regex constant is passed here as the *replacement* string;
    // this presumably relies on replaceAll treating a backslash in the replacement as an
    // escape for the following character - confirm before changing either constant.
    field.replaceAll(NESTED_FIELD_REPLACEMENT, NESTED_FIELD_NEEDLE_REGEX)
  } else {
    field
  }
}
118
153
119
154
/**
@@ -131,14 +166,32 @@ object SchemaUtils {
131
166
* The method checks if a field name represents a nested field.
132
167
*
133
168
* The check is implemented by checking if the field name string contains
134
- * the separator defined for nested field expressions.
169
+ * the separator defined for nested field expressions and is not enclosed
170
+ * by backticks.
135
171
*
136
- * See [[NESTED_FIELD_NEEDLE_REGEX ]].
172
+ * The field names can be escaped by enclosing them in backticks to specify
173
+ * that the `.` (dot) does not mean a nested field.
137
174
*
138
- * @param field The field nme
175
+ * See [[NESTED_FIELD_NEEDLE_REGEX ]] and [[backticksEnclosed ]] method.
176
+ *
177
+ * @param field The field name
139
178
* @return True if the field name represents a nested field otherwise false.
140
179
*/
141
180
def isNestedField(field: String): Boolean = {
  if (backticksEnclosed(field)) {
    // A backtick-enclosed name is never reported as nested, whatever it contains.
    false
  } else {
    // Nested when the separator pattern occurs anywhere in the name.
    NESTED_FIELD_NEEDLE_REGEX.r.findFirstIn(field).nonEmpty
  }
}
184
+
185
+ /**
186
+ * The method detects if the field is enclosed by backticks.
187
+ *
188
+ * The field names can be escaped by enclosing them in backticks to specify
189
+ * that the `.` (dot) does not mean a nested field.
190
+ *
191
+ * @param field The field name
192
+ * @return True if the field name is enclosed by backticks.
193
+ */
194
def backticksEnclosed(field: String): Boolean = {
  // True when BACKTICK_MARKER_REGEX finds a match in the given name.
  BACKTICK_MARKER_REGEX.r.pattern.matcher(field).find()
}
144
197
}
0 commit comments