Skip to content
This repository was archived by the owner on Jun 14, 2024. It is now read-only.

Commit 3ff478e

Browse files
Integrate feedback (1)
1 parent 14c529f commit 3ff478e

File tree

3 files changed

+111
-15
lines changed

3 files changed

+111
-15
lines changed

src/main/scala/com/microsoft/hyperspace/actions/CreateAction.scala

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,12 @@ class CreateAction(
6767
private def isValidIndexSchema(config: IndexConfig, schema: StructType): Boolean = {
6868
// First we flatten the schema. Instead of having struct of leaves
6969
// the flatten method will return a list of field names.
70-
// The second step is escaping the field names as Parquet does not works well
71-
// with field names that contains the `.` (dot).
72-
// Given `struct(nested, struct(nst, struct(field1)))`, the fields variable
70+
// The second step is escaping the field names as there are some problems when
71+
// using field names with dots. One is `partitionBy` does not works well
72+
// with field names that contains the `.` (dot). See more on this Apache Spark
73+
// ticket: https://issues.apache.org/jira/browse/SPARK-18084. Other is doing
74+
// encountering `AnalysisException: Cannot resolve column name...` exceptions.
75+
// So, given `struct(nested, struct(nst, struct(field1)))`, the fields variable
7376
// will contain `Seq("nested__nst__field1")`.
7477
val fields = SchemaUtils.escapeFieldNames(SchemaUtils.flatten(schema))
7578
// Resolve index config columns from available column names present in the schema.

src/main/scala/com/microsoft/hyperspace/util/SchemaUtils.scala

Lines changed: 63 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,11 @@
1616

1717
package com.microsoft.hyperspace.util
1818

19-
import org.apache.spark.sql.types.{ArrayType, StructField, StructType}
19+
import org.apache.spark.sql.types.{ArrayType, MapType, StructField, StructType}
2020

2121
object SchemaUtils {
2222

23+
val BACKTICK_MARKER_REGEX = "^`(.*)`$"
2324
val NESTED_FIELD_NEEDLE_REGEX = "\\."
2425
val NESTED_FIELD_REPLACEMENT = "__"
2526

@@ -31,6 +32,7 @@ object SchemaUtils {
3132
* root
3233
* |-- id: integer (nullable = true)
3334
* |-- name: string (nullable = true)
35+
* |-- nested.nst.field1: string (nullable = true)
3436
* |-- nested: struct (nullable = true)
3537
* | |-- field1: string (nullable = true)
3638
* | |-- nst: struct (nullable = true)
@@ -44,12 +46,16 @@ object SchemaUtils {
4446
* Seq(
4547
* "id",
4648
* "name",
49+
* "`nested.nst.field1`",
4750
* "nested.field1",
4851
* "nested.nst.field1",
4952
* "nested.nst.field2"
5053
* )
5154
* }}}
5255
*
56+
* As observed if there is a field that is not nested but contains `.` (dot)
57+
* that will be enclosed in backticks.
58+
*
5359
* @param structFields The struct fields we want to flatten. This can be a [[StructType]] too.
5460
* @param prefix Option where you can specify a prefix otherwise None.
5561
* @return The list for leaf fields flattened.
@@ -60,8 +66,18 @@ object SchemaUtils {
6066
flatten(fields, Some(prefix.map(o => s"$o.$name").getOrElse(name)))
6167
case StructField(name, ArrayType(StructType(fields), _), _, _) =>
6268
flatten(fields, Some(prefix.map(o => s"$o.$name").getOrElse(name)))
69+
case StructField(name, MapType(StructType(keys), StructType(values), _), _, _) =>
70+
flatten(keys, Some(prefix.map(o => s"$o.$name").getOrElse(name))) ++
71+
flatten(values, Some(prefix.map(o => s"$o.$name").getOrElse(name)))
6372
case other =>
64-
Seq(prefix.map(o => s"$o.${other.name}").getOrElse(other.name))
73+
if (other.name.contains(".")) {
74+
// first clean it, then prefix it, then again enclose it with backticks
75+
val cleanName = other.name.replaceAll(BACKTICK_MARKER_REGEX, "$1")
76+
val prefixed = prefix.map(o => s"$o.$cleanName").getOrElse(cleanName)
77+
Seq(s"`$prefixed`")
78+
} else {
79+
Seq(prefix.map(o => s"$o.${other.name}").getOrElse(other.name))
80+
}
6581
}
6682
}
6783

@@ -76,18 +92,27 @@ object SchemaUtils {
7692
}
7793

7894
/**
79-
* The method escapes the flattened field names.
95+
* The method escapes the flattened field name if not enclosed by backticks.
96+
*
97+
* The field names can escaped by enclosing them by backticks to specify
98+
* that the `.` (dot) does not mean a nested field.
8099
*
81100
* Given {{{nested.nst.field1}}} it will be escaped to {{{nested__nst__field1}}}.
101+
* Given {{{`nested.nst.field1`}}} it will remain as it is.
82102
*
83103
* The values used for search and replaced are defined under
84-
* [[NESTED_FIELD_NEEDLE_REGEX]] and [[NESTED_FIELD_REPLACEMENT]].
104+
* [[NESTED_FIELD_NEEDLE_REGEX]], [[NESTED_FIELD_REPLACEMENT]]
105+
* and [[backticksEnclosed]] method.
85106
*
86107
* @param field The flattened field name to be escaped.
87108
* @return The escaped field name.
88109
*/
89110
def escapeFieldName(field: String): String = {
90-
field.replaceAll(NESTED_FIELD_NEEDLE_REGEX, NESTED_FIELD_REPLACEMENT)
111+
if (backticksEnclosed(field)) {
112+
field
113+
} else {
114+
field.replaceAll(NESTED_FIELD_NEEDLE_REGEX, NESTED_FIELD_REPLACEMENT)
115+
}
91116
}
92117

93118
/**
@@ -101,10 +126,16 @@ object SchemaUtils {
101126
}
102127

103128
/**
104-
* The method unescapes the field name (returns the original field name).
129+
* The method unescapes the field name (returns the original field name)
130+
* if the field name is not enclosed by backticks.
131+
*
132+
* The field names can escaped by enclosing them by backticks to specify
133+
* that the `.` (dot) does not mean a nested field.
134+
*
105135
* The method is the inverse operation of [[escapeFieldName]] method.
106136
*
107137
* Given {{{nested__nst__field1}}} it will be escaped to {{{nested.nst.field1}}}.
138+
* Given {{{`nested__nst__field1`}}} it will remain as it is.
108139
*
109140
* The values used for search and replaced are defined under
110141
* [[NESTED_FIELD_NEEDLE_REGEX]] and [[NESTED_FIELD_REPLACEMENT]].
@@ -113,7 +144,11 @@ object SchemaUtils {
113144
* @return The original (unescaped) field name.
114145
*/
115146
def unescapeFieldName(field: String): String = {
116-
field.replaceAll(NESTED_FIELD_REPLACEMENT, NESTED_FIELD_NEEDLE_REGEX)
147+
if (backticksEnclosed(field)) {
148+
field
149+
} else {
150+
field.replaceAll(NESTED_FIELD_REPLACEMENT, NESTED_FIELD_NEEDLE_REGEX)
151+
}
117152
}
118153

119154
/**
@@ -131,14 +166,32 @@ object SchemaUtils {
131166
* The method checks if a field name represents a nested field.
132167
*
133168
* The check is implemented by checking if the field name string contains
134-
* the separator defined for nested field expressions.
169+
* the separator defined for nested field expressions and is not enclosed
170+
* by backticks.
135171
*
136-
* See [[NESTED_FIELD_NEEDLE_REGEX]].
172+
* The field names can escaped by enclosing them by backticks to specify
173+
* that the `.` (dot) does not mean a nested field.
137174
*
138-
* @param field The field nme
175+
* See [[NESTED_FIELD_NEEDLE_REGEX]] and [[backticksEnclosed]] method.
176+
*
177+
* @param field The field name
139178
* @return True if the field name represents a nested field otherwise false.
140179
*/
141180
def isNestedField(field: String): Boolean = {
181+
!backticksEnclosed(field) &&
142182
NESTED_FIELD_NEEDLE_REGEX.r.findFirstIn(field).isDefined
143183
}
184+
185+
/**
186+
* The method detects if the field is enclosed by backticks.
187+
*
188+
* The field names can escaped by enclosing them by backticks to specify
189+
* that the `.` (dot) does not mean a nested field.
190+
*
191+
* @param field The field name
192+
* @return True is the field name is enclosed by backticks.
193+
*/
194+
def backticksEnclosed(field: String): Boolean = {
195+
BACKTICK_MARKER_REGEX.r.findFirstIn(field).isDefined
196+
}
144197
}

src/test/scala/com/microsoft/hyperspace/util/SchemaUtilsTests.scala

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,21 @@ class SchemaUtilsTest extends SparkFunSuite with SparkInvolvedSuite {
106106
assert(flattened2(4) == "nested.n.n.nf1_b")
107107
assert(flattened2(5) == "nested.n.n.n.f1")
108108
assert(flattened2(6) == "nested.n.n.n.f2")
109+
110+
val df3 = Seq(
111+
(1, "f1", "g1", NestedType("n1", 1L)),
112+
(2, "f2", "g2", NestedType("n2", 1L)),
113+
(3, "f3", "g3", NestedType("n3", 1L))
114+
).toDF("id", "nested.f1", "`other.nested`", "nested")
115+
116+
val flattened3 = SchemaUtils.flatten(df3.schema)
117+
118+
assert(flattened3.length == 5)
119+
assert(flattened3(0) == "id")
120+
assert(flattened3(1) == "`nested.f1`")
121+
assert(flattened3(2) == "`other.nested`")
122+
assert(flattened3(3) == "nested.f1")
123+
assert(flattened3(4) == "nested.f2")
109124
}
110125

111126
test("flatten - array") {
@@ -176,25 +191,50 @@ class SchemaUtilsTest extends SparkFunSuite with SparkInvolvedSuite {
176191
assert(SchemaUtils.escapeFieldName("a.b") == "a__b")
177192
assert(SchemaUtils.escapeFieldName("a.b.c.d") == "a__b__c__d")
178193
assert(SchemaUtils.escapeFieldName("a_b_c_d") == "a_b_c_d")
194+
assert(SchemaUtils.escapeFieldName("`a.b.c.d`") == "`a.b.c.d`")
179195
}
180196

181197
test("escapeFieldNames") {
182198
assert(SchemaUtils.escapeFieldNames(
183-
Seq("a.b.c.d", "a.b", "A_B")) == Seq("a__b__c__d", "a__b", "A_B"))
199+
Seq("a.b.c.d", "a.b", "A_B", "`a.b`")) == Seq("a__b__c__d", "a__b", "A_B", "`a.b`"))
184200
assert(SchemaUtils.escapeFieldNames(Seq.empty[String]).isEmpty)
185201
}
186202

187203
test("unescapeFieldName") {
188204
assert(SchemaUtils.unescapeFieldName("a__b") == "a.b")
189205
assert(SchemaUtils.unescapeFieldName("a__b__c__d") == "a.b.c.d")
190206
assert(SchemaUtils.unescapeFieldName("a_b_c_d") == "a_b_c_d")
207+
assert(SchemaUtils.unescapeFieldName("`a__b__c__d`") == "`a__b__c__d`")
191208
}
192209

193210
test("unescapeFieldNames") {
194211
assert(SchemaUtils.unescapeFieldNames(
195-
Seq("a__b__c__d", "a__b", "A_B")) == Seq("a.b.c.d", "a.b", "A_B"))
212+
Seq("a__b__c__d", "a__b", "A_B", "`a__b`")) == Seq("a.b.c.d", "a.b", "A_B", "`a__b`"))
196213
assert(SchemaUtils.escapeFieldNames(Seq.empty[String]).isEmpty)
197214
}
215+
216+
test("isNestedField") {
217+
assert(SchemaUtils.isNestedField("a.b.c"))
218+
assert(SchemaUtils.isNestedField("`a.b.c"))
219+
assert(SchemaUtils.isNestedField("a.b.c`"))
220+
assert(!SchemaUtils.isNestedField("`a.b.c`"))
221+
assert(!SchemaUtils.isNestedField("`abc`"))
222+
assert(!SchemaUtils.isNestedField("`abc"))
223+
assert(!SchemaUtils.isNestedField("abc`"))
224+
}
225+
226+
test("hasNestedField") {
227+
assert(SchemaUtils.hasNestedFields(Seq("a.b.c", "abc")))
228+
assert(!SchemaUtils.hasNestedFields(Seq("`a.b.c`", "abc")))
229+
assert(!SchemaUtils.hasNestedFields(Seq.empty[String]))
230+
}
231+
232+
test("backticksEnclosed") {
233+
assert(!SchemaUtils.backticksEnclosed("a.b.c"))
234+
assert(!SchemaUtils.backticksEnclosed("`a.b.c"))
235+
assert(!SchemaUtils.backticksEnclosed("a.b.c`"))
236+
assert(SchemaUtils.backticksEnclosed("`a.b.c`"))
237+
}
198238
}
199239

200240
case class NestedType4(nf1_b: String, n: NestedType)

0 commit comments

Comments
 (0)