Skip to content

Commit

Permalink
[SPARK-26402][SQL] Accessing nested fields with different cases in ca…
Browse files Browse the repository at this point in the history
…se insensitive mode

## What changes were proposed in this pull request?

GetStructField with different optional names should be semantically equal. We will use this as building block to compare the nested fields used in the plans to be optimized by catalyst optimizer.

This PR also fixes a bug below that accessing nested fields with different cases in case insensitive mode will result `AnalysisException`.

```
sql("create table t (s struct<i: Int>) using json")
sql("select s.I from t group by s.i")
```
which is currently failing
```
org.apache.spark.sql.AnalysisException: expression 'default.t.`s`' is neither present in the group by, nor is it an aggregate function
```
as cloud-fan pointed out.

## How was this patch tested?

New tests are added.

Closes apache#23353 from dbtsai/nestedEqual.

Lead-authored-by: DB Tsai <[email protected]>
Co-authored-by: DB Tsai <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
  • Loading branch information
2 people authored and dongjoon-hyun committed Dec 22, 2018
1 parent 90a8103 commit a5a24d9
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ package org.apache.spark.sql.catalyst.expressions
*
* The following rules are applied:
* - Names and nullability hints for [[org.apache.spark.sql.types.DataType]]s are stripped.
* - Names for [[GetStructField]] are stripped.
* - Commutative and associative operations ([[Add]] and [[Multiply]]) have their children ordered
* by `hashCode`.
* - [[EqualTo]] and [[EqualNullSafe]] are reordered by `hashCode`.
Expand All @@ -37,10 +38,11 @@ object Canonicalize {
expressionReorder(ignoreNamesTypes(e))
}

/** Remove names and nullability from types. */
/** Remove names and nullability from types, and names from `GetStructField`. */
private[expressions] def ignoreNamesTypes(e: Expression): Expression = e match {
case a: AttributeReference =>
AttributeReference("none", a.dataType.asNullable)(exprId = a.exprId)
case GetStructField(child, ordinal, Some(_)) => GetStructField(child, ordinal, None)
case _ => e
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.logical.Range
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class CanonicalizeSuite extends SparkFunSuite {

Expand Down Expand Up @@ -50,4 +51,32 @@ class CanonicalizeSuite extends SparkFunSuite {
assert(range.where(arrays1).sameResult(range.where(arrays2)))
assert(!range.where(arrays1).sameResult(range.where(arrays3)))
}

test("SPARK-26402: accessing nested fields with different cases in case insensitive mode") {
val expId = NamedExpression.newExprId
val qualifier = Seq.empty[String]
val structType = StructType(
StructField("a", StructType(StructField("b", IntegerType, false) :: Nil), false) :: Nil)

// GetStructField with different names are semantically equal
val fieldA1 = GetStructField(
AttributeReference("data1", structType, false)(expId, qualifier),
0, Some("a1"))
val fieldA2 = GetStructField(
AttributeReference("data2", structType, false)(expId, qualifier),
0, Some("a2"))
assert(fieldA1.semanticEquals(fieldA2))

val fieldB1 = GetStructField(
GetStructField(
AttributeReference("data1", structType, false)(expId, qualifier),
0, Some("a1")),
0, Some("b1"))
val fieldB2 = GetStructField(
GetStructField(
AttributeReference("data2", structType, false)(expId, qualifier),
0, Some("a2")),
0, Some("b2"))
assert(fieldB1.semanticEquals(fieldB2))
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLite
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class BinaryComparisonSimplificationSuite extends PlanTest with PredicateHelper {

Expand Down Expand Up @@ -92,4 +93,33 @@ class BinaryComparisonSimplificationSuite extends PlanTest with PredicateHelper
val correctAnswer = nonNullableRelation.analyze
comparePlans(actual, correctAnswer)
}

test("SPARK-26402: accessing nested fields with different cases in case insensitive mode") {
val expId = NamedExpression.newExprId
val qualifier = Seq.empty[String]
val structType = StructType(
StructField("a", StructType(StructField("b", IntegerType, false) :: Nil), false) :: Nil)

val fieldA1 = GetStructField(
GetStructField(
AttributeReference("data1", structType, false)(expId, qualifier),
0, Some("a1")),
0, Some("b1"))
val fieldA2 = GetStructField(
GetStructField(
AttributeReference("data2", structType, false)(expId, qualifier),
0, Some("a2")),
0, Some("b2"))

// GetStructField with different names are semantically equal; thus, `EqualTo(fieldA1, fieldA2)`
// will be optimized to `TrueLiteral` by `SimplifyBinaryComparison`.
val originalQuery = nonNullableRelation
.where(EqualTo(fieldA1, fieldA2))
.analyze

val optimized = Optimize.execute(originalQuery)
val correctAnswer = nonNullableRelation.analyze

comparePlans(optimized, correctAnswer)
}
}
19 changes: 19 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2937,6 +2937,25 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
}
}
}

test("SPARK-26402: accessing nested fields with different cases in case insensitive mode") {
withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") {
val msg = intercept[AnalysisException] {
withTable("t") {
sql("create table t (s struct<i: Int>) using json")
checkAnswer(sql("select s.I from t group by s.i"), Nil)
}
}.message
assert(msg.contains("No such struct field I in i"))
}

withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") {
withTable("t") {
sql("create table t (s struct<i: Int>) using json")
checkAnswer(sql("select s.I from t group by s.i"), Nil)
}
}
}
}

case class Foo(bar: Option[String])

0 comments on commit a5a24d9

Please sign in to comment.