-
Notifications
You must be signed in to change notification settings - Fork 78
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve Schema Diff Error Message #160
Changes from 4 commits
9db07d8
7e9491e
c8c15e5
cbc01e7
857f59e
081fd5b
36b8dfa
bcfb255
606355b
90b0d77
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,35 +1,50 @@ | ||
package com.github.mrpowers.spark.fast.tests | ||
|
||
import com.github.mrpowers.spark.fast.tests.ufansi.Color.{DarkGray, Green, Red} | ||
import com.github.mrpowers.spark.fast.tests.ufansi.FansiExtensions.StrOps | ||
import org.apache.commons.lang3.StringUtils | ||
import org.apache.spark.sql.Row | ||
import com.github.mrpowers.spark.fast.tests.ufansi.FansiExtensions.StrOps | ||
object DataframeUtil { | ||
|
||
private[mrpowers] def showDataframeDiff( | ||
import scala.reflect.ClassTag | ||
|
||
object ProductUtil { | ||
private[mrpowers] def productOrRowToSeq(product: Any): Seq[Any] = { | ||
product match { | ||
case r: Row => r.toSeq | ||
case p: Product => p.productIterator.toSeq | ||
case null => Seq.empty | ||
case s => Seq(s) | ||
} | ||
} | ||
private[mrpowers] def showProductDiff[T: ClassTag]( | ||
zeotuan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
header: (String, String), | ||
actual: Seq[Row], | ||
expected: Seq[Row], | ||
actual: Seq[T], | ||
expected: Seq[T], | ||
truncate: Int = 20, | ||
minColWidth: Int = 3 | ||
minColWidth: Int = 3, | ||
defaultVal: T = null.asInstanceOf[T] | ||
): String = { | ||
|
||
val runTimeClass = implicitly[ClassTag[T]].runtimeClass | ||
val className = runTimeClass.getSimpleName | ||
zeotuan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
val border = if (runTimeClass == classOf[Row]) ("[", "]") else ("(", ")") | ||
val prodToString: Seq[Any] => String = s => s.mkString(s"$className${border._1}", ",", border._2) | ||
val emptyProd = "MISSING" | ||
|
||
val sb = new StringBuilder | ||
|
||
val fullJoin = actual.zipAll(expected, Row(), Row()) | ||
val fullJoin = actual.zipAll(expected, defaultVal, defaultVal) | ||
zeotuan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
val diff = fullJoin.map { case (actualRow, expectedRow) => | ||
if (equals(actualRow, expectedRow)) { | ||
if (actualRow == expectedRow) { | ||
List(DarkGray(actualRow.toString), DarkGray(expectedRow.toString)) | ||
} else { | ||
val actualSeq = actualRow.toSeq | ||
val expectedSeq = expectedRow.toSeq | ||
val actualSeq = productOrRowToSeq(actualRow) | ||
val expectedSeq = productOrRowToSeq(expectedRow) | ||
if (actualSeq.isEmpty) | ||
List( | ||
Red("[]"), | ||
Green(expectedSeq.mkString("[", ",", "]")) | ||
) | ||
List(Red(emptyProd), Green(prodToString(expectedSeq))) | ||
else if (expectedSeq.isEmpty) | ||
List(Red(actualSeq.mkString("[", ",", "]")), Green("[]")) | ||
List(Red(prodToString(actualSeq)), Green(emptyProd)) | ||
else { | ||
val withEquals = actualSeq | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would suggest skipping this part if the Type is not Row, if it´s There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I thought by this point we only work with in memory data - Seq[T] ? |
||
.zip(expectedSeq) | ||
|
@@ -38,22 +53,18 @@ object DataframeUtil { | |
} | ||
val allFieldsAreNotEqual = !withEquals.exists(_._3) | ||
if (allFieldsAreNotEqual) { | ||
List( | ||
Red(actualSeq.mkString("[", ",", "]")), | ||
Green(expectedSeq.mkString("[", ",", "]")) | ||
) | ||
List(Red(prodToString(actualSeq)), Green(prodToString(expectedSeq))) | ||
} else { | ||
|
||
val coloredDiff = withEquals | ||
.map { | ||
case (actualRowField, expectedRowField, true) => | ||
(DarkGray(actualRowField.toString), DarkGray(expectedRowField.toString)) | ||
case (actualRowField, expectedRowField, false) => | ||
(Red(actualRowField.toString), Green(expectedRowField.toString)) | ||
} | ||
val start = DarkGray("[") | ||
val start = DarkGray(s"$className${border._1}") | ||
val sep = DarkGray(",") | ||
val end = DarkGray("]") | ||
val end = DarkGray(border._2) | ||
List( | ||
coloredDiff.map(_._1).mkStr(start, sep, end), | ||
coloredDiff.map(_._2).mkStr(start, sep, end) | ||
|
@@ -69,11 +80,12 @@ object DataframeUtil { | |
val colWidths = Array.fill(numCols)(minColWidth) | ||
|
||
// Compute the width of each column | ||
for ((cell, i) <- headerSeq.zipWithIndex) { | ||
headerSeq.zipWithIndex.foreach({ case (cell, i) => | ||
colWidths(i) = math.max(colWidths(i), cell.length) | ||
} | ||
for (row <- diff) { | ||
for ((cell, i) <- row.zipWithIndex) { | ||
}) | ||
|
||
diff.foreach { row => | ||
row.zipWithIndex.foreach { case (cell, i) => | ||
colWidths(i) = math.max(colWidths(i), cell.length) | ||
} | ||
} | ||
|
@@ -117,5 +129,4 @@ object DataframeUtil { | |
|
||
sb.toString | ||
} | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don
t see the improvement, any Dataset[T] can be transformed into a DataFrame, the only change here is that it will work with the original Class
T` that was provided, but the result will be the same, only with the Class name prefixing each line. If the only thing was to prevent the .asRows method we could have done this and not any other change will be needed.I see this as more unified and we wont have to check if its a Product or not.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ShowProductDiff
is an update/extension to the originalshowDataframeDiff
so we can have a nicely format table display for anyCase Class
and additionally Row. This was done so we have nicely format table forStructField
And since
showProductDiff
can now also display differences for anyCase Class
I figured we no longer need to convert aDataset[T]
into aDataframe
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All these changes could be simplified by only adding a single parameter in the header because at the end the processing doesn't care if it's a row or a case class, it only requires the list of elements:
Use these 3 elements to make the representation, and when you call this function, pass the class name if it's not a Row.
I don't see any other benefit of getting the element as
T
because the comparison is not done here.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
since we also use it for Seq[StructField] for displaying schema fields won't we have to make actual and expected
Seq[Any]
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you don't want to do this here, move it inside
showDataframeDiff
, you can put the type parameter but you ar not forced to use it. But I see it weird to have the signature of your function as:There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I prefer putting it in
showProductDiff
since it will also need to be onbetterSchemaMismatchMessage
. Do you think we should still useSeq[T]
? I think that would make it at least safer for the input Type. ofcourse unless someone decided to use AnyThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok to change the name, it's called from a dataset method. To keep the
[T]
, this is not a public method, it carries the type and the compiler will validate the schema of the actual and expected dataframe. The only problem is if the conversion to DataSet[T] is not correct, e.g. :If the table doesn't match, the error will be raised with an exception in the
.as[T]
method in runtime.Twhy reason I say if we detect a non-Row DataSet, to skschema validation schema because it's already done.
I see it simpler if you know the exact type of the data as a Row only, and you only have one way to work with it. If you still think this could be good I would suggest first making more tests, not only with products but also with datasets of only one type like
DataSet[String]
DataSet[Int]
DataSet[Timestamp]
DataSet[Array[Int]]. These types are not products and in some cases, I don't know how will it work.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah I see. I forgot that people can total do
Dataset[Array[Int]]
. I did some test, and it support single value type correctly but notArray[Int]
.Actually, currently our main branch also has issue with
Dataset[Seq[Int]]
. let's me try fixing thatThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@alfonsorr I fixed issue with Dataset[Array[Int]] and also add test cases for String, Int, Array, Seq cases