Skip to content

Commit

Permalink
[SPARK-26248][SQL] Infer date type from CSV
Browse files Browse the repository at this point in the history
## What changes were proposed in this pull request?

The `CSVInferSchema` class is extended to support inferring `DateType` from CSV input. Inference of `DateType` is attempted only after the attempt to infer `TimestampType` has failed.

## How was this patch tested?

Added a new test for inferring date types from CSV. The change was also tested by existing suites such as `CSVInferSchemaSuite`, `CsvExpressionsSuite`, `CsvFunctionsSuite` and `CsvSuite`.

Closes apache#23202 from MaxGekk/csv-date-inferring.

Lead-authored-by: Maxim Gekk <[email protected]>
Co-authored-by: Maxim Gekk <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
  • Loading branch information
2 people authored and cloud-fan committed Dec 17, 2018
1 parent e3e33d8 commit 5217f7b
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,20 @@ import scala.util.control.Exception.allCatch
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.analysis.TypeCoercion
import org.apache.spark.sql.catalyst.expressions.ExprUtils
import org.apache.spark.sql.catalyst.util.TimestampFormatter
import org.apache.spark.sql.catalyst.util.{DateFormatter, TimestampFormatter}
import org.apache.spark.sql.types._

class CSVInferSchema(val options: CSVOptions) extends Serializable {

@transient
private lazy val timestampParser = TimestampFormatter(
private lazy val timestampFormatter = TimestampFormatter(
options.timestampFormat,
options.timeZone,
options.locale)
@transient
private lazy val dateFormatter = DateFormatter(
options.dateFormat,
options.locale)

private val decimalParser = {
ExprUtils.getDecimalParser(options.locale)
Expand Down Expand Up @@ -104,6 +108,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
compatibleType(typeSoFar, tryParseDecimal(field)).getOrElse(StringType)
case DoubleType => tryParseDouble(field)
case TimestampType => tryParseTimestamp(field)
case DateType => tryParseDate(field)
case BooleanType => tryParseBoolean(field)
case StringType => StringType
case other: DataType =>
Expand Down Expand Up @@ -159,9 +164,16 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
}

private def tryParseTimestamp(field: String): DataType = {
// This case infers a custom `dataFormat` is set.
if ((allCatch opt timestampParser.parse(field)).isDefined) {
if ((allCatch opt timestampFormatter.parse(field)).isDefined) {
TimestampType
} else {
tryParseDate(field)
}
}

private def tryParseDate(field: String): DataType = {
if ((allCatch opt dateFormatter.parse(field)).isDefined) {
DateType
} else {
tryParseBoolean(field)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,4 +187,22 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper {

Seq("en-US", "ko-KR", "ru-RU", "de-DE").foreach(checkDecimalInfer(_, DecimalType(7, 0)))
}

test("inferring date type") {
  // Builds a fresh schema inferrer for the given CSV options. Column pruning is
  // disabled and the default time zone is GMT for every case in this test.
  def inferrerFor(csvParameters: Map[String, String]): CSVInferSchema =
    new CSVInferSchema(
      new CSVOptions(csvParameters, columnPruning = false, defaultTimeZoneId = "GMT"))

  // A slash-separated date pattern.
  val slashDates = inferrerFor(Map("dateFormat" -> "yyyy/MM/dd"))
  assert(slashDates.inferField(NullType, "2018/12/02") == DateType)

  // A pattern made of a month name and a year only.
  val monthYearDates = inferrerFor(Map("dateFormat" -> "MMM yyyy"))
  assert(monthYearDates.inferField(NullType, "Dec 2018") == DateType)

  // With both formats configured, a value matching the timestamp pattern is inferred
  // as a timestamp, and a value matching only the date pattern is inferred as a date.
  val dateAndTimestamp = inferrerFor(
    Map("dateFormat" -> "yyyy-MM-dd", "timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss"))
  assert(dateAndTimestamp.inferField(NullType, "2018-12-03T11:00:00") == TimestampType)
  assert(dateAndTimestamp.inferField(NullType, "2018-12-03") == DateType)
}
}

0 comments on commit 5217f7b

Please sign in to comment.