From a1d02f36acdabd947911194260ab5f2ef8feda79 Mon Sep 17 00:00:00 2001 From: Tiewei Fang <43782773+BePPPower@users.noreply.github.com> Date: Fri, 18 Nov 2022 14:17:02 +0800 Subject: [PATCH] [feature](table-valued-function) support `hdfs()` tvf (#14213) This pr does two things: 1. support `hdfs()` table valued function. 2. add regression test --- .licenserc.yaml | 1 + .../scripts/csv_format_test/all_types.csv | 120 ++++++++ .../csv_format_test/array_malformat.csv | 5 + .../scripts/csv_format_test/array_normal.csv | 9 + .../hive/scripts/csv_format_test/student.csv | 10 + .../csv_format_test/student_with_names.csv | 11 + .../student_with_names_and_types.csv | 12 + .../hive/scripts/hive-metastore.sh | 4 +- .../json_format_test/one_array_json.json | 52 ++++ .../json_format_test/simple_object_json.json | 4 +- .../org/apache/doris/backup/S3Storage.java | 4 +- .../planner/external/QueryScanProvider.java | 17 +- .../planner/external/TVFScanProvider.java | 4 + .../ExternalFileTableValuedFunction.java | 72 +++-- .../HdfsTableValuedFunction.java | 116 +++++++ .../tablefunction/S3TableValuedFunction.java | 58 +++- .../tablefunction/TableValuedFunctionIf.java | 2 + .../table_valued_function/test_hdfs_tvf.out | 289 ++++++++++++++++++ .../stream_load/test_hdfs_json_load.out | 28 +- .../test_hdfs_tvf.groovy | 199 ++++++++++++ 20 files changed, 962 insertions(+), 55 deletions(-) create mode 100644 docker/thirdparties/docker-compose/hive/scripts/csv_format_test/all_types.csv create mode 100644 docker/thirdparties/docker-compose/hive/scripts/csv_format_test/array_malformat.csv create mode 100644 docker/thirdparties/docker-compose/hive/scripts/csv_format_test/array_normal.csv create mode 100644 docker/thirdparties/docker-compose/hive/scripts/csv_format_test/student.csv create mode 100644 docker/thirdparties/docker-compose/hive/scripts/csv_format_test/student_with_names.csv create mode 100644 
docker/thirdparties/docker-compose/hive/scripts/csv_format_test/student_with_names_and_types.csv create mode 100644 docker/thirdparties/docker-compose/hive/scripts/json_format_test/one_array_json.json create mode 100644 fe/fe-core/src/main/java/org/apache/doris/tablefunction/HdfsTableValuedFunction.java create mode 100644 regression-test/data/correctness_p0/table_valued_function/test_hdfs_tvf.out create mode 100644 regression-test/suites/correctness_p0/table_valued_function/test_hdfs_tvf.groovy diff --git a/.licenserc.yaml b/.licenserc.yaml index 1d1d35a65ac638..d6554af369a7b7 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -27,6 +27,7 @@ header: - "**/test_data/**" - "**/jmockit/**" - "**/*.json" + - "**/*.csv" - "**/*.dat" - "**/*.svg" - "**/*.md5" diff --git a/docker/thirdparties/docker-compose/hive/scripts/csv_format_test/all_types.csv b/docker/thirdparties/docker-compose/hive/scripts/csv_format_test/all_types.csv new file mode 100644 index 00000000000000..9a5e34b270f4c1 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/csv_format_test/all_types.csv @@ -0,0 +1,120 @@ +0,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +1,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +2,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +3,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +4,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +5,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +6,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +7,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +8,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +9,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +10,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +11,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 
+12,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +13,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +14,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +15,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +16,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +17,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +18,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +19,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +20,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +21,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +22,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +23,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +24,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +25,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +26,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +27,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +28,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +29,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +30,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +31,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +32,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +33,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +34,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +35,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +36,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +37,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +38,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 
+39,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +40,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +41,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +42,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +43,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +44,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +45,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +46,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +47,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +48,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +49,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +50,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +51,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +52,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +53,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +54,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +55,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +56,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +57,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +58,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +59,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +60,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +61,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +62,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +63,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +64,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +65,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 
+66,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +67,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +68,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +69,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +70,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +71,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +72,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +73,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +74,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +75,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +76,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +77,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +78,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +79,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +80,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +81,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +82,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +83,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +84,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +85,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +86,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +87,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +88,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +89,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +90,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +91,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +92,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 
+93,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +94,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +95,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +96,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +97,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +98,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +99,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +100,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +101,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +102,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +103,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +104,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +105,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +106,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +107,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +108,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +109,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +110,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +111,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +112,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +113,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +114,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +115,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +116,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +117,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +118,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 +119,2,3,4,5,6.6,7.7,8.8,abc,def,ghiaaaaaa,2020-10-10,2020-10-10 11:12:59 \ No 
newline at end of file diff --git a/docker/thirdparties/docker-compose/hive/scripts/csv_format_test/array_malformat.csv b/docker/thirdparties/docker-compose/hive/scripts/csv_format_test/array_malformat.csv new file mode 100644 index 00000000000000..3fbc5a50f5f57d --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/csv_format_test/array_malformat.csv @@ -0,0 +1,5 @@ +1|[1,2,3,4,5]|[32767,32768,32769]|[65534,65535,65536]|["a","b","c","d","e"]|["hello","world"]|["1991-01-01", "1992-02-02", "1993-03-03"]|["1991-01-01 00:00:00"]|[0.33,0.67]|[3.1415926,0.878787878]|[1,1.2,1.3] +2|[1,2,3,4,5]|[32767,32768,32769]|[65534,65535,65536]|["a","b","c","d","e"]|["hello","world"]|['1991-01-01', '1992-02-02', '1993-03-03']|\N|\N|\N|[1,\N,1.3] +3|\N|\N|\N|\N|\N|\N|\N|\N|\N|\N +4|1,2,3,4,5|\N|\N|\N|\N|\N|\N|\N|\N|\N +5|[1,2,3,4,5|\N|\N|\N|\N|\N|\N|\N|\N|\N diff --git a/docker/thirdparties/docker-compose/hive/scripts/csv_format_test/array_normal.csv b/docker/thirdparties/docker-compose/hive/scripts/csv_format_test/array_normal.csv new file mode 100644 index 00000000000000..b4b3a716a73824 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/csv_format_test/array_normal.csv @@ -0,0 +1,9 @@ +1|[1,2,3,4,5]|[32767,32768,32769]|[65534,65535,65536]|["a","b","c","d","e"]|["hello","world"]|["1991-01-01", "1992-02-02", "1993-03-03"]|["1991-01-01 00:00:00"]|[0.33,0.67]|[3.1415926,0.878787878]|[1,1.2,1.3] +2|[1,2,3,4,5]|[32767,32768,32769]|[65534,65535,65536]|["a","b","c","d","e"]|["hello","world"]|['1991-01-01', '1992-02-02', '1993-03-03']|\N|\N|\N|[1,\N,1.3] +3|\N|\N|\N|\N|\N|\N|\N|\N|\N|\N +4|[]|[]|[]|[]|[]|[]|[]|[]|[]|[] +5|[null]|[null]|[null]|[null]|[null]|[null]|[null]|[null]|[null]|[null] +6|[null,null]|[null,null]|[null,null]|[null,null]|[null,null]|[null,null]|[null,null]|[null,null]|[null,null]|[null,null,null,null,null,null] 
+6|[null,null]|[null,null]|[null,null]|[null,null]|[null,"null"]|[null,null]|[null,null]|[null,null]|[null,null]|[null,null,null,null,null,null] +7|[1,2,3,4,5]|\N|\N|\N|\N|\N|\N|\N|\N|\N +8|[1,2,3,4,5]|\N|\N|\N|\N|\N|[]]|]]|[[]|[[ diff --git a/docker/thirdparties/docker-compose/hive/scripts/csv_format_test/student.csv b/docker/thirdparties/docker-compose/hive/scripts/csv_format_test/student.csv new file mode 100644 index 00000000000000..3a7d6c5d6f1f35 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/csv_format_test/student.csv @@ -0,0 +1,10 @@ +1,alice,18 +2,bob,20 +3,jack,24 +4,jackson,19 +5,liming,18 +6,luffy,20 +7,zoro,22 +8,sanzi,26 +9,wusuopu,21 +10,nami,18 \ No newline at end of file diff --git a/docker/thirdparties/docker-compose/hive/scripts/csv_format_test/student_with_names.csv b/docker/thirdparties/docker-compose/hive/scripts/csv_format_test/student_with_names.csv new file mode 100644 index 00000000000000..62d32e39f42a4e --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/csv_format_test/student_with_names.csv @@ -0,0 +1,11 @@ +id,name,age +1,alice,18 +2,bob,20 +3,jack,24 +4,jackson,19 +5,liming,18 +6,luffy,20 +7,zoro,22 +8,sanzi,26 +9,wusuopu,21 +10,nami,18 \ No newline at end of file diff --git a/docker/thirdparties/docker-compose/hive/scripts/csv_format_test/student_with_names_and_types.csv b/docker/thirdparties/docker-compose/hive/scripts/csv_format_test/student_with_names_and_types.csv new file mode 100644 index 00000000000000..4e88aef6d895a6 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/csv_format_test/student_with_names_and_types.csv @@ -0,0 +1,12 @@ +id,name,age +INT,STRING,INT +1,alice,18 +2,bob,20 +3,jack,24 +4,jackson,19 +5,liming,18 +6,luffy,20 +7,zoro,22 +8,sanzi,26 +9,wusuopu,21 +10,nami,18 \ No newline at end of file diff --git a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh index 
6d97471cc55f0d..884684f2ad578e 100755 --- a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh +++ b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh @@ -27,12 +27,14 @@ echo "hadoop fs -mkdir /user/doris/" hadoop fs -mkdir -p /user/doris/ echo "hadoop fs -put /mnt/scripts/tpch1.db /user/doris/" hadoop fs -put /mnt/scripts/tpch1.db /user/doris/ -echo "hadoop fs -put /mnt/scripts/json_format_test.db /user/doris/" +echo "hadoop fs -put /mnt/scripts/json_format_test /user/doris/" hadoop fs -put /mnt/scripts/json_format_test /user/doris/ echo "hadoop fs -put /mnt/scripts/parquet /user/doris/" hadoop fs -put /mnt/scripts/parquet /user/doris/ echo "hadoop fs -put /mnt/scripts/orc /user/doris/" hadoop fs -put /mnt/scripts/orc /user/doris/ +echo "hadoop fs -put /mnt/scripts/csv_format_test /user/doris/" +hadoop fs -put /mnt/scripts/csv_format_test /user/doris/ echo "hive -f /mnt/scripts/create.hql" hive -f /mnt/scripts/create.hql diff --git a/docker/thirdparties/docker-compose/hive/scripts/json_format_test/one_array_json.json b/docker/thirdparties/docker-compose/hive/scripts/json_format_test/one_array_json.json new file mode 100644 index 00000000000000..042db8c884bacb --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/json_format_test/one_array_json.json @@ -0,0 +1,52 @@ +[ + { + "id": 1, + "city": "beijing", + "code": 1454547 + }, + { + "id": 2, + "city": "shanghai", + "code": 1244264 + }, + { + "id": 3, + "city": "guangzhou", + "code": 528369 + }, + { + "id": 4, + "city": "shenzhen", + "code": 594201 + }, + { + "id": 5, + "city": "hangzhou", + "code": 594201 + }, + { + "id": 6, + "city": "nanjing", + "code": 2345672 + }, + { + "id": 7, + "city": "wuhan", + "code": 2345673 + }, + { + "id": 8, + "city": "chengdu", + "code": 2345674 + }, + { + "id": 9, + "city": "xian", + "code": 2345675 + }, + { + "id": 10, + "city": "hefei", + "code": 2345676 + } +] \ No newline at end of file diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/json_format_test/simple_object_json.json b/docker/thirdparties/docker-compose/hive/scripts/json_format_test/simple_object_json.json index a7912466fd74a6..5c3a9c07e93e11 100644 --- a/docker/thirdparties/docker-compose/hive/scripts/json_format_test/simple_object_json.json +++ b/docker/thirdparties/docker-compose/hive/scripts/json_format_test/simple_object_json.json @@ -8,5 +8,5 @@ {"id": 8, "city": "chengdu", "code": 2345678} {"id": 9, "city": "xian", "code": 2345679} {"id": 10, "city": "hefei", "code": 23456710} -{"id": 10, "city": null, "code": 23456711} -{"id": 10, "city": "hefei", "code": null} +{"id": 11, "city": null, "code": 23456711} +{"id": 12, "city": "hefei", "code": null} diff --git a/fe/fe-core/src/main/java/org/apache/doris/backup/S3Storage.java b/fe/fe-core/src/main/java/org/apache/doris/backup/S3Storage.java index ae89175ebd2ff2..d1e250e7797e77 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/backup/S3Storage.java +++ b/fe/fe-core/src/main/java/org/apache/doris/backup/S3Storage.java @@ -97,14 +97,14 @@ public S3Storage(Map properties) { public void setProperties(Map properties) { super.setProperties(properties); caseInsensitiveProperties.putAll(properties); - // Virtual hosted-sytle is recommended in the s3 protocol. + // Virtual hosted-style is recommended in the s3 protocol. // The path-style has been abandoned, but for some unexplainable reasons, // the s3 client will determine whether the endpiont starts with `s3` // when generating a virtual hosted-sytle request. // If not, it will not be converted ( https://github.com/aws/aws-sdk-java-v2/pull/763), // but the endpoints of many cloud service providers for object storage do not start with s3, // so they cannot be converted to virtual hosted-sytle. 
- // Some of them, such as aliyun's oss, only support virtual hosted-sytle, + // Some of them, such as aliyun's oss, only support virtual hosted-style, // and some of them(ceph) may only support // path-style, so we need to do some additional conversion. // diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/QueryScanProvider.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/QueryScanProvider.java index c38d1b967aec3a..9cf2255f64318a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/QueryScanProvider.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/QueryScanProvider.java @@ -64,12 +64,6 @@ public void createScanRangeLocations(ParamCreateContext context, BackendPolicy b return; } InputSplit inputSplit = inputSplits.get(0); - String fullPath = ((FileSplit) inputSplit).getPath().toUri().toString(); - String filePath = ((FileSplit) inputSplit).getPath().toUri().getPath(); - // eg: - // hdfs://namenode - // s3://buckets - String fsName = fullPath.replace(filePath, ""); TFileType locationType = getLocationType(); context.params.setFileType(locationType); TFileFormatType fileFormatType = getFileFormatType(); @@ -84,6 +78,17 @@ public void createScanRangeLocations(ParamCreateContext context, BackendPolicy b // set hdfs params for hdfs file type. 
Map locationProperties = getLocationProperties(); if (locationType == TFileType.FILE_HDFS) { + String fsName = ""; + if (this instanceof TVFScanProvider) { + fsName = ((TVFScanProvider) this).getFsName(); + } else { + String fullPath = ((FileSplit) inputSplit).getPath().toUri().toString(); + String filePath = ((FileSplit) inputSplit).getPath().toUri().getPath(); + // eg: + // hdfs://namenode + // s3://buckets + fsName = fullPath.replace(filePath, ""); + } THdfsParams tHdfsParams = BrokerUtil.generateHdfsParam(locationProperties); tHdfsParams.setFsName(fsName); context.params.setHdfsParams(tHdfsParams); diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/TVFScanProvider.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/TVFScanProvider.java index 8c8bdf9d3048e4..954d271a94890a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/TVFScanProvider.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/TVFScanProvider.java @@ -58,6 +58,10 @@ public TVFScanProvider(FunctionGenTable tvfTable, TupleDescriptor desc, this.tableValuedFunction = tableValuedFunction; } + public String getFsName() { + return tableValuedFunction.getFsName(); + } + // =========== implement abstract methods of QueryScanProvider ================= @Override public TFileAttributes getFileAttributes() throws UserException { diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java index ac69cad8deb215..17b33642943ae1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java @@ -45,10 +45,12 @@ import org.apache.doris.thrift.TFileScanRangeParams; import org.apache.doris.thrift.TFileTextScanRangeParams; import org.apache.doris.thrift.TFileType; +import 
org.apache.doris.thrift.THdfsParams; import org.apache.doris.thrift.TNetworkAddress; import org.apache.doris.thrift.TPrimitiveType; import org.apache.doris.thrift.TStatusCode; +import com.google.common.collect.ImmutableSet; import com.google.common.collect.Lists; import com.google.protobuf.ByteString; import org.apache.log4j.LogManager; @@ -75,20 +77,37 @@ public abstract class ExternalFileTableValuedFunction extends TableValuedFunctio protected static final String JSON_PATHS = "jsonpaths"; protected static final String STRIP_OUTER_ARRAY = "strip_outer_array"; protected static final String READ_JSON_BY_LINE = "read_json_by_line"; + protected static final String NUM_AS_STRING = "num_as_string"; + protected static final String FUZZY_PARSE = "fuzzy_parse"; + + protected static final ImmutableSet FILE_FORMAT_PROPERTIES = new ImmutableSet.Builder() + .add(FORMAT) + .add(JSON_ROOT) + .add(JSON_PATHS) + .add(STRIP_OUTER_ARRAY) + .add(READ_JSON_BY_LINE) + .add(NUM_AS_STRING) + .add(FUZZY_PARSE) + .add(COLUMN_SEPARATOR) + .add(LINE_DELIMITER) + .build(); + protected List columns = null; protected List fileStatuses = Lists.newArrayList(); protected Map locationProperties; - protected TFileFormatType fileFormatType; - protected String headerType = ""; + private TFileFormatType fileFormatType; + private String headerType = ""; - protected String columnSeparator = DEFAULT_COLUMN_SEPARATOR; - protected String lineDelimiter = DEFAULT_LINE_DELIMITER; - protected String jsonRoot = ""; - protected String jsonPaths = ""; - protected String stripOuterArray = ""; - protected String readJsonByLine = ""; + private String columnSeparator = DEFAULT_COLUMN_SEPARATOR; + private String lineDelimiter = DEFAULT_LINE_DELIMITER; + private String jsonRoot = ""; + private String jsonPaths = ""; + private boolean stripOuterArray; + private boolean readJsonByLine; + private boolean numAsString; + private boolean fuzzyParse; public abstract TFileType getTFileType(); @@ -105,6 +124,16 @@ public Map 
getLocationProperties() { return locationProperties; } + public String getFsName() { + TFileType fileType = getTFileType(); + if (fileType == TFileType.FILE_HDFS) { + return locationProperties.get(HdfsTableValuedFunction.HADOOP_FS_NAME); + } else if (fileType == TFileType.FILE_S3) { + return locationProperties.get(S3TableValuedFunction.S3_ENDPOINT); + } + return ""; + } + protected void parseFile() throws UserException { String path = getFilePath(); BrokerDesc brokerDesc = getBrokerDesc(); @@ -142,8 +171,10 @@ protected void parseProperties(Map validParams) throws UserExcep lineDelimiter = validParams.getOrDefault(LINE_DELIMITER, DEFAULT_LINE_DELIMITER); jsonRoot = validParams.getOrDefault(JSON_ROOT, ""); jsonPaths = validParams.getOrDefault(JSON_PATHS, ""); - stripOuterArray = validParams.getOrDefault(STRIP_OUTER_ARRAY, "false").toLowerCase(); - readJsonByLine = validParams.getOrDefault(READ_JSON_BY_LINE, "true").toLowerCase(); + readJsonByLine = Boolean.valueOf(validParams.get(READ_JSON_BY_LINE)).booleanValue(); + stripOuterArray = Boolean.valueOf(validParams.get(STRIP_OUTER_ARRAY)).booleanValue(); + numAsString = Boolean.valueOf(validParams.get(NUM_AS_STRING)).booleanValue(); + fuzzyParse = Boolean.valueOf(validParams.get(FUZZY_PARSE)).booleanValue(); } public List getFileStatuses() { @@ -161,17 +192,10 @@ public TFileAttributes getFileAttributes() { } else if (this.fileFormatType == TFileFormatType.FORMAT_JSON) { fileAttributes.setJsonRoot(jsonRoot); fileAttributes.setJsonpaths(jsonPaths); - if (readJsonByLine.equalsIgnoreCase("true")) { - fileAttributes.setReadJsonByLine(true); - } else { - fileAttributes.setReadJsonByLine(false); - } - if (stripOuterArray.equalsIgnoreCase("true")) { - fileAttributes.setStripOuterArray(true); - } else { - fileAttributes.setStripOuterArray(false); - } - // TODO(ftw): num_as_string/fuzzy_parser? 
+ fileAttributes.setReadJsonByLine(readJsonByLine); + fileAttributes.setStripOuterArray(stripOuterArray); + fileAttributes.setNumAsString(numAsString); + fileAttributes.setFuzzyParse(fuzzyParse); } return fileAttributes; } @@ -254,6 +278,12 @@ private PFetchTableSchemaRequest getFetchTableStructureRequest() throws Analysis fileScanRangeParams.setFormatType(fileFormatType); fileScanRangeParams.setProperties(locationProperties); fileScanRangeParams.setFileAttributes(getFileAttributes()); + if (getTFileType() == TFileType.FILE_HDFS) { + THdfsParams tHdfsParams = BrokerUtil.generateHdfsParam(locationProperties); + String fsName = getLocationProperties().get(HdfsTableValuedFunction.HADOOP_FS_NAME); + tHdfsParams.setFsName(fsName); + fileScanRangeParams.setHdfsParams(tHdfsParams); + } // get first file, used to parse table schema TBrokerFileStatus firstFile = null; diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HdfsTableValuedFunction.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HdfsTableValuedFunction.java new file mode 100644 index 00000000000000..175c9e501aa363 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HdfsTableValuedFunction.java @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.tablefunction; + +import org.apache.doris.analysis.BrokerDesc; +import org.apache.doris.analysis.ExportStmt; +import org.apache.doris.analysis.StorageBackend.StorageType; +import org.apache.doris.common.AnalysisException; +import org.apache.doris.common.UserException; +import org.apache.doris.common.util.URI; +import org.apache.doris.thrift.TFileType; + +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Maps; +import org.apache.commons.collections.map.CaseInsensitiveMap; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Map; + +/** + * The Implement of table valued function + * hdfs("uri" = "xxx", "hadoop.username" = "xx", "FORMAT" = "csv"). + */ +public class HdfsTableValuedFunction extends ExternalFileTableValuedFunction { + public static final Logger LOG = LogManager.getLogger(HdfsTableValuedFunction.class); + + public static final String NAME = "hdfs"; + public static final String HDFS_URI = "uri"; + public static String HADOOP_FS_NAME = "fs.defaultFS"; + // simple or kerberos + public static String HADOOP_SECURITY_AUTHENTICATION = "hadoop.security.authentication"; + public static String HADOOP_USER_NAME = "hadoop.username"; + public static String HADOOP_KERBEROS_PRINCIPAL = "hadoop.kerberos.principal"; + public static String HADOOP_KERBEROS_KEYTAB = "hadoop.kerberos.keytab"; + public static String HADOOP_SHORT_CIRCUIT = "dfs.client.read.shortcircuit"; + public static String HADOOP_SOCKET_PATH = "dfs.domain.socket.path"; + + private static final ImmutableSet LOCATION_PROPERTIES = new ImmutableSet.Builder() + .add(HDFS_URI) + .add(HADOOP_SECURITY_AUTHENTICATION) + .add(HADOOP_FS_NAME) + .add(HADOOP_USER_NAME) + .add(HADOOP_KERBEROS_PRINCIPAL) + .add(HADOOP_KERBEROS_KEYTAB) + .add(HADOOP_SHORT_CIRCUIT) + .add(HADOOP_SOCKET_PATH) + .build(); + + private URI 
hdfsUri; + private String filePath; + + public HdfsTableValuedFunction(Map params) throws UserException { + Map fileFormatParams = new CaseInsensitiveMap(); + locationProperties = Maps.newHashMap(); + for (String key : params.keySet()) { + if (FILE_FORMAT_PROPERTIES.contains(key.toLowerCase())) { + fileFormatParams.put(key, params.get(key)); + } else if (LOCATION_PROPERTIES.contains(key.toLowerCase()) || HADOOP_FS_NAME.equalsIgnoreCase(key)) { + // because HADOOP_FS_NAME contains upper and lower case + if (HADOOP_FS_NAME.equalsIgnoreCase(key)) { + locationProperties.put(HADOOP_FS_NAME, params.get(key)); + } else { + locationProperties.put(key.toLowerCase(), params.get(key)); + } + } else { + throw new AnalysisException(key + " is invalid property"); + } + } + + ExportStmt.checkPath(locationProperties.get(HDFS_URI), StorageType.HDFS); + hdfsUri = URI.create(locationProperties.get(HDFS_URI)); + filePath = locationProperties.get(HADOOP_FS_NAME) + hdfsUri.getPath(); + + parseProperties(fileFormatParams); + parseFile(); + } + + // =========== implement abstract methods of ExternalFileTableValuedFunction ================= + @Override + public TFileType getTFileType() { + return TFileType.FILE_HDFS; + } + + @Override + public String getFilePath() { + // must be "hdfs://namenode/filepath" + return filePath; + } + + @Override + public BrokerDesc getBrokerDesc() { + return new BrokerDesc("S3TvfBroker", StorageType.HDFS, locationProperties); + } + + // =========== implement abstract methods of TableValuedFunctionIf ================= + @Override + public String getTableName() { + return "HDFSTableValuedFunction"; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/S3TableValuedFunction.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/S3TableValuedFunction.java index 784a75accccd28..6af05f33743b64 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/S3TableValuedFunction.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/S3TableValuedFunction.java @@ -26,6 +26,7 @@ import com.google.common.collect.ImmutableSet; import com.google.common.collect.Maps; +import org.apache.commons.collections.map.CaseInsensitiveMap; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -46,44 +47,71 @@ public class S3TableValuedFunction extends ExternalFileTableValuedFunction { private static final String AK = "access_key"; private static final String SK = "secret_key"; - public static final String USE_PATH_STYLE = "use_path_style"; + private static final String USE_PATH_STYLE = "use_path_style"; + private static final String REGION = "region"; private static final ImmutableSet PROPERTIES_SET = new ImmutableSet.Builder() .add(S3_URI) .add(AK) .add(SK) - .add(FORMAT) - .add(JSON_ROOT) - .add(JSON_PATHS) - .add(STRIP_OUTER_ARRAY) - .add(READ_JSON_BY_LINE) + .add(USE_PATH_STYLE) + .add(REGION) .build(); private S3URI s3uri; private String s3AK; private String s3SK; + private String endPoint; + private String virtualBucket; + private boolean forceVirtualHosted; public S3TableValuedFunction(Map params) throws UserException { - Map validParams = Maps.newHashMap(); + Map validParams = new CaseInsensitiveMap(); for (String key : params.keySet()) { - if (!PROPERTIES_SET.contains(key.toLowerCase())) { + if (!PROPERTIES_SET.contains(key.toLowerCase()) && !FILE_FORMAT_PROPERTIES.contains(key.toLowerCase())) { throw new AnalysisException(key + " is invalid property"); } - validParams.put(key.toLowerCase(), params.get(key)); + validParams.put(key, params.get(key)); } - s3uri = S3URI.create(validParams.get(S3_URI)); + String originUri = validParams.getOrDefault(S3_URI, ""); + if (originUri.toLowerCase().startsWith("s3")) { + // s3 protocol + forceVirtualHosted = false; + } else { + // not s3 protocol, forceVirtualHosted is determined by USE_PATH_STYLE. 
+ forceVirtualHosted = !Boolean.valueOf(validParams.get(USE_PATH_STYLE)).booleanValue(); + } + + s3uri = S3URI.create(validParams.get(S3_URI), forceVirtualHosted); + if (forceVirtualHosted) { + // s3uri.getVirtualBucket() is: virtualBucket.endpoint, Eg: + // uri: http://my_bucket.cos.ap-beijing.myqcloud.com/file.txt + // s3uri.getVirtualBucket() = my_bucket.cos.ap-beijing.myqcloud.com, + // so we need separate virtualBucket and endpoint. + String[] fileds = s3uri.getVirtualBucket().split("\\.", 2); + virtualBucket = fileds[0]; + if (fileds.length > 1) { + endPoint = fileds[1]; + } else { + throw new AnalysisException("can not parse endpoint, please check uri."); + } + } else { + endPoint = s3uri.getBucketScheme(); + } s3AK = validParams.getOrDefault(AK, ""); s3SK = validParams.getOrDefault(SK, ""); + String usePathStyle = validParams.getOrDefault(USE_PATH_STYLE, "false"); parseProperties(validParams); // set S3 location properties + // these five properties is necessary, no one can be lost. locationProperties = Maps.newHashMap(); - locationProperties.put(S3_ENDPOINT, s3uri.getBucketScheme()); + locationProperties.put(S3_ENDPOINT, endPoint); locationProperties.put(S3_AK, s3AK); locationProperties.put(S3_SK, s3SK); - locationProperties.put(S3_REGION, ""); - locationProperties.put(USE_PATH_STYLE, "true"); + locationProperties.put(S3_REGION, validParams.getOrDefault(REGION, "")); + locationProperties.put(USE_PATH_STYLE, usePathStyle); parseFile(); } @@ -97,6 +125,10 @@ public TFileType getTFileType() { @Override public String getFilePath() { // must be "s3://..." 
+ if (forceVirtualHosted) { + return NAME + S3URI.SCHEME_DELIM + virtualBucket + S3URI.PATH_DELIM + + s3uri.getBucket() + S3URI.PATH_DELIM + s3uri.getKey(); + } return NAME + S3URI.SCHEME_DELIM + s3uri.getKey(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/TableValuedFunctionIf.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/TableValuedFunctionIf.java index 3063880c923130..56167d1b874143 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/TableValuedFunctionIf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/TableValuedFunctionIf.java @@ -48,6 +48,8 @@ public static TableValuedFunctionIf getTableFunction(String funcName, Map