
Commit

Add support for Hive 1.x.x (#115)
jphalip authored Apr 9, 2024
1 parent 64af1b0 commit 7971a8a
Showing 73 changed files with 2,200 additions and 539 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/spotless.yaml
@@ -21,4 +21,4 @@ jobs:
cache: 'maven'

- name: Checkout coding style
run: ./mvnw spotless:check -Pdataproc21
run: ./mvnw spotless:check -Phive1-generic && ./mvnw spotless:check -Phive2-generic && ./mvnw spotless:check -Pdataproc21
17 changes: 10 additions & 7 deletions README.md
@@ -1,3 +1,4 @@

# Hive-BigQuery Connector

The Hive-BigQuery Connector is a Hive storage handler that enables Hive to interact with BigQuery's
@@ -15,10 +16,10 @@ This connector supports [Dataproc](https://cloud.google.com/dataproc) 2.0 and 2.
For Hadoop clusters other than Dataproc, the connector has been tested with the following
software versions:

* Hive 2.3.6, 2.3.9, 3.1.2, and 3.1.3.
* Hadoop 2.10.2, 3.2.3, and 3.3.3.
* Hive 1.2.1, 2.3.6, 2.3.9, 3.1.2, and 3.1.3.
* Hadoop 2.6.4, 2.7.0, 2.10.2, 3.2.3, and 3.3.3.
* Tez 0.9.2 on Hadoop 2, and Tez 0.10.1 on Hadoop 3.
* Pig 0.17.0.
* Pig 0.16.0, 0.17.0.

## Installation

@@ -45,6 +46,12 @@ Alternatively, you can build a JAR from source:

2. Compile and package the jar:

* For Hive 1:

``` sh
./mvnw package -DskipTests -P hive1-generic
```

* For Hive 2:

``` sh
@@ -715,10 +722,6 @@ There are multiple options to override the default behavior and to provide custo
sections on [partitioning](#partitioning) and [clustering](#clustering).
* CTAS (aka `CREATE TABLE AS SELECT`) and CTLT (`CREATE TABLE LIKE TABLE`) statements are currently
not supported.
* If a write job fails when using the Tez execution engine and the `indirect` write method, the
temporary avro files might not be automatically cleaned up from the GCS bucket. The MR execution
engine does not have this limitation. The temporary files are always cleaned up when the job is
successful, regardless of the execution engine in use.
* If you use the Hive `MAP` type, then the map's key must be of `STRING` type if you use the Avro
format for reading or the indirect method for writing. This is because Avro requires keys to be
strings. If you use the Arrow format for reading (default) and the direct method for writing (also
26 changes: 22 additions & 4 deletions cloudbuild/cloudbuild.yaml
@@ -22,7 +22,16 @@ steps:
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 3. Run unit tests for Hive 2
# 3. Run unit tests for Hive 1
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'unit-tests-hive1'
waitFor: ['build']
entrypoint: 'bash'
args: ['/workspace/cloudbuild/presubmit.sh', 'unittest_hive1']
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 4. Run unit tests for Hive 2
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'unit-tests-hive2'
waitFor: ['build']
@@ -31,7 +40,7 @@ steps:
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 4. Run unit tests for Hive 3
# 5. Run unit tests for Hive 3
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'unit-tests-hive3'
waitFor: ['build']
@@ -40,7 +49,16 @@ steps:
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 5. Run integration tests for Hive 2
# 6. Run integration tests for Hive 1
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'integration-tests-hive1'
waitFor: ['unit-tests-hive1']
entrypoint: 'bash'
args: ['/workspace/cloudbuild/presubmit.sh', 'integrationtest_hive1']
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 7. Run integration tests for Hive 2
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'integration-tests-hive2'
waitFor: ['unit-tests-hive2']
@@ -49,7 +67,7 @@ steps:
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 6. Run integration tests for Hive 3
# 8. Run integration tests for Hive 3
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'integration-tests-hive3'
waitFor: ['unit-tests-hive3']
35 changes: 28 additions & 7 deletions cloudbuild/presubmit.sh
@@ -24,8 +24,10 @@ fi

readonly ACTION=$1

readonly HIVE1_PROFILE="hive1-generic"
readonly HIVE2_PROFILE="hive2-generic"
readonly HIVE3_PROFILE="hive3-generic"
readonly HIVE2_SHADED_DEPS="shaded-deps-hive2.3.9-hadoop2.10.2"
readonly HIVE3_SHADED_DEPS="shaded-deps-hive3.1.2-hadoop2.10.2"
readonly MVN="./mvnw -B -e -Dmaven.repo.local=/workspace/.repository"

@@ -38,36 +40,55 @@ cd /workspace
case "$ACTION" in
# Java code style check
check)
$MVN spotless:check -P"${HIVE2_PROFILE}" && $MVN spotless:check -P"${HIVE3_PROFILE}"
$MVN spotless:check -P"${HIVE1_PROFILE}" && $MVN spotless:check -P"${HIVE2_PROFILE}" && $MVN spotless:check -P"${HIVE3_PROFILE}"
exit
;;

# Build the Maven packages and dependencies
build)
# Install all modules for Hive 2
$MVN install -DskipTests -P"${HIVE2_PROFILE}"
# Install all modules for Hive 1
$MVN install -DskipTests -P"${HIVE1_PROFILE}"
# Install the shaded dependencies for Hive 2 (all the other shaded & parent modules have already been installed with the previous command)
$MVN install -DskipTests -P"${HIVE2_PROFILE}" -pl ${HIVE2_SHADED_DEPS}
# Install the shaded dependencies for Hive 3 (all the other shaded & parent modules have already been installed with the previous command)
$MVN install -DskipTests -P"${HIVE3_PROFILE}" -pl ${HIVE3_SHADED_DEPS}
exit
;;

# Run unit tests for Hive 2
# Run unit tests for Hive 1.x.x
unittest_hive1)
$MVN surefire:test jacoco:report jacoco:report-aggregate -P"${HIVE1_PROFILE}",coverage
# Upload test coverage report to Codecov
bash <(curl -s https://codecov.io/bash) -K -F "${ACTION}"
exit
;;

# Run unit tests for Hive 2.x.x
unittest_hive2)
$MVN surefire:test jacoco:report jacoco:report-aggregate -P"${HIVE2_PROFILE}",coverage
# Upload test coverage report to Codecov
bash <(curl -s https://codecov.io/bash) -K -F "${ACTION}"
exit
;;

# Run unit tests for Hive 3
# Run unit tests for Hive 3.x.x
unittest_hive3)
$MVN surefire:test jacoco:report jacoco:report-aggregate -P"${HIVE3_PROFILE}",coverage
# Upload test coverage report to Codecov
bash <(curl -s https://codecov.io/bash) -K -F "${ACTION}"
exit
;;

# Run integration tests for Hive 2
# Run integration tests for Hive 1.x.x
integrationtest_hive1)
$MVN failsafe:integration-test failsafe:verify jacoco:report jacoco:report-aggregate \
-P"${HIVE1_PROFILE}",coverage,integration
# Upload test coverage report to Codecov
bash <(curl -s https://codecov.io/bash) -K -F "${ACTION}"
exit
;;

# Run integration tests for Hive 2.x.x
integrationtest_hive2)
$MVN failsafe:integration-test failsafe:verify jacoco:report jacoco:report-aggregate \
-P"${HIVE2_PROFILE}",coverage,integration
@@ -76,7 +97,7 @@ case "$ACTION" in
exit
;;

# Run integration tests for Hive 3
# Run integration tests for Hive 3.x.x
integrationtest_hive3)
$MVN failsafe:integration-test failsafe:verify jacoco:report jacoco:report-aggregate \
-P"${HIVE3_PROFILE}",coverage,integration
128 changes: 128 additions & 0 deletions hive-1-bigquery-connector/pom.xml
@@ -0,0 +1,128 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>com.google.cloud.hive</groupId>
<artifactId>hive-x-bigquery-connector</artifactId>
<version>${revision}</version>
<relativePath>../hive-x-bigquery-connector</relativePath>
</parent>

<artifactId>hive-1-bigquery-connector</artifactId>
<name>Hive-BigQuery Connector For Hive 1.x.x</name>

<dependencies>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>hive-bigquery-connector-common</artifactId>
<version>${project.version}</version>
</dependency>

<dependency>
<groupId>com.google.cloud.bigdataoss</groupId>
<artifactId>gcs-connector</artifactId>
<classifier>shaded</classifier>
</dependency>

<!-- **************** Test dependencies **************** -->

<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>hive-bigquery-connector-common</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>

<dependency>
<groupId>io.github.hiverunner</groupId>
<artifactId>hiverunner</artifactId>
<scope>test</scope>
</dependency>

</dependencies>

<profiles>
<profile>
<!-- Currently the same as "hive1.2.1-hadoop2.6.4" but could be changed later -->
<!-- Use this profile if you don't care about specific minor versions of Hive 1.X -->
<id>hive1-generic</id>
<properties>
<!--
As of the time of writing (August 2023), the latest GCS connector isn't
compatible with Hadoop <= 2.7, so we use an older version
-->
<gcs-connector.version>hadoop2-2.2.3</gcs-connector.version>
</properties>
<dependencies>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>shaded-deps-hive1.2.1-hadoop2.6.4</artifactId>
<version>${project.version}</version>
<classifier>shaded</classifier>
<scope>provided</scope>
</dependency>
</dependencies>
</profile>
<profile>
<id>hive1.2.1-hadoop2.6.4</id>
<properties>
<!--
As of the time of writing (August 2023), the latest GCS connector isn't
compatible with Hadoop <= 2.7, so we use an older version
-->
<gcs-connector.version>hadoop2-2.2.3</gcs-connector.version>
</properties>
<dependencies>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>shaded-deps-hive1.2.1-hadoop2.6.4</artifactId>
<version>${project.version}</version>
<classifier>shaded</classifier>
<scope>provided</scope>
</dependency>
</dependencies>
</profile>
</profiles>

<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
</plugin>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
</plugin>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
</plugin>

<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
</plugin>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
</plugin>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>flatten-maven-plugin</artifactId>
</plugin>
</plugins>
</build>

</project>
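For reference, here is how the README's build instructions map onto the profiles defined above. These commands are illustrative: `hive1-generic` is the profile documented in the README, and `hive1.2.1-hadoop2.6.4` is the pinned profile it currently aliases.

```sh
# Generic Hive 1 build, as documented in the README:
./mvnw package -DskipTests -P hive1-generic

# Equivalent build pinned to the exact Hive/Hadoop versions named in the profile:
./mvnw package -DskipTests -P hive1.2.1-hadoop2.6.4
```

Both profiles currently resolve to the same `shaded-deps-hive1.2.1-hadoop2.6.4` dependency, so the distinction only matters if support for other Hive 1.x minor versions is added later.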
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Copyright 2023 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.hive.bigquery.connector;

import com.google.cloud.hive.bigquery.connector.output.PostInsertHook;
import com.google.cloud.hive.bigquery.connector.output.PreInsertHook;
import java.util.Map;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.metastore.HiveMetaHook;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde2.AbstractSerDe;

public class BigQueryStorageHandler extends BigQueryStorageHandlerBase {

@Override
public HiveMetaHook getMetaHook() {
return new Hive1BigQueryMetaHook(conf);
}

@Override
public void configureOutputJobProperties(TableDesc tableDesc, Map<String, String> jobProperties) {
super.configureOutputJobProperties(tableDesc, jobProperties);

// In Hive 1, the metahook doesn't have a `preInsertTable()` method, so we use a
// pre-execution hook instead
addExecHook(ConfVars.PREEXECHOOKS.varname, PreInsertHook.class);

String engine = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).toLowerCase();
if (engine.equals("tez")) {
// Tez does not use the OutputCommitter (regardless of the Hive versions).
// So with Hive 2 and 3, we override and use the metahook's `commitInsertTable()` method.
// However, with Hive 1, that method isn't available. So we set up a post execution hook to
// commit the writes.
addExecHook(ConfVars.POSTEXECHOOKS.varname, PostInsertHook.class);
}
}

@Override
public Class<? extends AbstractSerDe> getSerDeClass() {
return BigQuerySerDe.class;
}
}
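The `addExecHook()` helper called above is defined in the connector's shared base classes, which are not part of this diff. As a rough sketch only (not the connector's actual implementation), such a helper presumably appends the hook class name to the comma-separated list that Hive reads from `hive.exec.pre.hooks` / `hive.exec.post.hooks`:

```java
// Illustrative sketch -- the real addExecHook() lives in the connector's base/common
// module and may differ. The idea: append the hook class to the comma-separated hook
// list stored under the given HiveConf variable, without adding duplicates.
import org.apache.hadoop.conf.Configuration;

final class ExecHookRegistration {

  static void addExecHook(Configuration conf, String hooksVarName, Class<?> hookClass) {
    String hookName = hookClass.getName();
    String existing = conf.get(hooksVarName, "").trim();
    if (existing.isEmpty()) {
      conf.set(hooksVarName, hookName);
    } else if (!existing.contains(hookName)) {
      // Hive splits this variable on commas and instantiates each listed hook class.
      conf.set(hooksVarName, existing + "," + hookName);
    }
  }
}
```

Hive runs every class listed in those variables before or after query execution, which is how `PreInsertHook` and `PostInsertHook` get invoked on Hive 1, where the corresponding metahook callbacks are unavailable.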
