diff --git a/.codecov_bash b/.codecov_bash new file mode 100644 index 0000000..523fb30 --- /dev/null +++ b/.codecov_bash @@ -0,0 +1,1285 @@ +#!/usr/bin/env bash + +set -e +o pipefail + +VERSION="1b98dd4" + +url="https://codecov.io" +url_o="" +env="$CODECOV_ENV" +pr_o="" +pr="" +job="" +build_url="" +service="" +build_o="" +token="" +commit_o="" +search_in="" +search_in_o="" +tag_o="" +tag="" +flags="" +exit_with=0 +branch_o="" +slug_o="" +dump="0" +clean="0" +branch="" +commit="" +include_cov="" +exclude_cov="" +ddp="$(echo ~)/Library/Developer/Xcode/DerivedData" +xp="" +files="" +cacert="$CODECOV_CA_BUNDLE" +gcov_ignore="" +gcov_include="" +ft_gcov="1" +ft_coveragepy="1" +ft_fix="1" +ft_search="1" +ft_s3="1" +ft_xcode="1" +ft_network="1" +_git_root=$(git rev-parse --show-toplevel 2>/dev/null || hg root 2>/dev/null || echo $PWD) +git_root="$_git_root" +remote_addr="" +if [ "$git_root" = "$PWD" ]; +then + git_root="." +fi + +proj_root="$git_root" +gcov_exe="gcov" +gcov_arg="" + +b="\033[0;36m" +g="\033[0;32m" +r="\033[0;31m" +e="\033[0;90m" +x="\033[0m" + +show_help() { +cat << EOF + + Codecov Bash $VERSION + + Global report uploading tool for Codecov + Contribute at https://github.com/codecov/codecov-bash + + + -h Display this help and exit + -f FILE Target file(s) to upload + + -f "path/to/file" only upload this file + skips searching unless provided patterns below + -f "!*.bar" ignore all files at pattern *.bar + -f "*.foo" include all files at pattern *.foo + + -s DIR Directory to search for coverage reports. + Already searches project root and artifact folders. + -t TOKEN Set the private repository token + (option) set environment variable CODECOV_TOKEN=:uuid + + -t @/path/to/token_file + -t uuid + + -e ENV Specify environment variables to be included with this build + Also accepting environment variables: CODECOV_ENV=VAR,VAR2 + + -e VAR,VAR2 + + -X feature Toggle functionalities + + -X gcov Disable gcov + -X coveragepy Disable python coverage + -X fix Disable report fixing + -X search Disable searching for reports + -X xcode Disable xcode processing + -X network Disable uploading the file network + + -R root dir Used when not in git/hg project to identify project root directory + -F flag Flag the upload to group coverage metrics + + -F unittests This upload is only unittests + -F integration This upload is only integration tests + -F ui,chrome This uplaod is Chrome - UI tests + + -c Move discovered coverage reports to the trash + -Z Exit with 1 if not successful. Default will Exit with 0 + + -- xcode -- + -D Custom Derived Data Path for Coverage.profdata and gcov processing + Default '~/Library/Developer/Xcode/DerivedData' + -J Specify packages to build coverage. + This can significantly reduces time to build coverage reports. + + -J 'MyAppName' Will match "MyAppName" and "MyAppNameTests" + -J '^ExampleApp$' Will match only "ExampleApp" not "ExampleAppTests" + + -- gcov -- + -g GLOB Paths to ignore during gcov gathering + -G GLOB Paths to include during gcov gathering + -p dir Project root directory + Also used when preparing gcov + -x gcovexe gcov executable to run. 
Defaults to 'gcov' + -a gcovargs extra arguments to pass to gcov + + -- Override CI Environment Variables -- + These variables are automatically detected by popular CI providers + + -B branch Specify the branch name + -C sha Specify the commit sha + -P pr Specify the pull request number + -b build Specify the build number + -T tag Specify the git tag + + -- Enterprise -- + -u URL Set the target url for Enterprise customers + Not required when retrieving the bash uploader from your CCE + (option) Set environment variable CODECOV_URL=https://my-hosted-codecov.com + -r SLUG owner/repo slug used instead of the private repo token in Enterprise + (option) set environment variable CODECOV_SLUG=:owner/:repo + (option) set in your codecov.yml "codecov.slug" + -S PATH File path to your cacert.pem file used to verify ssl with Codecov Enterprise (optional) + (option) Set environment variable: CODECOV_CA_BUNDLE="/path/to/ca.pem" + + -- Debugging -- + -d Dont upload and dump to stdin + -K Remove color from the output + -v Verbose mode + +EOF +} + + +say() { + echo -e "$1" +} + + +urlencode() { + echo "$1" | curl -Gso /dev/null -w %{url_effective} --data-urlencode @- "" | cut -c 3- | sed -e 's/%0A//' +} + + +swiftcov() { + _dir=$(dirname "$1") + for _type in app framework xctest + do + find "$_dir" -name "*.$_type" | while read f + do + _proj=${f##*/} + _proj=${_proj%."$_type"} + if [ "$2" = "" ] || [ "$(echo "$_proj" | grep -i "$2")" != "" ]; + then + say " $g+$x Building reports for $_proj $_type" + dest=$([ -f "$f/$_proj" ] && echo "$f/$_proj" || echo "$f/Contents/MacOS/$_proj") + _proj_name=$(echo "$_proj" | sed -e 's/[[:space:]]//g') + xcrun llvm-cov show -instr-profile "$1" "$dest" > "$_proj_name.$_type.coverage.txt" \ + || say " ${r}x>${x} llvm-cov failed to produce results for $dest" + fi + done + done +} + + +# Credits to: https://gist.github.com/pkuczynski/8665367 +parse_yaml() { + local prefix=$2 + local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034') + sed -ne "s|^\($s\)\($w\)$s:$s\"\(.*\)\"$s\$|\1$fs\2$fs\3|p" \ + -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 | + awk -F$fs '{ + indent = length($1)/2; + vname[indent] = $2; + for (i in vname) {if (i > indent) {delete vname[i]}} + if (length($3) > 0) { + vn=""; if (indent > 0) {vn=(vn)(vname[0])("_")} + printf("%s%s%s=\"%s\"\n", "'$prefix'",vn, $2, $3); + } + }' +} + + +if [ $# != 0 ]; +then + while getopts "a:b:B:cC:dD:e:f:F:g:G:hJ:Kp:P:r:R:s:S:t:T:u:vx:X:Z" o + do + case "$o" in + "a") + gcov_arg=$OPTARG + ;; + "b") + build_o="$OPTARG" + ;; + "B") + branch_o="$OPTARG" + ;; + "c") + clean="1" + ;; + "C") + commit_o="$OPTARG" + ;; + "d") + dump="1" + ;; + "D") + ddp="$OPTARG" + ;; + "e") + env="$env,$OPTARG" + ;; + "f") + if [ "${OPTARG::1}" = "!" 
]; + then + exclude_cov="$exclude_cov -not -path '${OPTARG:1}'" + + elif [[ "$OPTARG" = *"*"* ]]; + then + include_cov="$include_cov -or -name '$OPTARG'" + + else + ft_search=0 + if [ "$files" = "" ]; + then + files="$OPTARG" + else + files="$files +$OPTARG" + fi + fi + ;; + "F") + if [ "$flags" = "" ]; + then + flags="$OPTARG" + else + flags="$flags,$OPTARG" + fi + ;; + "g") + gcov_ignore="$gcov_ignore -not -path '$OPTARG'" + ;; + "G") + gcov_include="$gcov_include -path '$OPTARG'" + ;; + "h") + show_help + exit 0; + ;; + "J") + if [ "$xp" = "" ]; + then + xp="$OPTARG" + else + xp="$xp\|$OPTARG" + fi + ;; + "K") + b="" + g="" + r="" + e="" + x="" + ;; + "p") + proj_root="$OPTARG" + ;; + "P") + pr_o="$OPTARG" + ;; + "r") + slug_o="$OPTARG" + ;; + "R") + git_root="$OPTARG" + ;; + "s") + if [ "$search_in_o" = "" ]; + then + search_in_o="$OPTARG" + else + search_in_o="$search_in_o $OPTARG" + fi + ;; + "S") + cacert="--cacert \"$OPTARG\"" + ;; + "t") + if [ "${OPTARG::1}" = "@" ]; + then + token=$(cat "${OPTARG:1}" | tr -d ' \n') + else + token="$OPTARG" + fi + ;; + "T") + tag_o="$OPTARG" + ;; + "u") + url_o=$(echo "$OPTARG" | sed -e 's/\/$//') + ;; + "v") + set -x + ;; + "x") + gcov_exe=$OPTARG + ;; + "X") + if [ "$OPTARG" = "gcov" ]; + then + ft_gcov="0" + elif [ "$OPTARG" = "coveragepy" ] || [ "$OPTARG" = "py" ]; + then + ft_coveragepy="0" + elif [ "$OPTARG" = "fix" ] || [ "$OPTARG" = "fixes" ]; + then + ft_fix="0" + elif [ "$OPTARG" = "xcode" ]; + then + ft_xcode="0" + elif [ "$OPTARG" = "search" ]; + then + ft_search="0" + elif [ "$OPTARG" = "network" ]; + then + ft_network="0" + fi + ;; + "Z") + exit_with=1 + ;; + esac + done +fi + +say " + _____ _ + / ____| | | +| | ___ __| | ___ ___ _____ __ +| | / _ \\ / _\` |/ _ \\/ __/ _ \\ \\ / / +| |___| (_) | (_| | __/ (_| (_) \\ V / + \\_____\\___/ \\__,_|\\___|\\___\\___/ \\_/ + Bash-$VERSION + +" + +search_in="$proj_root" + +if [ "$JENKINS_URL" != "" ]; +then + say "$e==>$x Jenkins CI detected." + # https://wiki.jenkins-ci.org/display/JENKINS/Building+a+software+project + # https://wiki.jenkins-ci.org/display/JENKINS/GitHub+pull+request+builder+plugin#GitHubpullrequestbuilderplugin-EnvironmentVariables + service="jenkins" + branch=$([ ! -z "$ghprbSourceBranch" ] && echo "$ghprbSourceBranch" || echo "$GIT_BRANCH") + commit=$([ ! -z "$ghprbActualCommit" ] && echo "$ghprbActualCommit" || echo "$GIT_COMMIT") + build="$BUILD_NUMBER" + pr="$ghprbPullId" + build_url=$(urlencode "$BUILD_URL") + +elif [ "$CI" = "true" ] && [ "$TRAVIS" = "true" ] && [ "$SHIPPABLE" != "true" ]; +then + say "$e==>$x Travis CI detected." + # http://docs.travis-ci.com/user/ci-environment/#Environment-variables + service="travis" + branch="$TRAVIS_BRANCH" + commit="$TRAVIS_COMMIT" + build="$TRAVIS_JOB_NUMBER" + pr="$TRAVIS_PULL_REQUEST" + job="$TRAVIS_JOB_ID" + slug="$TRAVIS_REPO_SLUG" + tag="$TRAVIS_TAG" + env="$env,TRAVIS_OS_NAME" + + language=$(printenv | grep "TRAVIS_.*_VERSION" | head -1) + if [ "$language" != "" ]; + then + env="$env,${language%=*}" + fi + +elif [ "$CI" = "true" ] && [ "$CI_NAME" = "codeship" ]; +then + say "$e==>$x Codeship CI detected." + # https://www.codeship.io/documentation/continuous-integration/set-environment-variables/ + service="codeship" + branch="$CI_BRANCH" + build="$CI_BUILD_NUMBER" + build_url=$(urlencode "$CI_BUILD_URL") + commit="$CI_COMMIT_ID" + +elif [ "$TEAMCITY_VERSION" != "" ]; +then + say "$e==>$x TeamCity CI detected." 
+ # https://confluence.jetbrains.com/display/TCD8/Predefined+Build+Parameters + # https://confluence.jetbrains.com/plugins/servlet/mobile#content/view/74847298 + if [ "$TEAMCITY_BUILD_BRANCH" = '' ]; + then + echo " Teamcity does not automatically make build parameters available as environment variables." + echo " Add the following environment parameters to the build configuration" + echo " env.TEAMCITY_BUILD_BRANCH = %teamcity.build.branch%" + echo " env.TEAMCITY_BUILD_ID = %teamcity.build.id%" + echo " env.TEAMCITY_BUILD_URL = %teamcity.serverUrl%/viewLog.html?buildId=%teamcity.build.id%" + echo " env.TEAMCITY_BUILD_COMMIT = %system.build.vcs.number%" + echo " env.TEAMCITY_BUILD_REPOSITORY = %vcsroot..url%" + fi + service="teamcity" + branch="$TEAMCITY_BUILD_BRANCH" + build="$TEAMCITY_BUILD_ID" + build_url=$(urlencode "$TEAMCITY_BUILD_URL") + if [ "$TEAMCITY_BUILD_COMMIT" != "" ]; + then + commit="$TEAMCITY_BUILD_COMMIT" + else + commit="$BUILD_VCS_NUMBER" + fi + remove_addr="$TEAMCITY_BUILD_REPOSITORY" + +elif [ "$CI" = "true" ] && [ "$CIRCLECI" = "true" ]; +then + say "$e==>$x Circle CI detected." + # https://circleci.com/docs/environment-variables + service="circleci" + branch="$CIRCLE_BRANCH" + build="$CIRCLE_BUILD_NUM" + job="$CIRCLE_NODE_INDEX" + slug="$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME" + pr="$CIRCLE_PR_NUMBER" + commit="$CIRCLE_SHA1" + search_in="$search_in $CIRCLE_ARTIFACTS $CIRCLE_TEST_REPORTS" + +elif [ "$BUDDYBUILD_BRANCH" != "" ]; +then + say "$e==>$x buddybuild detected" + # http://docs.buddybuild.com/v6/docs/custom-prebuild-and-postbuild-steps + service="buddybuild" + branch="$BUDDYBUILD_BRANCH" + build="$BUDDYBUILD_BUILD_NUMBER" + build_url="https://dashboard.buddybuild.com/public/apps/$BUDDYBUILD_APP_ID/build/$BUDDYBUILD_BUILD_ID" + # BUDDYBUILD_TRIGGERED_BY + if [ "$ddp" = "$(echo ~)/Library/Developer/Xcode/DerivedData" ]; + then + ddp="/private/tmp/sandbox/${BUDDYBUILD_APP_ID}/bbtest" + fi + +elif [ "${bamboo_planRepository_revision}" != "" ]; +then + say "$e==>$x Bamboo detected" + # https://confluence.atlassian.com/bamboo/bamboo-variables-289277087.html#Bamboovariables-Build-specificvariables + service="bamboo" + commit="${bamboo_planRepository_revision}" + branch="${bamboo_planRepository_branch}" + build="${bamboo_buildNumber}" + build_url="${bamboo_buildResultsUrl}" + remote_addr="${bamboo_planRepository_repositoryUrl}" + +elif [ "$CI" = "true" ] && [ "$BITRISE_IO" = "true" ]; +then + # http://devcenter.bitrise.io/faq/available-environment-variables/ + say "$e==>$x Bitrise CI detected." + service="bitrise" + branch="$BITRISE_GIT_BRANCH" + build="$BITRISE_BUILD_NUMBER" + build_url=$(urlencode "$BITRISE_BUILD_URL") + pr="$BITRISE_PULL_REQUEST" + if [ "$GIT_CLONE_COMMIT_HASH" != "" ]; + then + commit="$GIT_CLONE_COMMIT_HASH" + fi + +elif [ "$CI" = "true" ] && [ "$SEMAPHORE" = "true" ]; +then + say "$e==>$x Semaphore CI detected." + # https://semaphoreapp.com/docs/available-environment-variables.html + service="semaphore" + branch="$BRANCH_NAME" + build="$SEMAPHORE_BUILD_NUMBER.$SEMAPHORE_CURRENT_THREAD" + pr="$PULL_REQUEST_NUMBER" + slug="$SEMAPHORE_REPO_SLUG" + commit="$REVISION" + env="$env,SEMAPHORE_TRIGGER_SOURCE" + +elif [ "$CI" = "true" ] && [ "$BUILDKITE" = "true" ]; +then + say "$e==>$x Buildkite CI detected." 
+ # https://buildkite.com/docs/guides/environment-variables + service="buildkite" + branch="$BUILDKITE_BRANCH" + build="$BUILDKITE_BUILD_NUMBER.$BUILDKITE_JOB_ID" + build_url=$(urlencode "$BUILDKITE_BUILD_URL") + slug="$BUILDKITE_PROJECT_SLUG" + commit="$BUILDKITE_COMMIT" + +elif [ "$CI" = "true" ] && [ "$DRONE" = "true" ]; +then + say "$e==>$x Drone CI detected." + # http://docs.drone.io/env.html + # drone commits are not full shas + service="drone.io" + branch="$DRONE_BRANCH" + build="$DRONE_BUILD_NUMBER" + build_url=$(urlencode "${DRONE_BUILD_URL:-$CI_BUILD_URL}") + pr="$DRONE_PULL_REQUEST" + job="$DRONE_JOB_NUMBER" + tag="$DRONE_TAG" + +elif [ "$CI" = "True" ] && [ "$APPVEYOR" = "True" ]; +then + say "$e==>$x Appveyor CI detected." + # http://www.appveyor.com/docs/environment-variables + service="appveyor" + branch="$APPVEYOR_REPO_BRANCH" + build=$(urlencode "$APPVEYOR_JOB_ID") + pr="$APPVEYOR_PULL_REQUEST_NUMBER" + job="$APPVEYOR_ACCOUNT_NAME%2F$APPVEYOR_PROJECT_SLUG%2F$APPVEYOR_BUILD_VERSION" + slug="$APPVEYOR_REPO_NAME" + commit="$APPVEYOR_REPO_COMMIT" + +elif [ "$CI" = "true" ] && [ "$WERCKER_GIT_BRANCH" != "" ]; +then + say "$e==>$x Wercker CI detected." + # http://devcenter.wercker.com/articles/steps/variables.html + service="wercker" + branch="$WERCKER_GIT_BRANCH" + build="$WERCKER_MAIN_PIPELINE_STARTED" + slug="$WERCKER_GIT_OWNER/$WERCKER_GIT_REPOSITORY" + commit="$WERCKER_GIT_COMMIT" + +elif [ "$CI" = "true" ] && [ "$MAGNUM" = "true" ]; +then + say "$e==>$x Magnum CI detected." + # https://magnum-ci.com/docs/environment + service="magnum" + branch="$CI_BRANCH" + build="$CI_BUILD_NUMBER" + commit="$CI_COMMIT" + +elif [ "$CI" = "true" ] && [ "$SNAP_CI" = "true" ]; +then + say "$e==>$x Snap CI detected." + # https://docs.snap-ci.com/environment-variables/ + service="snap" + branch=$([ "$SNAP_BRANCH" != "" ] && echo "$SNAP_BRANCH" || echo "$SNAP_UPSTREAM_BRANCH") + build="$SNAP_PIPELINE_COUNTER" + job="$SNAP_STAGE_NAME" + pr="$SNAP_PULL_REQUEST_NUMBER" + commit=$([ "$SNAP_COMMIT" != "" ] && echo "$SNAP_COMMIT" || echo "$SNAP_UPSTREAM_COMMIT") + env="$env,DISPLAY" + +elif [ "$SHIPPABLE" = "true" ]; +then + say "$e==>$x Shippable CI detected." + # http://docs.shippable.com/ci_configure/ + service="shippable" + branch=$([ "$HEAD_BRANCH" != "" ] && echo "$HEAD_BRANCH" || echo "$BRANCH") + build="$BUILD_NUMBER" + build_url=$(urlencode "$BUILD_URL") + pr="$PULL_REQUEST" + slug="$REPO_FULL_NAME" + commit="$COMMIT" + +elif [ "$TDDIUM" = "true" ]; +then + say "Solano CI detected." + # http://docs.solanolabs.com/Setup/tddium-set-environment-variables/ + service="solano" + commit="$TDDIUM_CURRENT_COMMIT" + branch="$TDDIUM_CURRENT_BRANCH" + build="$TDDIUM_TID" + pr="$TDDIUM_PR_ID" + +elif [ "$GREENHOUSE" = "true" ]; +then + say "$e==>$x Greenhouse CI detected." + # http://docs.greenhouseci.com/docs/environment-variables-files + service="greenhouse" + branch="$GREENHOUSE_BRANCH" + build="$GREENHOUSE_BUILD_NUMBER" + build_url=$(urlencode "$GREENHOUSE_BUILD_URL") + pr="$GREENHOUSE_PULL_REQUEST" + commit="$GREENHOUSE_COMMIT" + search_in="$search_in $GREENHOUSE_EXPORT_DIR" + +elif [ "$GITLAB_CI" != "" ]; +then + say "$e==>$x GitLab CI detected." + # http://doc.gitlab.com/ce/ci/variables/README.html + service="gitlab" + branch="$CI_BUILD_REF_NAME" + build="$CI_BUILD_ID" + remove_addr="$CI_BUILD_REPO" + commit="$CI_BUILD_REF" + +else + say "${r}x>${x} No CI provider detected." 
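+  # fallback: these generic variables can be exported manually before invoking
+  # this script when no supported CI provider is present, e.g.
+  #   export VCS_COMMIT_ID="$(git rev-parse HEAD)"
+  #   export VCS_BRANCH_NAME="$(git rev-parse --abbrev-ref HEAD)"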
+ + commit="$VCS_COMMIT_ID" + branch="$VCS_BRANCH_NAME" + pr="$VCS_PULL_REQUEST" + slug="$VCS_SLUG" + build_url="$CI_BUILD_URL" + build="$CI_BUILD_ID" + +fi + +say " ${e}project root:${x} $git_root" + +# find branch, commit, repo from git command +if [ "$GIT_BRANCH" != "" ]; +then + branch="$GIT_BRANCH" + +elif [ "$branch" = "" ]; +then + branch=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || hg branch 2>/dev/null || echo "") + if [ "$branch" = "HEAD" ]; + then + branch="" + fi +fi + +if [ "$commit_o" = "" ]; +then + # merge commit -> actual commit + mc=$(git log -1 --pretty=%B 2>/dev/null | tr -d '[[:space:]]' || true) + if [[ "$mc" =~ ^Merge[[:space:]][a-z0-9]{40}[[:space:]]into[[:space:]][a-z0-9]{40}$ ]]; + then + say " (DEPRECIATING SOON) Learn more at http://docs.codecov.io/docs/merge-commits" + # Merge xxx into yyy + say " Fixing merge commit sha" + commit=$(echo "$mc" | cut -d' ' -f2) + elif [ "$GIT_COMMIT" != "" ]; + then + commit="$GIT_COMMIT" + elif [ "$commit" = "" ]; + then + commit=$(git log -1 --format="%H" 2>/dev/null || hg id -i --debug 2>/dev/null | tr -d '+' || echo "") + fi +else + commit="$commit_o" +fi + +if [ "$CODECOV_TOKEN" != "" ] && [ "$token" = "" ]; +then + say "${e}-->${x} token set from env" + token="$CODECOV_TOKEN" +fi + +if [ "$CODECOV_URL" != "" ] && [ "$url_o" = "" ]; +then + say "${e}-->${x} url set from env" + url_o=$(echo "$CODECOV_URL" | sed -e 's/\/$//') +fi + +if [ "$CODECOV_SLUG" != "" ]; +then + say "${e}-->${x} slug set from env" + slug_o="$CODECOV_SLUG" + +elif [ "$slug" = "" ]; +then + if [ "$remote_addr" = "" ]; + then + remote_addr=$(git config --get remote.origin.url || hg paths default || echo '') + fi + if [ "$remote_addr" != "" ]; + then + if echo "$remote_addr" | grep -q "//"; then + # https + slug=$(echo "$remote_addr" | cut -d / -f 4,5 | sed -e 's/\.git$//') + else + # ssh + slug=$(echo "$remote_addr" | cut -d : -f 2 | sed -e 's/\.git$//') + fi + fi + if [ "$slug" = "/" ]; + then + slug="" + fi +fi + +yaml=$(cd "$git_root" && \ + git ls-files "*codecov.yml" "*codecov.yaml" 2>/dev/null \ + || hg locate "*codecov.yml" "*codecov.yaml" 2>/dev/null \ + || echo '') +yaml=$(echo "$yaml" | head -1) + +if [ "$yaml" != "" ]; +then + say " ${e}Yaml found at:${x} $yaml" + config=$(parse_yaml $yaml || echo '') + + # TODO validate the yaml here + + if [ "$(echo "$config" | grep 'codecov_token="')" != "" ] && [ "$token" = "" ]; + then + say "${e}-->${x} token set from yaml" + token="$(echo "$config" | grep 'codecov_token="' | sed -e 's/codecov_token="//' | sed -e 's/"\.*//')" + fi + + if [ "$(echo "$config" | grep 'codecov_url="')" != "" ] && [ "$url_o" = "" ]; + then + say "${e}-->${x} url set from yaml" + url_o="$(echo "$config" | grep 'codecov_url="' | sed -e 's/codecov_url="//' | sed -e 's/"\.*//')" + fi + + if [ "$(echo "$config" | grep 'codecov_slug="')" != "" ] && [ "$slug_o" = "" ]; + then + say "${e}-->${x} slug set from yaml" + slug_o="$(echo "$config" | grep 'codecov_slug="' | sed -e 's/codecov_slug="//' | sed -e 's/"\.*//')" + fi +else + say " ${g}Yaml not found, that's ok! 
Learn more at${x} ${b}http://docs.codecov.io/docs/codecov-yaml${x}" + +fi + +if [ "$branch_o" != "" ]; +then + branch=$(urlencode "$branch_o") +else + branch=$(urlencode "$branch") +fi + +query="branch=$branch\ + &commit=$commit\ + &build=$([ "$build_o" = "" ] && echo "$build" || echo "$build_o")\ + &build_url=$build_url\ + &tag=$([ "$tag_o" = "" ] && echo "$tag" || echo "$tag_o")\ + &slug=$([ "$slug_o" = "" ] && urlencode "$slug" || urlencode "$slug_o")\ + &yaml=$(urlencode "$yaml")\ + &service=$service\ + &flags=$flags\ + &pr=$([ "$pr_o" = "" ] && echo "$pr" || echo "$pr_o")\ + &job=$job" + +if [ "$ft_search" = "1" ]; +then + # detect bower comoponents location + bower_components="bower_components" + bower_rc=$(cd "$git_root" && cat .bowerrc 2>/dev/null || echo "") + if [ "$bower_rc" != "" ]; + then + bower_components=$(echo "$bower_rc" | tr -d '\n' | grep '"directory"' | cut -d'"' -f4 | sed -e 's/\/$//') + if [ "$bower_components" = "" ]; + then + bower_components="bower_components" + fi + fi + + # Swift Coverage + if [ "$ft_xcode" = "1" ] && [ -d "$ddp" ]; + then + say "${e}==>${x} Processing Xcode reports" + say " DerivedData folder: $ddp" + profdata_files=$(find "$ddp" -name '*.profdata' 2>/dev/null || echo '') + if [ "$profdata_files" != "" ]; + then + # xcode via profdata + if [ "$xp" = "" ]; + then + # xp=$(xcodebuild -showBuildSettings 2>/dev/null | grep -i "^\s*PRODUCT_NAME" | sed -e 's/.*= \(.*\)/\1/') + # say " ${e}->${x} Speed up Xcode processing by adding ${e}-J '$xp'${x}" + say " ${g}hint${x} Speed up Swift processing by using use -J 'AppName' (accepting regexp)" + fi + while read -r profdata; + do + if [ "$profdata" != "" ]; + then + swiftcov "$profdata" "$xp" + fi + done <<< "$profdata_files" + else + say " ${e}->${x} No Swift coverage found" + fi + + # Obj-C Gcov Coverage + if [ "$ft_gcov" = "1" ]; + then + say " ${e}->${x} Running $gcov_exe for Obj-C" + bash -c "find $ddp -type f -name '*.gcda' $gcov_include $gcov_ignore -exec $gcov_exe -pbcu $gcov_arg {} +" || true + fi + fi + + # Gcov Coverage + if [ "$ft_gcov" = "1" ]; + then + say "${e}==>${x} Running gcov in $proj_root ${e}(disable via -X gcov)${x}" + bash -c "find $proj_root -type f -name '*.gcno' $gcov_include $gcov_ignore -exec $gcov_exe -pb $gcov_arg {} +" || true + else + say "${e}==>${x} gcov disabled" + fi + + # Python Coverage + if [ "$ft_coveragepy" = "1" ]; + then + if [ ! -f coverage.xml ]; + then + if which coverage >/dev/null 2>&1; + then + say "${e}==>${x} Python coveragepy exists ${e}disable via -X coveragepy${x}" + + dotcoverage=$(find "$git_root" -name '.coverage' -or -name '.coverage.*' | head -1 || echo '') + if [ "$dotcoverage" != "" ]; + then + cd "$(dirname "$dotcoverage")" + if [ ! -f .coverage ]; + then + say " ${e}->${x} Running coverage combine" + coverage combine + fi + say " ${e}->${x} Running coverage xml" + if [ "$(coverage xml -i)" != "No data to report." 
]; + then + files="$files +$PWD/coverage.xml" + else + say " ${r}No data to report.${x}" + fi + cd "$proj_root" + else + say " ${r}No .coverage file found.${x}" + fi + else + say "${e}==>${x} Python coveragepy not found" + fi + fi + else + say "${e}==>${x} Python coveragepy disabled" + fi + + if [ "$search_in_o" != "" ]; + then + # location override + search_in="$search_in_o" + fi + + say "$e==>$x Searching for coverage reports in:" + for _path in $search_in + do + say " ${g}+${x} $_path" + done + + patterns="find $search_in -type f \( -name '*coverage*.*' \ + -or -name 'nosetests.xml' \ + -or -name 'jacoco*.xml' \ + -or -name 'clover.xml' \ + -or -name 'report.xml' \ + -or -name '*.codecov.*' \ + -or -name 'codecov.*' \ + -or -name 'cobertura.xml' \ + -or -name 'excoveralls.json' \ + -or -name 'luacov.report.out' \ + -or -name 'coverage-final.json' \ + -or -name 'naxsi.info' \ + -or -name 'lcov.info' \ + -or -name 'lcov.dat' \ + -or -name '*.lcov' \ + -or -name '*.clover' \ + -or -name 'cover.out' \ + -or -name 'gcov.info' \ + -or -name '*.gcov' \ + -or -name '*.lst' \ + $include_cov \) \ + $exclude_cov \ + -not -name '*.profdata' \ + -not -name 'coverage-summary.json' \ + -not -name 'phpunit-code-coverage.xml' \ + -not -name 'remapInstanbul.coverage*.json' \ + -not -name '*codecov.yml' \ + -not -name '*.serialized' \ + -not -name '.coverage*' \ + -not -name '.*coveragerc' \ + -not -name '*.sh' \ + -not -name '*.bat' \ + -not -name '*.ps1' \ + -not -name '*.rst' \ + -not -name '*.sbt' \ + -not -name '*.xcoverage.*' \ + -not -name '*.gz' \ + -not -name '*.conf' \ + -not -name '*.p12' \ + -not -name '*.csv' \ + -not -name '*.rsp' \ + -not -name '*.m4' \ + -not -name '*.am' \ + -not -name '*.template' \ + -not -name '*.cp' \ + -not -name '*.bw' \ + -not -name '*.crt' \ + -not -name '*.log' \ + -not -name '*.cmake' \ + -not -name '*.pth' \ + -not -name '*.in' \ + -not -name '*.jar*' \ + -not -name '*.pom*' \ + -not -name '*.png' \ + -not -name '*.jpg' \ + -not -name '*.sql' \ + -not -name '*.jpeg' \ + -not -name '*.svg' \ + -not -name '*.gif' \ + -not -name '*.csv' \ + -not -name '*.snapshot' \ + -not -name '*.mak*' \ + -not -name '*.bash' \ + -not -name '*.data' \ + -not -name '*.py' \ + -not -name '*.class' \ + -not -name '*.xcconfig' \ + -not -name '*.ec' \ + -not -name '*.coverage' \ + -not -name '*.pyc' \ + -not -name '*.cfg' \ + -not -name '*.egg' \ + -not -name '*.ru' \ + -not -name '*.css' \ + -not -name '*.less' \ + -not -name '*.pyo' \ + -not -name '*.whl' \ + -not -name '*.html' \ + -not -name '*.ftl' \ + -not -name '*.erb' \ + -not -name '*.rb' \ + -not -name '*.js' \ + -not -name '*.jade' \ + -not -name '*.db' \ + -not -name '*.md' \ + -not -name '*.cpp' \ + -not -name '*.gradle' \ + -not -name '*.tar.tz' \ + -not -name '*.scss' \ + -not -name 'include.lst' \ + -not -name 'fullLocaleNames.lst' \ + -not -name 'inputFiles.lst' \ + -not -name 'createdFiles.lst' \ + -not -name 'scoverage.measurements.*' \ + -not -name 'test_*_coverage.txt' \ + -not -name 'testrunner-coverage*' \ + -not -path '*/vendor/*' \ + -not -path '*/htmlcov/*' \ + -not -path '*/home/cainus/*' \ + -not -path '*/virtualenv/*' \ + -not -path '*/js/generated/coverage/*' \ + -not -path '*/.virtualenv/*' \ + -not -path '*/virtualenvs/*' \ + -not -path '*/.virtualenvs/*' \ + -not -path '*/.env/*' \ + -not -path '*/.envs/*' \ + -not -path '*/env/*' \ + -not -path '*/envs/*' \ + -not -path '*/.venv/*' \ + -not -path '*/.venvs/*' \ + -not -path '*/venv/*' \ + -not -path '*/venvs/*' \ + -not -path '*/.git/*' \ + -not 
-path '*/.hg/*' \ + -not -path '*/.tox/*' \ + -not -path '*/__pycache__/*' \ + -not -path '*/.egg-info*' \ + -not -path '*/$bower_components/*' \ + -not -path '*/node_modules/*' \ + -not -path '*/conftest_*.c.gcov' 2>/dev/null" + files=$(eval "$patterns" || echo '') + +elif [ "$include_cov" != "" ]; +then + files=$(eval "find $search_in -type f \( ${include_cov:5} \)$exclude_cov 2>/dev/null" || echo '') +fi + +num_of_files=$(echo "$files" | wc -l | tr -d ' ') +if [ "$num_of_files" != '' ] && [ "$files" != '' ]; +then + say " ${e}->${x} Found $num_of_files reports" +fi + +# no files found +if [ "$files" = "" ]; +then + say "${r}-->${x} No coverage report found." + say " Please visit ${b}http://docs.codecov.io/docs/supported-languages${x}" + exit ${exit_with}; +fi + +if [ "$ft_network" == "1" ]; +then + say "${e}==>${x} Detecting git/mercurial file structure" + network=$(cd "$git_root" && git ls-files 2>/dev/null || hg locate 2>/dev/null || echo "") + if [ "$network" = "" ]; + then + network=$(find "$git_root" -type f \ + -not -path '*/virtualenv/*' \ + -not -path '*/.virtualenv/*' \ + -not -path '*/virtualenvs/*' \ + -not -path '*/.virtualenvs/*' \ + -not -path '*.png' \ + -not -path '*.gif' \ + -not -path '*.jpg' \ + -not -path '*.jpeg' \ + -not -path '*.md' \ + -not -path '*/.env/*' \ + -not -path '*/.envs/*' \ + -not -path '*/env/*' \ + -not -path '*/envs/*' \ + -not -path '*/.venv/*' \ + -not -path '*/.venvs/*' \ + -not -path '*/venv/*' \ + -not -path '*/venvs/*' \ + -not -path '*/build/lib/*' \ + -not -path '*/.git/*' \ + -not -path '*/.egg-info/*' \ + -not -path '*/shunit2-2.1.6/*' \ + -not -path '*/vendor/*' \ + -not -path '*/js/generated/coverage/*' \ + -not -path '*/__pycache__/*' \ + -not -path '*/node_modules/*' \ + -not -path "*/$bower_components/*" 2>/dev/null || echo '') + fi +fi + +upload_file=`mktemp /tmp/codecov.XXXXXX` + +cleanup() { + rm -f $upload_file +} + +trap cleanup INT ABRT TERM + +if [ "$env" != "" ]; +then + inc_env="" + say "${e}==>${x} Appending build variables" + for varname in $(echo "$env" | tr ',' ' ') + do + if [ "$varname" != "" ]; + then + say " ${g}+${x} $varname" + inc_env="${inc_env}${varname}=$(eval echo "\$${varname}") +" + fi + done + +echo "$inc_env<<<<<< ENV" >> $upload_file +fi + +if [ "$ft_network" == "1" ]; +then + echo "$network +<<<<<< network" >> $upload_file +fi + +fr=0 +say "${e}==>${x} Reading reports" +while IFS='' read -r file; +do + # read the coverage file + if [ "$(echo "$file" | tr -d ' ')" != '' ]; + then + if [ -f "$file" ]; + then + report_len=$(wc -c < "$file") + if [ "$report_len" -ne 0 ]; + then + say " ${g}+${x} $file ${e}bytes=$(echo "$report_len" | tr -d ' ')${x}" + # append to to upload + echo "# path=$(echo "$file" | sed "s|^$git_root/||")" >> $upload_file + cat "$file" >> $upload_file + echo "<<<<<< EOF" >> $upload_file + fr=1 + if [ "$clean" = "1" ]; + then + rm "$file" + fi + else + say " ${r}-${x} Skipping empty file $file" + fi + else + say " ${r}-${x} file not found at $file" + fi + fi +done <<< "$(echo -e "$files")" + +if [ "$fr" = "0" ]; +then + say "${r}-->${x} No coverage data found." + say " Please visit ${b}http://docs.codecov.io/docs/supported-languages${x}" + say " search for your projects language to learn how to collect reports." 
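+  # exit_with is 0 unless -Z was passed, so missing coverage data does not fail the build by default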
+ exit ${exit_with}; +fi + +if [ "$ft_fix" = "1" ]; +then + say "${e}==>${x} Appending adjustments" + say " ${b}http://docs.codecov.io/docs/fixing-reports${x}" + adjustments="" + + if echo "$network" | grep -m1 '.kt$' 1>/dev/null; + then + # skip brackets and comments + adjustments="$adjustments +$(find "$git_root" -type f -name '*.kt' -exec grep -nIH '^[[:space:]]*}$' {} \; | cut -f1-2 -d: 2>/dev/null || echo '') +$(find "$git_root" -type f -name '*.kt' -exec grep -nIH '^/\*' {} \; | cut -f1-2 -d: 2>/dev/null || echo '')" + fi + + if echo "$network" | grep -m1 '.go$' 1>/dev/null; + then + # skip empty lines, comments, and brackets + adjustments="$adjustments +$(find "$git_root" -type f -not -path '*/vendor/*' -name '*.go' -exec grep -nIH \ + -e '^[[:space:]]*$' \ + -e '^[[:space:]]*//.*' \ + -e '^[[:space:]]*/\*' \ + -e '^[[:space:]]*\*/' \ + -e '^[[:space:]]*}$' \ + {} \; | cut -f1-2 -d: 2>/dev/null || echo '')" + fi + + if echo "$network" | grep -m1 '.jsx$' 1>/dev/null; + then + # skip empty lines, comments, and brackets + adjustments="$adjustments +$(find "$git_root" -type f -name '*.jsx' -exec grep -nIH \ + -e '^[[:space:]]*$' \ + -e '^[[:space:]]*//.*' \ + -e '^[[:space:]]*/\*' \ + -e '^[[:space:]]*\*/' \ + -e '^[[:space:]]*}$' \ + {} \; | cut -f1-2 -d: 2>/dev/null || echo '')" + fi + + if echo "$network" | grep -m1 '.php$' 1>/dev/null; + then + # skip empty lines, comments, and brackets + adjustments="$adjustments +$(find "$git_root" -type f -not -path '*/vendor/*' -name '*.php' -exec grep -nIH \ + -e '^[[:space:]]*[\{\}\\[][[:space:]]*$' \ + -e '^[[:space:]]*);[[:space:]]*$' \ + -e '^[[:space:]]*][[:space:]]*$' \ + {} \; | cut -f1-2 -d: 2>/dev/null || echo '')" + fi + + if echo "$network" | grep -m1 '\(.cpp\|.h\|.cxx\|.c\|.hpp\|.m\)$' 1>/dev/null; + then + # skip brackets + adjustments="$adjustments +$(find "$git_root" -type f \( -name '*.h' -or -name '*.cpp' -or -name '*.cxx' -or -name '*.m' -or -name '*.c' -or -name '*.hpp' \) -exec grep -nIH '^[[:space:]]*}' {} \; | cut -f1-2 -d: 2>/dev/null || echo '')" + fi + + # join files into single line + found=$(echo "$adjustments" | wc -l | tr -d ' ') + adjustments=$(echo "$adjustments" | awk 'BEGIN { FS=":" } + $1!=key { if (key!="") print out ; key=$1 ; out=$0 ; next } + { out=out","$2 } + END { print out }') + + if echo "$network" | grep -m1 '\(.cpp\|.h\|.cxx\|.c\|.hpp\|.m\)$' 1>/dev/null; + then + # skip LCOV_EXCL + adjustments="$adjustments +$(find "$git_root" -type f \( -name '*.h' -or -name '*.cpp' -or -name '*.cxx' -or -name '*.m' -or -name '*.c' -or -name '*.hpp' \) -exec grep -nIH '// LCOV_EXCL' {} \; 2>/dev/null || echo '')" + fi + + if echo "$network" | grep -m1 '.kt$' 1>/dev/null; + then + # last line in file + adjustments="$adjustments +$(find "$git_root" -type f -name '*.kt' -exec wc -l {} \; | while read l; do echo "EOF: $l"; done 2>/dev/null || echo '')" + fi + + if [ "$found" != "1" ]; + then + say " ${g}+${x} Found $found adjustments" + echo "# path=fixes +$adjustments +<<<<<< EOF" >> $upload_file + else + say " ${e}->${x} Found 0 adjustments" + fi +fi + +if [ "$url_o" != "" ]; +then + url="$url_o" +fi +# trim whitespace from query + +if [ "$dump" != "0" ]; +then + echo "$url/upload/v4?$(echo "package=bash-$VERSION&token=$token&$query" | tr -d ' ')" + cat $upload_file +else + + query=$(echo "${query}" | tr -d ' ') + say "${e}==>${x} Uploading reports" + say " ${e}url:${x} $url" + say " ${e}query:${x} $query" + + # now add token to query + query=$(echo "package=bash-$VERSION&token=$token&$query" | tr -d ' ') + + if [ 
"$ft_s3" = "1" ]; + then + say " ${e}->${x} Pinging Codecov" + res=$(curl -sX $cacert POST "$url/upload/v4?$query" -H 'Accept: text/plain' || true) + # a good replay is "https://codecov.io" + "\n" + "https://codecov.s3.amazonaws.com/..." + status=$(echo "$res" | head -1 | grep 'HTTP ' | cut -d' ' -f2) + if [ "$status" = "" ]; + then + s3target=$(echo "$res" | sed -n 2p) + say " ${e}->${x} Uploading to S3 $(echo "$s3target" | cut -c1-32)" + s3=$(curl -fisX PUT --data-binary @$upload_file \ + -H 'Content-Type: text/plain' \ + -H 'x-amz-acl: public-read' \ + -H 'x-amz-storage-class: REDUCED_REDUNDANCY' \ + "$s3target" || true) + if [ "$s3" != "" ]; + then + say " ${g}->${x} View reports at ${b}$(echo "$res" | sed -n 1p)${x}" + exit 0 + else + say " ${r}X>${x} Failed to upload to S3" + fi + elif [ "$status" = "400" ]; + then + # 400 Error + say "${g}${res}${x}" + exit ${exit_with} + fi + fi + + say " ${e}->${x} Uploading to Codecov" + i="0" + while [ $i -lt 4 ] + do + i=$[$i+1] + + res=$(curl -sX POST $cacert --data-binary @$upload_file "$url/upload/v2?$query" -H 'Accept: text/plain' || echo 'HTTP 500') + # HTTP 200 + # http://.... + status=$(echo "$res" | head -1 | cut -d' ' -f2) + if [ "$status" = "" ]; + then + say " View reports at ${b}$(echo "$res" | head -2 | tail -1)${x}" + exit 0 + + elif [ "${status:0:1}" = "5" ]; + then + say " ${e}->${x} Sleeping for 10s and trying again..." + sleep 10 + + else + say " ${g}${res}${x}" + exit 0 + exit ${exit_with} + fi + + done + +fi + +say " ${r}X> Failed to upload coverage reports${x}" +exit ${exit_with} + +# EOF diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..afff394 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,27 @@ +Dear Gobblin maintainers, + +Please accept this PR. I understand that it will not be reviewed until I have checked off all the steps below! + + +### JIRA +- [ ] My PR addresses the following [Gobblin JIRA](https://issues.apache.org/jira/browse/GOBBLIN/) issues and references them in the PR title. For example, "[GOBBLIN-XXX] My Gobblin PR" + - https://issues.apache.org/jira/browse/GOBBLIN-XXX + + +### Description +- [ ] Here are some details about my PR, including screenshots (if applicable): + + +### Tests +- [ ] My PR adds the following unit tests __OR__ does not need testing for this extremely good reason: + + +### Commits +- [ ] My commits all reference JIRA issues in their subject lines, and I have squashed multiple commits if they address the same issue. In addition, my commits follow the guidelines from "[How to write a good git commit message](http://chris.beams.io/posts/git-commit/)": + 1. Subject is separated from body by a blank line + 2. Subject is limited to 50 characters + 3. Subject does not end with a period + 4. Subject uses the imperative mood ("add", not "adding") + 5. Body wraps at 72 characters + 6. Body explains "what" and "why", not "how" + diff --git a/.github/workflows/build_and_test.yaml b/.github/workflows/build_and_test.yaml new file mode 100644 index 0000000..d805187 --- /dev/null +++ b/.github/workflows/build_and_test.yaml @@ -0,0 +1,151 @@ +# Copyright 2021 LinkedIn Corporation. All rights reserved. +# Licensed under the BSD-2 Clause license. +# See LICENSE in the project root for license information. 
+ +name: Build and Run Tests + +on: + push: + # Publish only on `master` + branches: + - master + pull_request: + branches: + - master + release: + types: [published, edited] + +jobs: + build: + name: Build repository + runs-on: ubuntu-latest + steps: + - name: Check out the repo + uses: actions/checkout@v2 + - name: Set up JDK 1.8 + uses: actions/setup-java@v1 + with: + java-version: 1.8 + # Stores external dependencies, can be further improved with Gradle 6.1 + - name: Cache Gradle Dependencies + uses: actions/cache@v2 + with: + path: | + ~/.gradle/caches + ~/.gradle/wrapper + # Only rebuild cache if build.gradle is changed + key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle') }} + restore-keys: ${{ runner.os }}-gradle + - name: Build repository + run: | + ./gradlew --no-daemon clean build -x test -x javadoc -x findbugsMain -x findbugsTest -x checkstyleMain -x checkstyleJmh -x checkstyleTest -Dorg.gradle.parallel=true + + test_coverage: + runs-on: ubuntu-latest + name: Generate test coverage + needs: build + steps: + - name: Check out the repo + uses: actions/checkout@v2 + - name: Set up JDK 1.8 + uses: actions/setup-java@v1 + with: + java-version: 1.8 + - name: Cache Gradle Dependencies + uses: actions/cache@v2 + with: + path: | + ~/.gradle/caches + ~/.gradle/wrapper + # Only rebuild cache if build.gradle is changed + key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle') }} + restore-keys: ${{ runner.os }}-gradle + - name: Generate code coverage + run: | + ./gradlew -PskipTestGroup=disabledOnCI -Dorg.gradle.parallel=false -DjacocoBuild=1 $GOBBLIN_GRADLE_OPTS jacocoTestCoverage + + static_checks: + name: Run static checks + runs-on: ubuntu-latest + needs: build + steps: + - name: Check out the repo + uses: actions/checkout@v2 + - name: Set up JDK 1.8 + uses: actions/setup-java@v1 + with: + java-version: 1.8 + # Stores external dependencies, can be further improved with Gradle 6.1 + - name: Cache Gradle Dependencies + uses: actions/cache@v2 + with: + path: | + ~/.gradle/caches + ~/.gradle/wrapper + # Only rebuild cache if build.gradle is changed + key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle') }} + restore-keys: ${{ runner.os }}-gradle + - name: Run CheckStyle and FindBugs + run: | + ./gradlew --no-daemon -x javadoc findbugsMain checkstyleMain checkstyleTest checkstyleJmh + + run_tests: + timeout-minutes: 60 + env: + GOBBLIN_GRADLE_OPTS: "--no-daemon -Dgobblin.metastore.testing.embeddedMysqlEnabled=false -PusePreinstalledMysql=true" + strategy: + matrix: + test-group: ["Core Tests", "Service Tests", "Module Tests", "Other Tests"] + fail-fast: false + runs-on: ubuntu-latest + needs: build + services: + mysql: + image: mysql:5.7.32 + env: + MYSQL_USER: testUser + MYSQL_PASSWORD: testPassword + MYSQL_DATABASE: test + MYSQL_ROOT_PASSWORD: password + ports: + - 3306:3306 + options: --health-cmd="mysqladmin ping" --health-interval=10s --health-timeout=5s --health-retries=5 + steps: + - name: Check out the repo + uses: actions/checkout@v2 + - name: Set up JDK 1.8 + uses: actions/setup-java@v1 + with: + java-version: 1.8 + - name: Verify mysql connection + run: | + sudo apt-get install -y mysql-client + mysql --host 127.0.0.1 --port 3306 -uroot -ppassword -e "SHOW DATABASES" + - name: Cache Gradle Dependencies + uses: actions/cache@v2 + with: + path: | + ~/.gradle/caches + ~/.gradle/wrapper + # Only rebuild cache if build.gradle is changed + key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle') }} + restore-keys: ${{ runner.os }}-gradle + - name: Run test group ${{ 
matrix.test-group }} + # Write retry logic as integration tests can fail due to timing out or network problems + run: | + ./gradlew getGroupedTests -PgroupName="${{matrix.test-group}}" > temp.txt + TASKS=$(sed -n 's/CI Task: //p' temp.txt) + echo $TASKS + + n=0 + until [ "$n" -ge 3 ] + do + ./gradlew -PskipTestGroup=disabledOnCI $GOBBLIN_GRADLE_OPTS $TASKS -Dorg.gradle.parallel=false && break + n=$((n+1)) + if [[ $n -lt 3 ]]; then + echo "Tests failed, retry attempt number $n" + else + exit 1 + fi + sleep 10 + done diff --git a/.github/workflows/docker_build_publish.yaml b/.github/workflows/docker_build_publish.yaml new file mode 100644 index 0000000..4c2ba93 --- /dev/null +++ b/.github/workflows/docker_build_publish.yaml @@ -0,0 +1,61 @@ +# Copyright 2021 LinkedIn Corporation. All rights reserved. +# Licensed under the BSD-2 Clause license. +# See LICENSE in the project root for license information. + +name: Build and Publish Docker image +on: + push: + # Publish only on `master` + branches: + - master + pull_request: + branches: + - master + paths: + - 'gobblin-docker/**' + - '.github/workflows/docker_build_publish.yaml' + release: + types: [published, edited] + +env: + IMAGE_NAME: apache/gobblin + +jobs: + build_and_push_to_registry: + name: Build docker images and publish to DockerHub + runs-on: ubuntu-latest + steps: + - name: Check out the repo + uses: actions/checkout@v2 + - name: Build Docker Tag + id: build_tag + run: | + SHA=`echo ${{ github.sha }} | head -c 7` + if [[ ${{ github.event_name }} == 'release' ]]; then + TAG="${{ env.IMAGE_NAME }}:sha-$SHA, ${{ env.IMAGE_NAME }}:${{ github.event.release.tag_name }}, ${{ env.IMAGE_NAME }}:latest" + else + TAG="${{ env.IMAGE_NAME }}:sha-$SHA" + fi + echo "tag=$TAG" + echo "::set-output name=tag::$TAG" + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + - name: Login to DockerHub + uses: docker/login-action@v1 + if: github.event_name != 'pull_request' + with: + username: ${{ secrets.DOCKERHUB_USER }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + # - name: Login to GitHub Container Registry + # if: github.event_name != 'pull_request' + # uses: docker/login-action@v1 + # with: + # registry: ghcr.io + # username: ${{ github.repository_owner }} + # password: ${{ secrets.CR_PAT }} + - name: Build Images and Publish + uses: docker/build-push-action@v2 + with: + tags: ${{ steps.build_tag.outputs.tag }} + push: ${{ github.event_name == 'release' }} + file: ./gobblin-docker/gobblin/alpine-gobblin-latest/Dockerfile diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..51d31b5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,59 @@ +.classpath* +.project* +.settings +.DS_Store +*.tar.gz + +# Intellij related +**/.idea +**/*.iml +**/*.iws +**/*.ipr +.shelf/ +.ideaDataSources/ + +# VS Code related +.vscode + +**/.classpath +**/.project +**/.settings + +**/*.swp +**/*.swo +**/*.log + +**/build/ +.gradle +**/.gradle +gradle.properties.release +test-output +**/test-output +dist +target +tmp +out +**/out +output +/eclipse_build +.project +.classpath +out/ +*/bin/ +**/mainGeneratedDataTemplate +**/mainGeneratedRest +**/main/snapshot +**.tar.gz +*~ +metastore_db/ + +# generated nodeJs files +node_modules/ +package-lock.json + +*.out + +# generated java files +**/gen-java/ + +temp/ diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..427e5fb --- /dev/null +++ b/.travis.yml @@ -0,0 +1,65 @@ +language: java + +dist: bionic +sudo: required + +jdk: + - openjdk8 + +addons: + apt: + packages: + - libaio-dev + - 
libdbus-glib-1-dev + - xsltproc + +before_cache: + - rm -f $HOME/.gradle/caches/modules-2/modules-2.lock + +cache: + directories: + - $HOME/.gradle/caches/ + - $HOME/.gradle/wrapper/ + +before_install: + +services: + - xvfb + - mysql + +stages: + - test + - name: deploy + if: branch = master + +before_script: + - mysql -uroot -e "create user testUser identified by 'testPassword';" + - mysql -uroot -e "SET PASSWORD FOR 'root'@'localhost' = PASSWORD('password')" + +script: + - travis_retry ./travis/test.sh + - travis_retry ./gradlew jacocoTestReport +after_success: + - bash <(cat .codecov_bash) +after_failure: ./travis/junit-errors-to-stdout.sh + +jobs: + include: + - stage: deploy + env: RUN_TEST_GROUP=none + install: skip + script: + - travis_retry ./travis/bintrayDeploy.sh + +env: + jobs: + - RUN_TEST_GROUP=build + - RUN_TEST_GROUP=default + - RUN_TEST_GROUP=group1 + - RUN_TEST_GROUP=coverage + global: + - secure: U72nmzXq7kXcIabiwvrTF+WkNFQxov2ACd8oPxWSHsyzRsfVJN42vT7gS3dLbH5G5claXG2p+rur4ueVffzYSwJ8B9OP6gTB8sNJnGr9zyxni4OJchyKqOYuj+UBpEQC/7qXKMCKnuJndsf1OvndDh/V1SH0DSSUuA6mDtgO/eM= + - secure: WiK7tyFV68xdkIfUlWreUHgEGGjaCBd73O4SbjE9AsbqqF7D+Iu8iRo1OhKQj+6eajUH9Eoev9rVN74FQgUfeNzrOkYsgDysXmyZ7+UxFokijFcATJmIBompA3dySGU2qXeKbJMNuUjXgrRIludaV6h2ahL6Fji42cgK4I3s2qs= + +jdk: + - openjdk8 diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..29fcc6e --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,11 @@ +DIL 0.0.1 +-------------- + +###Created Date: 4/10/2021 + +## HIGHLIGHTS +* Initial publication of DIL to open source + +## NEW FEATURES + +##IMPROVEMENTS diff --git a/CONTRIBUTION.md b/CONTRIBUTION.md new file mode 100644 index 0000000..426cbef --- /dev/null +++ b/CONTRIBUTION.md @@ -0,0 +1,28 @@ +Contribution Agreement +====================== + +As a contributor, you represent that the code you submit is your +original work or that of your employer (in which case you represent +you have the right to bind your employer). By submitting code, you +(and, if applicable, your employer) are licensing the submitted code +to LinkedIn and the open source community subject to the BSD 2-Clause +license. + +Responsible Disclosure of Security Vulnerabilities +================================================== + +Please do not file reports on Github for security issues. Please +review the guidelines on at (link to more info). Reports should be +encrypted using PGP (link to PGP key) and sent to +security@linkedin.com preferably with the title "Github +linkedin/ - ". + +Tips for Getting Your Pull Request Accepted +=========================================== + +1. Make sure all new features are tested, and the tests pass. +2. Bug fixes must include a test case demonstrating the error that it + fixes. +3. Open an issue first and seek advice for your change before + submitting a pull request. Large features which have never been + discussed are unlikely to be accepted. diff --git a/HEADER b/HEADER new file mode 100644 index 0000000..4cea293 --- /dev/null +++ b/HEADER @@ -0,0 +1,3 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..99f08f3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,28 @@ +BSD 2-CLAUSE LICENSE + +Copyright 2021 LinkedIn Corporation +All Rights Reserved. 
+ +Redistribution and use in source and binary forms, with or +without modification, are permitted provided that the following +conditions are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials provided +with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/NOTICE b/NOTICE new file mode 100644 index 0000000..c3dc4d2 --- /dev/null +++ b/NOTICE @@ -0,0 +1,4 @@ +Copyright 2021 LinkedIn Corporation. All Rights Reserved. + +Licensed under the BSD 2-Clause License (the "License"). +See License in the project root for license information. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..bc25839 --- /dev/null +++ b/README.md @@ -0,0 +1,31 @@ +# Data Integration Library + +LinkedIn Data Integration Library (DIL) is a collection of generic data integration components that can be mix-and-matched to form powerful ready-to-use connectors, which can then be used by data integration frameworks like [Apache Gobblin](https://gobblin.apache.org) or event processing frameworks like [Apache Kafka](https://kafka.apache.org/) to ingress or egress data between cloud services or APIs. + +# Highlights +- Generic components: data transmission protocol components and data format components are generically designed without one depending on another, greatly relieved the challenges in handling the variety of cloud APIs and services. +- Multistage architecture: data integration is never a one-step process, the library inherently supports multi-staged integration processes so that complex data integration scenarios can be handled with simple generic components. +- Bidirectional transmission: ingress and egress are just business logic in DIL, both work the same way and use the same set of configurations, as ingress to one end is egress to the other end. +- Extensible ompression and encryption: users can easily add pluggable and extensible data compression and encryption algorithms. +- Flexible pagaination: DIL supports a wide range of pagination methods to break large payloads to small chunks. + +# Common Patterns used in production +- Asynchronous bulk ingestion from Rest APIs, like Salesforce.com, to Data Lake (HDFS, S3, ADLS) +- Data upload to Rest APIs, like Google API, with tracking of responses +- Ingest data from one Rest API and egress to another (Rest API) on cloud + +# Requirements +* Java >= 1.8 + +If building the distribution with tests turned on: +* Maven version 3.5.3 + +# Instructions to build the distribution +1. 
Extract the archive file to your local directory. +2. Skip tests and build the distribution: +Run `./gradlew build -x findbugsMain -x test -x rat -x checkstyleMain` +The distribution will be created in build/distribution directory. +(or) +3. Run tests and build the distribution (requires Maven): +Run `./gradlew build` +The distribution will be created in build/distribution directory. \ No newline at end of file diff --git a/build.gradle b/build.gradle new file mode 100644 index 0000000..d6f6071 --- /dev/null +++ b/build.gradle @@ -0,0 +1,136 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + + +apply from: 'gradle/scripts/environment.gradle' + +buildscript { + apply from: 'gradle/scripts/repositories.gradle' + apply from: 'gradle/scripts/defaultBuildProperties.gradle' + apply from: 'gradle/scripts/computeVersions.gradle' + apply from: 'gradle/scripts/ci-support.gradle' + + apply from: file('gradle/scripts/buildscript.gradle'), to: buildscript + + buildscript.repositories.addAll(project.repositories) + + dependencies { + classpath 'org.apache.ant:ant:1.9.4' + classpath 'gradle.plugin.org.inferred:gradle-processors:1.1.2' + classpath 'org.kt3k.gradle.plugin:coveralls-gradle-plugin:1.0.2' + classpath 'io.spring.gradle:dependency-management-plugin:0.6.0.RELEASE' + classpath 'me.champeau.gradle:jmh-gradle-plugin:0.4.8' + classpath "gradle.plugin.nl.javadude.gradle.plugins:license-gradle-plugin:0.14.0" + classpath 'com.jfrog.bintray.gradle:gradle-bintray-plugin:1.+' + classpath 'com.github.jengelman.gradle.plugins:shadow:5.2.0' + } + + repositories { + maven { + url "https://plugins.gradle.org/m2/" + } + } + +} + +apply plugin: "com.github.hierynomus.license" + +downloadLicenses { + includeProjectDependencies = true + + ext.apacheTwo = license('Apache License, Version 2.0', 'http://opensource.org/licenses/Apache-2.0') + ext.bsd = license('BSD License', 'http://www.opensource.org/licenses/bsd-license.php') + ext.bsd3 = license('BSD 3-clause', 'https://opensource.org/licenses/BSD-3-Clause') + ext.cddlGplv2 = license('CDDL + GPLv2 with classpath exception', 'https://glassfish.dev.java.net/nonav/public/CDDL+GPL.html') + ext.spl = license('SPL', 'http://opensource.org/licenses/SPL-1.0'); + ext.epl = license('Eclipse Public License - Version 1.0', 'https://opensource.org/licenses/EPL-1.0') + + licenses = [ + (group('com.linkedin.gobblin')) : apacheTwo, + // https://zookeeper.apache.org/ + (group('org.apache.zookeeper')) : apacheTwo, + // http://testng.org/doc/ + (group('org.testng:testng')) : apacheTwo, + // https://db.apache.org/jdo/index.html + (group('javax.jdo')) : apacheTwo, + // Based on other javax licenses + (group('javax.servlet.jsp')) : cddlGplv2, + (group('javax.servlet')) : cddlGplv2, + (group('javax.transaction')) : cddlGplv2, + // http://commons.apache.org/proper/commons-beanutils/ + (group('commons-beanutils')) : apacheTwo, + // http://jakarta.apache.org/regexp/ + (group('regexp')) : apacheTwo, + // http://forge.ow2.org/projects/asm/ + (group('asm')) : bsd, + // https://github.com/codehaus/jettison/blob/master/pom.xml + (group('org.codehaus.jettison')) : apacheTwo, + // https://github.com/stephenc/jcip-annotations/blob/master/LICENSE.txt + (group('net.jcip')) : apacheTwo, + // https://github.com/google/re2j/blob/master/LICENSE + (group('com.google.re2j')) : bsd3, + // https://github.com/findbugsproject/findbugs/issues/128 + (group('com.google.code.findbugs')) : bsd3, 
+ // http://www.beanshell.org/license.html + (group('org.beanshell')) : spl, + // http://www.mchange.com/projects/c3p0/ + (group('c3p0')) : epl, + 'antlr-runtime-3.4.jar' : apacheTwo + ] + aliases = [ + (apacheTwo) : ['The Apache Software License, Version 2.0', 'Apache 2', 'Apache License Version 2.0', + 'Apache License, Version 2.0', 'Apache License 2.0', 'Apache Version 2.0, January 2004', + license('Apache License', 'http://www.apache.org/licenses/LICENSE-2.0')], + (bsd) : ['BSD', license('New BSD License', 'http://www.opensource.org/licenses/bsd-license.php')] + ] + dependencyConfiguration = 'compile' + excludeDependencies = [ + ] +} + +apply from: 'gradle/scripts/repositories.gradle' + +apply plugin: 'org.inferred.processors' +apply plugin: 'io.spring.dependency-management' + +apply from: 'gradle/scripts/configureSubprojects.gradle' +apply from: 'gradle/scripts/idesSetup.gradle' +apply from: 'gradle/scripts/jacoco-coveralls-support.gradle' + +apply from: 'gradle/scripts/dependencyDefinitions.gradle' +apply from: 'gradle/scripts/restli.gradle' + +apply from: 'gradle/scripts/testSetup.gradle' +apply from: 'gradle/scripts/globalDependencies.gradle' +apply from: 'gradle/scripts/javaPlugin.gradle' + +apply from: 'gradle/scripts/utilities.gradle' + +apply from: 'gradle/scripts/javadoc.gradle' +apply from: 'gradle/scripts/sourcesJar.gradle' + +apply from: 'gradle/scripts/mavenPublishing.gradle' +apply from: 'gradle/scripts/nexusPublishing.gradle' +apply from: 'gradle/scripts/bintrayPublishing.gradle' +apply from: 'gradle/scripts/javaVersionCheck.gradle' +apply from: 'gradle/scripts/release.gradle' + +/* + * Hack for upgrading pegasus to version 11.0.0. For some reason, the gradle-plugins in + * that version fails to bring in "tools.jar" into the classpath. The rest.li team is actively + * seeking for a clean fix. This part will be removed later when the fix is ready + */ +allprojects { + tasks.matching { it.name == 'generateRestModel' }.all { + doFirst { + it.codegenClasspath += files("${System.getProperty('java.home')}/../lib/tools.jar") + } + } +} + +task printVersionName { + doLast { + println project.version + } +} diff --git a/buildSrc/src/main/groovy/org/apache/gobblin/gradle/BuildProperties.groovy b/buildSrc/src/main/groovy/org/apache/gobblin/gradle/BuildProperties.groovy new file mode 100644 index 0000000..e051673 --- /dev/null +++ b/buildSrc/src/main/groovy/org/apache/gobblin/gradle/BuildProperties.groovy @@ -0,0 +1,50 @@ +package org.apache.gobblin.gradle; + +import java.util.TreeMap +import org.gradle.api.Project + +/** + * The manages the collection of all known build properties for the project. It is + * essentially a map from the property name to the BuildProperty object. + */ +public class BuildProperties extends TreeMap { + final Project project; + + public BuildProperties(Project project) { + super(); + this.project = project + } + + public BuildProperties register(BuildProperty prop) { + put(prop.name, prop); + return this; + } + + public void ensureDefined(String propName) { + if (! containsKey(propName)) { + throw new RuntimeException ("Property not defined: " + propName) + } + def defaultValue = get(propName).defaultValue + + // Special treatment for Boolean flags -- just specifying the property + // is treated as setting to true. + if (null != defaultValue && defaultValue instanceof Boolean && + !((Boolean)defaultValue).booleanValue()) { + this.project.ext.set(propName, this.project.hasProperty(propName)) + } + else if (! 
this.project.hasProperty(propName)) { + this.project.ext.set(propName, defaultValue) + } + + println String.format("Build property: %s=%s", propName, this.project.ext.get(propName)) + } + + public void printHelp() { + println "\n\n" + println "BUILD PROPERTIES" + println "" + this.each { propName, propHelp -> + println propHelp.getHelp() + } + } +} \ No newline at end of file diff --git a/buildSrc/src/main/groovy/org/apache/gobblin/gradle/BuildProperty.groovy b/buildSrc/src/main/groovy/org/apache/gobblin/gradle/BuildProperty.groovy new file mode 100644 index 0000000..24a8349 --- /dev/null +++ b/buildSrc/src/main/groovy/org/apache/gobblin/gradle/BuildProperty.groovy @@ -0,0 +1,23 @@ +package org.apache.gobblin.gradle; + +/** + * Encapsulates various aspects of a project property that can be used to customize the build through + * the gradle -P switch. + */ +public class BuildProperty { + private final String HELP_FORMAT = "\t%-20s - %s. Default: %s"; + + public final String name; + public final Object defaultValue; + public final String description; + + public BuildProperty(String name, Object defaultValue, String description) { + this.name = name; + this.defaultValue = defaultValue; + this.description = description; + } + + public String getHelp() { + return String.format(HELP_FORMAT, this.name, this.description, this.defaultValue) + } +} \ No newline at end of file diff --git a/config/checkstyle/checkstyle.xml b/config/checkstyle/checkstyle.xml new file mode 100644 index 0000000..a89898b --- /dev/null +++ b/config/checkstyle/checkstyle.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/config/checkstyle/suppressions.xml b/config/checkstyle/suppressions.xml new file mode 100644 index 0000000..bee4e86 --- /dev/null +++ b/config/checkstyle/suppressions.xml @@ -0,0 +1,10 @@ + + + + + + + \ No newline at end of file diff --git a/defaultEnvironment.gradle b/defaultEnvironment.gradle new file mode 100644 index 0000000..a84d256 --- /dev/null +++ b/defaultEnvironment.gradle @@ -0,0 +1,21 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +repositories { + mavenCentral() +} + +subprojects { + repositories { + mavenCentral() + maven { + url "https://repository.cloudera.com/artifactory/cloudera-repos/" + } + maven { + url "http://conjars.org/repo" + } + } + + project.buildDir = new File(project.rootProject.buildDir, project.name) +} diff --git a/dil/build.gradle b/dil/build.gradle new file mode 100644 index 0000000..49e9c34 --- /dev/null +++ b/dil/build.gradle @@ -0,0 +1,113 @@ +apply plugin: 'com.github.johnrengelman.shadow' +jacoco { + toolVersion = '0.8.0' +} + +// Exclude classes, packages from Jacoco report, which we don't need to add unit tests for. 
+jacocoTestReport { + afterEvaluate { + classDirectories = files(classDirectories.files.collect { + fileTree(dir: it, + exclude: ['com/linkedin/dil/factory']) + }) + } +} + +dependencies { + compile externalDependency.'gson' + compile externalDependency.'gobblin-core' + compile externalDependency.'gobblin-core-base' + compile externalDependency.'gobblin-crypto' + compile externalDependency.'gobblin-http' + compile externalDependency.'gobblin-runtime' + compile externalDependency.'gobblin-metadata' + compile externalDependency.'gobblin-metastore' + compile externalDependency.'awsCore' + compile externalDependency.'awsApacheHttp' + compile externalDependency.'awsHttpSpi' + compile externalDependency.'awsS3' + compile externalDependency.'awsUtils' + compile externalDependency.'commonsValidator' + compile externalDependency.'guava' + compile externalDependency.'lombok' + compile externalDependency.'commonsLang3' + compile externalDependency.'testng' + compile externalDependency.'okhttp' + + runtime externalDependency.'gobblin-azkaban' + runtime externalDependency.'gobblin-kafka-08' + runtime externalDependency.'gobblin-kafka-common' + runtime externalDependency.'gobblin-metrics' + runtime externalDependency.'gobblin-data-management' + runtime externalDependency.'gobblin-sql' + runtime externalDependency.gson + runtime externalDependency.'jodaTime' + + testCompile externalDependency.'jmockit' + testCompile externalDependency.'powermock-core' + testCompile externalDependency.'powermock-api-mockito' + testCompile externalDependency.'powermock-module-testng' + testCompile externalDependency.'mockito-core' + +// compile externalDependency.'slf4j-api' +// runtime externalDependency.'commons-csv' +// runtime externalDependency.config +// runtime externalDependency.'pegasus-data' +// runtime externalDependency.reflections +// testCompile externalDependency.hsqldb +} + +// Exclude dependencies that are already on the grid from the Hadoop zip +configurations { + + // dependencies not needed from compile, unit testing, nor production will + // be excluded from the package + all*.exclude group: 'com.linkedin.hadoop' + all*.exclude group: 'com.linkedin.hive' + all*.exclude group: 'com.linkedin.pig' + all*.exclude group: 'com.linkedin.azkaban' + all*.exclude group: 'com.linkedin.azkaban-plugins' + all*.exclude group: 'com.linkedin.spark' + all*.exclude group: 'kafka' + all*.exclude group: 'org.apache.kafka' + all*.exclude group: 'org.apache.hive' + all*.exclude group: 'ch.qos.logback' + all*.exclude group: 'com.ibm.icu', module: 'icu4j' +} + +shadowJar { + zip64 true + dependencies { + include dependency('com.linkedin.dil:.*') + include dependency('com.squareup.retrofit:.*') + include dependency('com.squareup.okhttp:.*') + include dependency('com.squareup.okhttp3:.*') + include dependency('com.squareup.okio:.*') + include dependency('org.testng:.*') + } + + exclude 'log4j.xml' + exclude 'log4j.properties' + exclude 'org/apache/log4j/**' + exclude 'org/apache/spark/**' + exclude 'org/slf4j/**' + exclude 'javax/ws/rs/**' + + relocate('com.squareup', 'gobblin.shaded.com.squareup') + relocate('retrofit', 'gobblin.shaded.retrofit') + relocate('okhttp3', 'gobblin.shaded.okhttp3') + relocate('okio', 'gobblin.shaded.okio') + relocate('org.testng', 'gobblin.shaded.org.testng') + mergeServiceFiles() +} + +artifacts { + archives shadowJar +} + +test { + doFirst { + def jmockitPath = configurations.getByName("testCompile").copy().files.find {it.name.contains("jmockit") }.absolutePath + jvmArgs "-javaagent:$jmockitPath" 
+ } +} diff --git a/dil/src/main/java/com/linkedin/dil/configuration/MultistageProperties.java b/dil/src/main/java/com/linkedin/dil/configuration/MultistageProperties.java new file mode 100644 index 0000000..266fb15 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/configuration/MultistageProperties.java @@ -0,0 +1,985 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.configuration; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import lombok.Getter; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang.StringUtils; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.factory.DefaultS3ClientFactory; + + +/** + * Each item is associated with a Java class so that its actual configuration value + * can be validated.
+ * + * The getProp() function will return values of the specified type, with default values. getProp + * can raise exceptions if the configured value mismatches the specified type, for example, if a + * string is configured for an Integer property, or an incorrect string is configured for + * a JsonObject property.
+ * + * The getValidNonblankWithDefault() function will always return a valid value; if the configured + * value is not valid, a default value will be returned. + * + * The getMillis() function is mostly used to convert days, seconds, and other configured time values + * to milliseconds. This function may not be meaningful for some properties; the default implementation + * will return 0L. + * + * Each item can define functions like:
+ * - value validation, by implementing the validate() function for each property
+ * - conversion functions applicable to specific property types, like days to millis
+ * - provide default values, like default status codes
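As a rough usage sketch of these accessors (illustrative only; the class name, the State setup, and the chosen properties below are assumptions for this example, not part of the patch):

    import org.apache.gobblin.configuration.State;
    import com.linkedin.dil.configuration.MultistageProperties;

    public class MultistagePropertiesSketch {
      public static void main(String[] args) {
        State state = new State();
        // hypothetical job settings
        state.setProp("ms.call.interval.millis", "2000");
        state.setProp("ms.grace.period.days", "2");

        // getProp returns the typed value; unset numeric properties fall back to 0
        Long interval = MultistageProperties.MSTAGE_CALL_INTERVAL.getProp(state);

        // getValidNonblankWithDefault falls back to getDefaultValue() when the value is
        // blank or invalid, so an unset ms.encoding still yields "UTF-8"
        String encoding = MultistageProperties.MSTAGE_ENCODING.getValidNonblankWithDefault(state);

        // getMillis converts day-based settings to milliseconds (2 days -> 172,800,000)
        Long graceMillis = MultistageProperties.MSTAGE_GRACE_PERIOD_DAYS.getMillis(state);

        System.out.println(interval + " " + encoding + " " + graceMillis);
      }
    }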
+ * + * @author chrli + */ +@Slf4j +@Getter +@SuppressWarnings("unchecked") +public enum MultistageProperties { + /** + * Abstinent Period is designed to avoid re-extract a dataset repeatedly. This is particular useful + * for situations like downloading files in large quantity. + * + * Assuming we will control all data extraction through a time range, including file downloads. Further + * assuming that files were all uploaded to source on 6/30, and assuming that we can only download 100 files + * a day, and there are 1000 files. Files downloaded on 7/1 will be downloaded again on 7/2 because + * their cut off time is 7/1, which is before the new extraction time. + * + * An abstinent period 30 is thus added to the last download/extract time, allowing us move the cutoff time forward. + * Therefore, if there is an abstinent period of 30 days, the downloaded files will not be downloaded + * again in 30 days. + * + * Abstinent period can be set to a large number so that the same file will never be downloaded again. + */ + MSTAGE_ABSTINENT_PERIOD_DAYS("ms.abstinent.period.days", Integer.class) { + @Override + public Long getMillis(State state) { + return 24L * 3600L * 1000L * (Integer) this.getProp(state); + } + }, + /** + * activation.property is a row object in JsonObject string format, the activation + * property can container key value pairs to be used as filters or URI parameters + * in extractor. + * + */ + MSTAGE_ACTIVATION_PROPERTY("ms.activation.property", JsonObject.class) { + // this property is normally set from the Source and used in Extractor + // therefore, we use Work Unit State to retrieve the value, and using + // Source State to retrieve the value will always be blank. + @Override + public boolean validate(State state) { + try { + JsonObject activation = getProp(state); + return activation == null || activation.entrySet().size() >= 0; + } catch (Exception e) { + return false; + } + } + }, + MSTAGE_AUTHENTICATION("ms.authentication", JsonObject.class) { + @Override + // accepts only validly formed values + public boolean validateNonblank(State state) { + try { + JsonObject auth = getProp(state); + return auth.entrySet().size() > 0 && auth.has("method") && auth.has("encryption"); + } catch (Exception e) { + return false; + } + } + }, + /** + * to do back fill, set ms.backfill=true, and also set the watermark + * any value other than true indicates a normal load. + * + */ + MSTAGE_BACKFILL("ms.backfill", Boolean.class) { + @Override + public T getDefaultValue() { + return (T) Boolean.FALSE; + } + }, + /** + * call.interval is used in pagination and waiting/looping + * + * when used in pagination, call.interval specify how long the client should wait + * before submit a new page request. + * + * when used in waiting/looping, call.interval specify the waiting period between + * calls. + * + * this value is in milliseconds. 
+ */ + MSTAGE_CALL_INTERVAL("ms.call.interval.millis", Long.class), + MSTAGE_CSV_COLUMN_HEADER("ms.csv.column.header", Boolean.class), + /** + * a comma-separated string, where each value is either an integer or a range + * representing the index of the field to include + * Valid values include [0 based indexing]: + * 0,1,2,3,4 + * 0,1,2,4-15 + * 0,1,3-7,10 + * 0,5,3-4,2 + * + * Note: the values need not to be ordered + */ + MSTAGE_CSV_COLUMN_PROJECTION("ms.csv.column.projection", String.class) { + @Override + // accepts only validly formed values + public boolean validateNonblank(State state) { + String columnProjections = getProp(state); + return columnProjections != null && columnProjections.split(",").length > 0; + } + }, + MSTAGE_CSV_ESCAPE_CHARACTER("ms.csv.escape.character", String.class) { + @Override + public T getDefaultValue() { + return (T) "u005C"; + } + }, + MSTAGE_CSV_QUOTE_CHARACTER("ms.csv.quote.character", String.class) { + @Override + public T getDefaultValue() { + return (T) "\""; + } + }, + MSTAGE_CSV_SEPARATOR("ms.csv.separator", String.class) { + @Override + public T getDefaultValue() { + return (T) ","; + } + }, + /** + * if csv.column.header is true, csv.skip.lines will be 1 by default, if more than 1 + * row to be skipped, then set this parameter explicitly. + * + * if csv.column.header is false, csv.skip.lines will be 0 by default, if there are + * rows to be skipped, then set this parameter explicitly. + */ + MSTAGE_CSV_SKIP_LINES("ms.csv.skip.lines", Integer.class), + MSTAGE_DATA_EXPLICIT_EOF("ms.data.explicit.eof", Boolean.class) { + @Override + public T getDefaultValue() { + return (T) Boolean.FALSE; + } + }, + MSTAGE_DATA_DEFAULT_TYPE("ms.data.default.type", JsonObject.class), + /** + * DATA_FIELD specified where true data payload is in a nested structure. + * If data.field is not specified or it is blank, then the whole response will be + * treated as the payload. If data.field is not present in the response, + * it will generate an error. + */ + MSTAGE_DATA_FIELD("ms.data.field", String.class), + /** + * derived.fields is an array of field definitions + * + * each field definition will have "name", "type", "source", and "format" + * + * Example 1: following define a derived field using regular expression to subtract part of a source field + * [{ + * "name": "surveyid", + * "formula": { + * "type": "regexp", + * "source": "survey_url", + * "format": "https.*\\/surveys\\/([0-9]+)$" + * } + * }] + * + * Example 2: following define a epoc timestamp field to meet Lumos requirement + * [{ + * "name": "callDate", + * "formula": { + * "type": "epoc", + * "source": "started", + * "format": "yyyy-MM-dd" + * } + * }] + * + */ + MSTAGE_DERIVED_FIELDS("ms.derived.fields", JsonArray.class) { + @Override + // accepts blank and validly formed values, only rejects badly formed values + public boolean validate(State state) { + JsonArray derivedFields = getProp(state); + return derivedFields == null + || derivedFields.size() == 0 + || validateNonblank(state); + } + @Override + // accepts only validly formed values + public boolean validateNonblank(State state) { + JsonArray derivedFields = getProp(state); + return derivedFields != null + && derivedFields.size() > 0 + && derivedFields.get(0).getAsJsonObject().has("name") + && derivedFields.get(0).getAsJsonObject().has("formula"); + } + }, + /** + * In this job property you can specify the fields (array of fields) which needs to be encrypted by the Gobblin + * utility. 
+ * These fields can be of JsonPrimitive type (string/int/boolean/etc.) or JsonObject type (with nested structure) + * e.g. "ms.encryption.fields" : ["emailAddress", "settings.webConferencesRecorded"] + */ + MSTAGE_ENCRYPTION_FIELDS("ms.encryption.fields", JsonArray.class), + /** + * Limited cleansing include tasks such as standardizing element name and + * replacing null values with default ones or dummy values + * + * Limited cleansing can also be used to replace certain elements in Json data. + * Currently white spaces and $ in Json element names will be replaced with _ if + * cleansing is enabled. + * + * Default: true + * + * Default value is used when this parameter is blank. + * + * This feature should be used only on need basis in large datasets where cleansing is expensive, + * for example, where source data element names are un-conforming, such as containing spaces, + * and needed standardization. + * + */ + MSTAGE_ENABLE_CLEANSING("ms.enable.cleansing", Boolean.class) { + @Override + public T getDefaultValue() { + return (T) Boolean.TRUE; + } + }, + + /** + * Dynamic full load will ignore extract.is.full setting and set extract.is.full based on following + * condidtions: + * 1. true if it is SNAPSHOT_ONLY extract + * 2. true if there is no pre-existing watermarks of the job + * + * To observe the extract.is.full setting, disable dynamic full load + */ + MSTAGE_ENABLE_DYNAMIC_FULL_LOAD("ms.enable.dynamic.full.load", Boolean.class) { + @Override + public T getDefaultValue() { + return (T) Boolean.TRUE; + } + }, + /** + * each Extractor will enforce a compliance filter based on given schema, currently this is + * soft enforced. Use case can turn the filter off by setting this parameter to false + */ + MSTAGE_ENABLE_SCHEMA_BASED_FILTERING("ms.enable.schema.based.filtering", Boolean.class) { + @Override + public T getDefaultValue() { + return (T) Boolean.TRUE; + } + }, + MSTAGE_ENCODING("ms.encoding", String.class) { + @Override + public T getDefaultValue() { + return (T) "UTF-8"; + } + }, + /** + * extract.preprocessors define one or more preprocessor classes, see + * {@link com.linkedin.dil.preprocessor.GunzipProcessor} + */ + MSTAGE_EXTRACT_PREPROCESSORS("ms.extract.preprocessors", String.class), + /** + * Parameters to pass into the preprocessor along with the input. + * e.g, If a source file is encrypted, it requires additional credentials to decrypt + * For GPG based decryption/encryption, parameters follow {@link org.apache.gobblin.crypto.EncryptionConfigParser} + * A sample parameter map: + * { + * "action" : string, decrypt/encrypt + * "keystore_password" : string, some password, + * "keystore_path" : string, path to the secret keyring, + * "cipher" : string, optional, cipher algorithm to use, default to CAST5 (128 bit key, as per RFC 2144) + * "key_name" : string, optional, the key id, a long value, of the public Gpg key as a Hex string + * } + */ + MSTAGE_EXTRACT_PREPROCESSORS_PARAMETERS("ms.extract.preprocessor.parameters", JsonObject.class), + + MSTAGE_EXTRACTOR_CLASS("ms.extractor.class", String.class), + + MSTAGE_EXTRACTOR_TARGET_FILE_NAME("ms.extractor.target.file.name", String.class), + //use this property for file dump extractor to save file with specific permission. + MSTAGE_EXTRACTOR_TARGET_FILE_PERMISSION("ms.extractor.target.file.permission", String.class) { + @Override + public T getValidNonblankWithDefault(State state) { + return (T) ((validateNonblank(state)) + ? 
((String) getProp(state)).toUpperCase() : "755"); + } + }, + /** + * Grace Period is for overlapped data extraction, it assumes that the source can have late comers, + * which are older data that showed up in source after last extract. For example, a record was modified + * 2 days ago, but did not show up until today. In such case, if we extract based records' last + * update date, the last extraction would have missed that record, amd today's extraction will + * again miss it if we cut off by last extraction time (yesterday). + * + * A grace period is thus subtracted from the last extraction time, allowing us move the cut off + * time backward. Therefore, if there is grace period of 2 days, it will capture data arrived 2 days + * late in source. + */ + MSTAGE_GRACE_PERIOD_DAYS("ms.grace.period.days", Integer.class) { + @Override + public Long getMillis(State state) { + return 24L * 3600L * 1000L * (Integer) this.getProp(state); + } + }, + /** + * http.client.factory define an indirect way to specify the type of HttpClient to use. + * default = {@link com.linkedin.dil.factory.ApacheHttpClientFactory} + */ + MSTAGE_HTTP_CLIENT_FACTORY("ms.http.client.factory", String.class) { + @Override + public T getDefaultValue() { + return (T) "com.linkedin.dil.factory.ApacheHttpClientFactory"; + } + }, + /** + * custom headers include Content-Type are to be included in this property + */ + MSTAGE_HTTP_REQUEST_HEADERS("ms.http.request.headers", JsonObject.class), + MSTAGE_HTTP_REQUEST_METHOD("ms.http.request.method", String.class), + /** + * use this property to set custom response type explicitly. You can also have multiple accepted types for + * multi-stage jobs + * ex: {"Content-Type":"application/x-gzip","Content-Type1":"application/json"} + */ + MSTAGE_HTTP_RESPONSE_TYPE("ms.http.response.type", JsonObject.class), + /** + * http.statuses defines success codes and warnings, and optionally errors. + * By default, if this parameter is not set, 200 (OK), 201 (CREATED), and 202 (ACCEPTED) + * will be treated as success; anything else below 400 will be treated as warning; and + * anything 400 and above will be treated as error. Warnings will be logged but will not + * cause job failure. Errors will cause job failure. + * + * In cases where 4xx codes, like 404 (NOT FOUND), happened frequently, and a failure is + * not desirable, exceptions can be added to warnings. + * + * In following configuration, we make 404 an warning, and make 206 a failure indicating + * that partial content is not acceptable: + * {"success": [200], "warning": [404], "error": [206]} + */ + MSTAGE_HTTP_STATUSES("ms.http.statuses", JsonObject.class) { + @Override + public T getDefaultValue() { + return (T) GSON.fromJson("{\"success\":[200,201,202], \"pagination_error\":[401]}", JsonObject.class); + } + }, + /** + * http.status.reasons define reason codes (strings) that have special meaning in determining + * whether a request was a success or failure. + * + * for example, when status is 200, but there is a reason to indicate the request was not successful, + * then the status.reason can be set: + * {"error": ["not found"]} + * + * An Http response is considered success if and only if: + * - status code in http.statuses.success + * - reason code not in http.status.reasons.error + * + * Currently, we don't allow exceptions being made to revert errors by using reason code. 
+ */ + MSTAGE_HTTP_STATUS_REASONS("ms.http.status.reasons", JsonObject.class), + /** + * jdbc.client.factory define an indirect way to specify the type of JDBC Client to use. + * default = {@link com.linkedin.dil.factory.DefaultJdbcClientFactory} + */ + MSTAGE_JDBC_CLIENT_FACTORY("ms.jdbc.client.factory", String.class) { + @Override + public T getDefaultValue() { + return (T) "com.linkedin.dil.factory.DefaultJdbcClientFactory"; + } + }, + + MSTAGE_JDBC_SCHEMA_REFACTOR("ms.jdbc.schema.refactor", String.class) { + @Override + public T getDefaultValue() { + return (T) "none"; + } + }, + /** + * JDBC statement is the query to be executed for data extraction, usually a SELECT + * statement or a store procedure. DIL doesn't explicitly restrict or support syntax + * of the statement. The source database decides whether to accept or fail the statement. + */ + MSTAGE_JDBC_STATEMENT("ms.jdbc.statement", String.class), + MSTAGE_OUTPUT_SCHEMA("ms.output.schema", JsonArray.class), + /** + * pagination is a Json object with 2 members: + * + * fields: is an array of up to 3 string elements, each denote a source key column for: + * 1. page start, or offset + * 2. page size, or limit of each page + * 3. page no, if page no is used to control instead of using page start and page size + * + * initialvalues: is an array of up to 3 integer elements, each denote a initial value for: + * 1. page start, or offset + * 2. pagesize, or limit of each page + * 3. page no, if page no is used to control instead of using page start and page size + */ + MSTAGE_PAGINATION("ms.pagination", JsonObject.class), + /** + * ms.parameter holds a list of parameters in the form of a JsonArray. + * + * Parameters are named, i.e. the name will be referenced in other places. + * + * Parameters will have either static values or dynamic values derived from a formula. + * + * Terminology: in following description, we call parameters used in URI as URI Parameters. + * + * For HTTP GET requests, parameters will be used to form the final URI. In such case, + * the parameter can be used in the URI path or as URI parameter. + * + * When used in URI path, the parameter name need to be specified in URI template + * as a variable contained in {{}}. + * + * When used as URI parameters, the parameter name and derived value will be coded as + * KV pairs; therefore, the parameter name need to be acceptable to source. + * + * For example, if a source accepts URI like http://domainname/endpoint?cursor=xxxx, + * and the "cursor" parameter is optional, then the parameter should be named as + * "cursor", and the URI template should be set as http://domainname/endpoint in pull file. + * In such case, a "?cursor=xxx" will be appended to the final URI when cursor is + * present. + * + * However, if the cursor URI parameter is not optional, the URI template could be coded as + * http://domain/endpoint?cursor={{p1}}, then the parameter can be named as "p1", and the + * parameter value will replace {{p1}} before the request is sent to the URI source. + * + * Examples of setting parameters in pull files: + * + * For one case, the URI needs 3 mandatory variables, and they can be named as p1, p2, and p3. 
+ * And we can configure the pull file as following: + * + * ms.uri=https://domain.com/api/bulk/2.0/syncs/{{p1}}/data?offset={{p2}}&limit={{p3}} + * ms.parameter=[ + * {"name": "p1", "type": "list", "value": "3837498"}, + * {"name": "p2", "type": "pagestart"}, + * {"name": "p3", "type": "pagesize"}] + * + * + * For another case, the URI needs 1 optional variable, and the parameter has to be named as + * required by the source. And here is the configuration: + * + * ms.uri=https://domain.com/users + * ms.parameter=[{"name":"cursor","type":"session"}] + * + * For HTTP POST and HTTP PUT requests, the parameter name will be used as-is in the form of + * "parameter name": "parameter value" in the request body; therefore, the parameter name + * need to be as required by to URI source. + * + */ + + MSTAGE_PARAMETERS("ms.parameters", JsonArray.class) { + @Override + public boolean validateNonblank(State state) { + try { + return ((JsonArray) getProp(state)).size() > 0; + } catch (Exception e) { + return false; + } + } + }, + /** + * Payloads are a type of secondary input. Where activations generates + * work units, payloads don't. In planning phase, payloads are passed + * as is to work units without actually reading the content. Payload + * content are only being read when the transmission starts. The payload + * property, therefore, contains secondary entries, in a JsonArray form. + * And the array contains records of secondary inputs, which include + * path and fields, etc. + */ + MSTAGE_PAYLOAD_PROPERTY("ms.payload.property", JsonArray.class), + MSTAGE_RETENTION("ms.retention", JsonObject.class) { + @Override + public T getDefaultValue() { + JsonObject retention = new JsonObject(); + retention.addProperty("state.store", "P90D"); // keep 90 days state store by default + retention.addProperty("publish.dir", "P731D"); // keep 2 years published data + retention.addProperty("log", "P30D"); + return (T) retention; + } + }, + /** + * s3.client.factory define an indirect way to specify the type of S3 Client to use. + * default = {@link DefaultS3ClientFactory} + */ + MSTAGE_S3_CLIENT_FACTORY("ms.s3.client.factory", String.class) { + @Override + public T getDefaultValue() { + return (T) "com.linkedin.dil.factory.DefaultS3ClientFactory"; + } + }, + /** + * Schema cleansing will replace special characters in the schema element names based + * on a pattern. By default it will replace all blank spaces, $, and @ to underscores. + * + * Schema cleansing parameter is a JsonObject, and it supports the following elements: + * enabled : true|false + * pattern: if enabled, it has default value "(\\s|\\$|@)" + * replacement: if enabled, it has default value "_" + * nullable: it has default value "false", allowed values are: + * 1. true: all fields will be forced to be nullable + * 2. false: the schema infer will try to detect nullability from samples + * + * This configuration has no impact on schema from metadata stores. + */ + MSTAGE_SCHEMA_CLENSING("ms.schema.cleansing", JsonObject.class), + /** + * This property is used to set:
+ * 1. the location from which the HDFS data will be loaded as secondary data for the + * subsequent API call
+ * 2. define the field names that need to be extracted and added into the work units. + * 3. define filters on one or more fields based on the following rules + * a. if multiple fields are filtered, the relationship is AND, that means all conditions must be met + * b. if a filter is defined on a field, and the field value is NULL, the record is rejected + * c. if a filter is defined on a field, and the field value is not NULL, the record will be rejected if + * its value doesn't match the pattern + * d. if no filter is defined on a field, the default filter ".*" is applied to the field, and NULL values + * are accepted + * 4. define the category of the input, currently we allow these categories: + * a. activation, that means the secondary input is for creating work units + * b. authentication, that means the secondary input provides authentication information + *
+ * + * Example : + * + * ms.secondary.input=[{ + * "path": "/path/to/hdfs/inputFileDir/2019/08/07/19/720", + * "fields": ["id", "tempId"], + * "filters": {"status": "(OK|Success)", "field2": "pattern2"}, + * "category" "activation" + * }] + * + * The gobblin job will read records from that location and extract the two fields and inject it into the work units. + */ + MSTAGE_SECONDARY_INPUT("ms.secondary.input", JsonArray.class) { + @Override + public boolean validate(State state) { + return getProp(state) != null; + } + }, + /** + * session.key.field specifies the key field for session and the condition for termination. + * Although Restful API is stateless, data sources can maintain a session in backend + * by a status field, a session cursor, or through pagination (see comments on PAGINATION). + * + * it takes the form a Json object with a "name", "condition", and "failCondition". + * - the name specifies the field in response that gives session info. + * - the condition, specifies when the session should stop. + * - the failCondition specifies when the session should fail. + * + * A conditions can be a regular expression or a formula. Currently, only regular expression is supported. + * + * "name" is required + * "condition" and "failCondition" are optional + * "condition" takes precedence over "failCondition" + * + * When both session and pagination are enabled, the extractor will keep consuming data from + * source until all pages are extracted. Then the extractor will check the status until + * the stop condition is met. + * + * In that regard, when the source give conflicting signal in turns of total expected rows + * and status, the data can have duplicate, and actual extracted rows in log file should + * show more rows extracted than expected. + * + */ + MSTAGE_SESSION_KEY_FIELD("ms.session.key.field", JsonObject.class), + /** + * Default source data character set is UTF-8, which should be good for most use cases. + * See StandardCharsets for other common names, such as UTF-16 + */ + MSTAGE_SOURCE_DATA_CHARACTER_SET("ms.source.data.character.set", String.class) { + @Override + public T getDefaultValue() { + return (T) "UTF-8"; + } + }, + MSTAGE_SOURCE_FILES_PATTERN("ms.source.files.pattern", String.class) { + @Override + public T getDefaultValue() { + return (T) ".*"; + } + }, + + /** + * Parameters specific to the S3 source. + * { + * "region": string, aws region code: https://docs.aws.amazon.com/general/latest/gr/rande.html + * "read_timeout_seconds", integer, read time out in seconds + * "write_timeout_seconds", integer, write time out in seconds + * "connection_timeout_seconds": Sets the socket to timeout after failing to establish a connection with the server after milliseconds. + * "connection_max_idle_millis", Sets the socket to timeout after timeout milliseconds of inactivity on the socket. + * } + */ + MSTAGE_SOURCE_S3_PARAMETERS("ms.source.s3.parameters", JsonObject.class), + /** + * Source schema is a URN string of the following forms: + * 1. a dataset URN, if the source schema can be represented by a dataset. + * For example: urn:li:dataset:(urn:li:dataPlatform:hive,rightnow.incidents,PROD) + * In such case, the latest schema of the dataset will be read from metadata store, + * and then parsed to retrieve fields and types, etc + * 2. a registered schema, if the source schema is registered with metadata store in + * the form of either a pegasus (PDL) or GraphQL schema. + * For example: TODO + * 3. 
other forms TODO + */ + MSTAGE_SOURCE_SCHEMA_URN("ms.source.schema.urn", String.class), + /** + * Define a factory parameter to decouple proprietary code from OpenSource code + */ + MSTAGE_SOURCE_SCHEMA_READER_FACTORY("ms.source.schema.reader.factory", String.class) { + @Override + public T getDefaultValue() { + return (T) "com.linkedin.dil.factory.reader.MetastoreReader"; + } + }, + /** + * ms.source.uri defines a data source identifier, it follows the URI format + * here: https://en.wikipedia.org/wiki/Uniform_Resource_Identifier + * + * The only exception is that authority is not supported, because all authority + * cannot be fit in the URI. + * + * source.uri also accepts variables that allow substitution in runtime + */ + MSTAGE_SOURCE_URI("ms.source.uri", String.class), + + // TODO: Merge back to @link{MSTAGE_SOURCE_S3_PARAMETERS} + MSTAGE_S3_LIST_MAX_KEYS("ms.s3.list.max.keys", Integer.class) { + @Override + public T getDefaultValue() { + return (T) Integer.valueOf(1000); + } + }, + /** + * Target schema denotes the schema to be passed to writer, this applies + * to situation where the source data are transformed through a converter + * or other processes. + * + * Target schema should be specified through target schema URN. + * An URN can point to the schema storage location on DataHub, which is + * the only supported schema storage for now. + * + * If target schema is specified through ms.target.schema, then the syntax + * of the schema string is same as ms.output.schema. + */ + MSTAGE_TARGET_SCHEMA("ms.target.schema", JsonArray.class), + MSTAGE_TARGET_SCHEMA_URN("ms.target.schema.urn", String.class), + MSTAGE_TARGET_SCHEMA_READER_FACTORY("ms.target.schema.reader.factory", String.class) { + @Override + public T getDefaultValue() { + return (T) "com.linkedin.dil.factory.reader.MetastoreReader"; + } + }, + /** + * Total count field is a Json path. This attribute can be used in many + * types of connectors, typically with Json Extractor + * + * If response is like { "records": { "totalRecords": 10000, "pagesize": 100, "currentpage": 0}}, + * the configurations should be: ms.totalcount.field=records.totalRecords + * + */ + MSTAGE_TOTAL_COUNT_FIELD("ms.total.count.field", String.class) { + @Override + public boolean validateNonblank(State state) { + String tcField = getProp(state); + return StringUtils.isNotBlank(tcField); + } + }, + /** + * If there is not total expected row count, the session will keep looping and waiting + * until either the session completion condition is met or time out. + * + * wait.timeout control how long the job will wait before the session completion status + * is met. + * + * default is 10 minutes or 600 seconds + * + * see also call.interval + * + */ + + MSTAGE_WAIT_TIMEOUT_SECONDS("ms.wait.timeout.seconds", Long.class) { + @Override + public Long getMillis(State state) { + return 1000L * (Long) this.getValidNonblankWithDefault(state); + } + + @Override + public T getDefaultValue() { + return (T) Long.valueOf(600); + } + }, + /** + * ms.watermark holds a list of watermark ranges in the form of a JsonArray. + * A watermark property is a JsonObject with name, type, and range. + * For now, only datetime and unit type watermark are supported. + * + * For datetime watermark, a range has "from" and "to" values. + * They have to be in "yyyy-MM-dd" format; however "to" can be just "-" to present current date. 
+ * + * For example: + * + * ms.watermark=[{"name": "system","type": "datetime","range": {"from": "2019-01-01", "to": "-"}}] + * + */ + MSTAGE_WATERMARK("ms.watermark", JsonArray.class), + MSTAGE_WATERMARK_GROUPS("ms.watermark.groups", JsonArray.class), + MSTAGE_WORK_UNIT_PARALLELISM_MAX("ms.work.unit.parallelism.max", Integer.class) { + @Override + public boolean validateNonblank(State state) { + Integer parallelMax = getProp(state); + return parallelMax > 0; + } + }, + /** + * Work unit partitioning scheme is either a string or a JsonObject. + * + * When it is a string, it will accept values like monthly, weekly, daily, hourly, or none, + * which can be blank or literally "none". + * + * When it is a JsonObject, there can be multiple ways to partition, either with a range. For + * example, following will break 2010-2019 by monthly partitions, and daily partitions afterwards. + * + * {"monthly": ["2010-01-01", "2020-01-01"], "daily": ["2020-01-01": "-"]} + * + * In such case, the partition is called composite. For the composite partition to work, + * the ranges should be continuous with no gaps or overlaps. In order to avoid gaps and overlaps, + * one range end should be the same as another range's start. + * + * Note the end of partition accepts "-" as current date, but it doesn't access PxD syntax, the + * reason being a partition range can be broader than watermark range. + * + * For a composite partition, if the range definition is not as specified, or not valid, then the there + * is no partitioning, equivalent to ms.work.unit.partition='' + * + * For a composite partition, a range is matched against watermark to define partitions, if a range + * is smaller than full partition range, for example {"monthly": ["2020-01-01", "2020-01-18"]}, + * it will still generate a full partition. So to avoid confusion, the range should be at minimum + * 1 partition size. That means, a range should at least 1 month for monthly, or at least 1 week for + * etc. 
+ * + */ + MSTAGE_WORK_UNIT_PARTITION("ms.work.unit.partition", String.class) { + @Override + public T getDefaultValue() { + return (T) "none"; + } + }, + MSTAGE_WORK_UNIT_PARTIAL_PARTITION("ms.work.unit.partial.partition", Boolean.class) { + @Override + public T getDefaultValue() { + return (T) Boolean.TRUE; + } + }, + MSTAGE_WORK_UNIT_PACING_SECONDS("ms.work.unit.pacing.seconds", Integer.class) { + @Override + public Long getMillis(State state) { + return 1000L * (Integer) this.getProp(state); + } + }, + // this is an internal property, its purpose is to pass value between Source and Extractor + MSTAGE_WORKUNIT_STARTTIME_KEY("ms.work.unit.scheduling.starttime", Long.class) { + @Override + public T getDefaultValue() { + return (T) Long.valueOf(0L); + } + }, + MSTAGE_NORMALIZER_BATCH_SIZE("ms.normalizer.batch.size", Long.class) { + @Override + public T getDefaultValue() { + return (T) Long.valueOf(500L); + } + }, + // Properties defined in Gobblin, redefine here to leverage the new features like validation + CONVERTER_CLASSES("converter.classes", String.class), + DATASET_URN_KEY("dataset.urn", String.class), + ENCRYPT_KEY_LOC("encrypt.key.loc", String.class), + EXTRACTOR_CLASSES("extractor.class", String.class), + + // add a default value of FALSE to Gobblin configuration extract.is.full + EXTRACT_IS_FULL("extract.is.full", Boolean.class) { + public T getDefaultValue() { + return (T) Boolean.FALSE; + } + }, + EXTRACT_NAMESPACE_NAME_KEY("extract.namespace", String.class), + EXTRACT_TABLE_NAME_KEY("extract.table.name", String.class), + EXTRACT_TABLE_TYPE_KEY("extract.table.type", String.class) { + @Override + public T getValidNonblankWithDefault(State state) { + return (T) ((validateNonblank(state)) + ? ((String) getProp(state)).toUpperCase() : "SNAPSHOT_ONLY"); + } + }, + SOURCE_CLASS("source.class", String.class), + SOURCE_CONN_USERNAME("source.conn.username", String.class), + SOURCE_CONN_PASSWORD("source.conn.password", String.class), + SOURCE_CONN_USE_PROXY_URL("source.conn.use.proxy.url", String.class), + SOURCE_CONN_USE_PROXY_PORT("source.conn.use.proxy.port", String.class), + STATE_STORE_ENABLED("state.store.enabled", Boolean.class) { + @Override + public T getDefaultValue() { + return (T) Boolean.TRUE; + } + }, + DATA_PUBLISHER_FINAL_DIR("data.publisher.final.dir", String.class); + + final static private Gson GSON = new Gson(); + final static private String PROPERTY_SEPARATOR = "."; + + private final String config; + private final Class className; + private final Object defaultValue; + + MultistageProperties(String config, Class className) { + this.config = config; + this.className = className; + this.defaultValue = null; + } + + @Override + public String toString() { + assert config != null; + return config; + } + + /** + * validate accepts blank entry and validates the value when it is non-blank + *
+ * This version serves those Source properties + *
+ * @param state source state + * @return true when the configuration is blank or non-blank and valid + */ + public boolean validate(State state) { + return true; + } + + /** + * validateNonblank rejects blank entries and only validates the value when it is non-blank + *
+ * This version serves those Source properties; the duplication is not ideal, and we could + * make this better by defining the getProp methods on State, instead of on WorkUnitState and + * SourceState separately in Gobblin core. + *
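A minimal sketch of the difference between validate() and validateNonblank(), assuming an empty State and using ms.activation.property purely as an illustrative example (the class name here is hypothetical):

    import org.apache.gobblin.configuration.State;
    import com.linkedin.dil.configuration.MultistageProperties;

    public class ValidationSketch {
      public static void main(String[] args) {
        // ms.activation.property is deliberately left blank
        State state = new State();
        // validate() accepts the blank entry -> prints true
        System.out.println(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.validate(state));
        // validateNonblank() rejects the blank entry -> prints false
        System.out.println(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.validateNonblank(state));
      }
    }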
+ * @param state source state + * @return true when the configuration is non-blank and valid + */ + public boolean validateNonblank(State state) { + try { + if (className == JsonArray.class) { + return ((JsonArray) getProp(state)).size() > 0; + } + + if (className == JsonObject.class) { + return ((JsonObject) getProp(state)).entrySet().size() > 0; + } + + if (className == Boolean.class) { + // cannot call getPropAsBoolean to tell if a configuration exists + // as FALSE will be return from empty configuration + String prop = state.getProp(config, StringUtils.EMPTY); + return StringUtils.isNotBlank(prop) + && (prop.equalsIgnoreCase("true") || prop.equalsIgnoreCase("false")); + } + + if (className == String.class) { + return StringUtils.isNotEmpty(getProp(state)); + } + + if (className == Integer.class) { + return (Integer) getProp(state) != 0; + } + + if (className == Long.class) { + return (Long) getProp(state) != 0; + } + } catch (Exception e) { + return false; + } + return true; + } + + public Long getMillis(State state) { + return 0L; + } + + /** + * get property value for a specified MultistageProperties item + *
+ * This version serves those Source properties; the duplication is not ideal, and we could + * make this better by defining the getProp methods on State, instead of on WorkUnitState and + * SourceState separately in Gobblin core. + *
+ * @param state the source or work unit state + * @param the template class type + * @return the property value of the specific MultistageProperties item + */ + public T getProp(State state) { + if (className == Boolean.class) { + return (T) Boolean.valueOf(state.getPropAsBoolean(config)); + } else if (className == Integer.class) { + return (T) Integer.valueOf(state.getPropAsInt(config, 0)); + } else if (className == Long.class) { + return (T) Long.valueOf(state.getPropAsLong(config, 0L)); + } else if (className == String.class) { + return (T) state.getProp(config, StringUtils.EMPTY); + } else if (className == JsonArray.class) { + return (T) GSON.fromJson(state.getProp(config, new JsonArray().toString()), JsonArray.class); + } else if (className == JsonObject.class) { + return (T) GSON.fromJson(state.getProp(config, new JsonObject().toString()), JsonObject.class); + } + return null; + } + + public T getValidNonblankWithDefault(State state) { + if (className == JsonArray.class) { + return (T) (validateNonblank(state) ? getProp(state) : getDefaultValue()); + } + if (className == JsonObject.class) { + return (T) (validateNonblank(state) ? getProp(state) : getDefaultValue()); + } + if (className == Long.class) { + return (T) ((validateNonblank(state)) ? getProp(state) : getDefaultValue()); + } + if (className == String.class) { + return (T) ((validateNonblank(state)) ? getProp(state) : getDefaultValue()); + } + if (className == Integer.class) { + return (T) ((validateNonblank(state)) ? getProp(state) : getDefaultValue()); + } + if (className == Boolean.class) { + return (T) ((validateNonblank(state)) ? getProp(state) : getDefaultValue()); + } + return getProp(state); + } + + public T getDefaultValue() { + if (className == JsonArray.class) { + return (T) new JsonArray(); + } + if (className == JsonObject.class) { + return (T) new JsonObject(); + } + if (className == String.class) { + return (T) StringUtils.EMPTY; + } + if (className == Long.class) { + return (T) Long.valueOf(0L); + } + if (className == Integer.class) { + return (T) Integer.valueOf(0); + } + return null; + } +} +// END of enum MultistageProperties \ No newline at end of file diff --git a/dil/src/main/java/com/linkedin/dil/configuration/StaticConstants.java b/dil/src/main/java/com/linkedin/dil/configuration/StaticConstants.java new file mode 100644 index 0000000..57bc268 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/configuration/StaticConstants.java @@ -0,0 +1,58 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.configuration; + +import com.google.gson.Gson; + + +public interface StaticConstants { + String KEY_WORD_ACTIVATION = "activation"; + String KEY_WORD_ARRAY = "array"; + String KEY_WORD_ARRAY_ITEM = "arrayItem"; + String KEY_WORD_AUTHENTICATION = "authentication"; + String KEY_WORD_BOOLEAN = "boolean"; + String KEY_WORD_CATEGORY = "category"; + String KEY_WORD_COLUMN_NAME = "columnName"; + String KEY_WORD_DATA_IS_NULLABLE = "isNullable"; + String KEY_WORD_DATA_TYPE = "dataType"; + String KEY_WORD_DATA_TYPE_TYPE = "dataType.type"; + String KEY_WORD_EOF = "EOF"; + String KEY_WORD_FIELDS = "fields"; + String KEY_WORD_RANGE_FROM = "from"; + String KEY_WORD_HTTP_OK = "ok"; + String KEY_WORD_INTEGER = "integer"; + String KEY_WORD_ITEMS = "items"; + String KEY_WORD_MAP = "map"; + String KEY_WORD_NAME = "name"; + String KEY_WORD_HTTP_NOTOK = "notok"; + String KEY_WORD_NULL = "null"; + String KEY_WORD_NULLABLE = "nullable"; + String KEY_WORD_NUMBER = "number"; + String KEY_WORD_OBJECT = "object"; + String KEY_WORD_PATH = "path"; + String KEY_WORD_PAYLOAD = "payload"; + String KEY_WORD_PRIMITIVE = "primitive"; + String KEY_WORD_PROPERTIES = "properties"; + String KEY_WORD_RANGE = "range"; + String KEY_WORD_RECORD = "record"; + String KEY_WORD_RETRY = "retry"; + String KEY_WORD_RETRY_COUNT = "retryCount"; + String KEY_WORD_RETRY_DELAY_IN_SEC = "delayInSec"; + String KEY_WORD_ROOT = "root"; + String KEY_WORD_SNAPSHOT_ONLY = "SNAPSHOT_ONLY"; + String KEY_WORD_SOURCE = "source"; + String KEY_WORD_SOURCE_TYPE = "source.type"; + String KEY_WORD_STRING = "string"; + String KEY_WORD_SYMBOLS = "symbols"; + String KEY_WORD_RANGE_TO = "to"; + String KEY_WORD_TIMESTAMP = "timestamp"; + String KEY_WORD_TYPE = "type"; + String KEY_WORD_UNITS = "units"; + String KEY_WORD_UNKNOWN = "unknown"; + String KEY_WORD_VALUES = "values"; + + Gson GSON = new Gson(); + +} diff --git a/dil/src/main/java/com/linkedin/dil/connection/Connection.java b/dil/src/main/java/com/linkedin/dil/connection/Connection.java new file mode 100644 index 0000000..c56ecf5 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/connection/Connection.java @@ -0,0 +1,38 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.connection; + +import com.linkedin.dil.exception.RetriableAuthenticationException; +import com.linkedin.dil.util.WorkUnitStatus; + +/** + * The connection interface defines core methods that an Extractor would call + * to establish a transmission channel with the data provider or data receiver. + * + * @author Chris Li + */ +public interface Connection { + /** + * The common method among all connections, read or write, is the execute(). This + * method expects a work unit status object as input parameter, and it gives out + * a new work unit object as output. 
+ * @param status the input WorkUnitStatus object + * @return the output of the execution in a WorkUnitStatus object + * @throws RetriableAuthenticationException exception to allow retry at higher level + */ + WorkUnitStatus execute(final WorkUnitStatus status) throws RetriableAuthenticationException; + /** + * Close the connection and pool of connections if applicable + * @param message the message to send to the other end of connection upon closing + * @return true if connections are successfully closed, or false if connections are not + * closed successfully + */ + boolean closeAll(final String message); + /** + * Close the current cursor or stream if applicable + * @return true if closeStream was successful, or false if not able to close the stream + */ + boolean closeStream(); +} diff --git a/dil/src/main/java/com/linkedin/dil/connection/HdfsConnection.java b/dil/src/main/java/com/linkedin/dil/connection/HdfsConnection.java new file mode 100644 index 0000000..589035a --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/connection/HdfsConnection.java @@ -0,0 +1,164 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.connection; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Lists; +import java.io.InputStream; +import java.net.URI; +import java.util.List; +import java.util.stream.Collectors; +import lombok.AccessLevel; +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.exception.RetriableAuthenticationException; +import com.linkedin.dil.keys.ExtractorKeys; +import com.linkedin.dil.keys.HdfsKeys; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.util.InputStreamUtils; +import com.linkedin.dil.util.WorkUnitStatus; +import org.apache.gobblin.source.extractor.filebased.FileBasedHelperException; +import org.apache.gobblin.source.extractor.filebased.TimestampAwareFileBasedHelper; +import org.apache.gobblin.source.extractor.hadoop.HadoopFsHelper; + + +/** + * HdfsConnection creates transmission channel with HDFS data provider or HDFS data receiver, + * and it executes commands per Extractor calls. + * + * @author Chris Li + */ +@Slf4j +public class HdfsConnection extends MultistageConnection { + private final static String URI_REGEXP_PATTERN = "RE="; + @Getter + final private HdfsKeys hdfsKeys; + @Setter (AccessLevel.PACKAGE) + private TimestampAwareFileBasedHelper fsHelper; + + public HdfsConnection(State state, JobKeys jobKeys, ExtractorKeys extractorKeys) { + super(state, jobKeys, extractorKeys); + assert jobKeys instanceof HdfsKeys; + hdfsKeys = (HdfsKeys) jobKeys; + } + + /** + * Get a list of files if the URI has pattern match, else read the file at the URI. + * + * In order to perform a list operation and output the list of files, the + * ms.source.uri need to coded like /directory?RE=file-name-pattern. If the purpose + * is to list all files, the file name pattern can be just ".*". + * + * In order to perform a read of a file and output the content of the file as + * InputStream. ms.source.uri need to be full path without RE expression. If a + * partial path is given, then only a single file will be picked because there + * will be a list command performed before the read. A partial path could result + * in multiple files being listed, but then only the first file will be used. 
+ * + * So if the intention is to read a single file, support the full path to ms.source.uri. + * + * @param status prior work unit status + * @return new work unit status + */ + @Override + public WorkUnitStatus execute(final WorkUnitStatus status) { + assert hdfsKeys.getSourceUri() != null; + URI uri = URI.create(getWorkUnitSpecificString(hdfsKeys.getSourceUri(), + getExtractorKeys().getDynamicParameters())); + + if (uri.getPath() == null) { + return status; + } + + if (uri.getQuery() != null && uri.getQuery().matches(URI_REGEXP_PATTERN + ".*")) { + status.setBuffer(InputStreamUtils.convertListToInputStream( + readFileList(uri.getPath(), uri.getQuery().substring(URI_REGEXP_PATTERN.length())))); + } else { + List files = readFileList(uri.getPath(), ".*"); + if (files.size() > 0) { + status.setBuffer(readSingleFile(files.get(0))); + } + } + return status; + } + + /** + * Close the connection to HDFS + * @param message the message to send to the other end of connection upon closing + * @return true if closed successfully, or false + */ + @Override + public boolean closeAll(String message) { + try { + fsHelper.close(); + fsHelper = null; + return true; + } catch (Exception e) { + log.error("Error closing file system connection", e); + return false; + } + } + + /** + * execute the HDFS read command (ls or getFileStream) + * @param workUnitStatus prior work unit status + * @return the updated work unit status + * @throws RetriableAuthenticationException if retry is needed + */ + @Override + public WorkUnitStatus executeFirst(final WorkUnitStatus workUnitStatus) throws RetriableAuthenticationException { + WorkUnitStatus status = super.executeFirst(workUnitStatus); + if (fsHelper == null) { + fsHelper = getHdfsClient(); + } + return execute(status); + } + + /** + * Read a list of files based on the given pattern + * @param path base path of files + * @param pattern file name pattern + * @return a list of paths + */ + private List readFileList(final String path, final String pattern) { + try { + return this.fsHelper.ls(path) + .stream() + .filter(fileName -> fileName.matches(pattern)) + .collect(Collectors.toList()); + } catch (FileBasedHelperException e) { + log.error("Not able to run ls command due to " + e.getMessage(), e); + } + return Lists.newArrayList(); + } + + /** + * Read a single file from HDFS + * @param path full path of the file + * @return the file content in an InputStream + */ + private InputStream readSingleFile(final String path) { + try { + return fsHelper.getFileStream(path); + } catch (FileBasedHelperException e) { + log.error("Not able to run getFileStream command due to " + e.getMessage(), e); + return null; + } + } + + @VisibleForTesting + TimestampAwareFileBasedHelper getHdfsClient() { + TimestampAwareFileBasedHelper fsHelper = new HadoopFsHelper(this.getState()); + try { + fsHelper.connect(); + return fsHelper; + } catch (Exception e) { + log.error("Failed to initialize HdfsSource", e); + return null; + } + } +} diff --git a/dil/src/main/java/com/linkedin/dil/connection/HttpConnection.java b/dil/src/main/java/com/linkedin/dil/connection/HttpConnection.java new file mode 100644 index 0000000..7156649 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/connection/HttpConnection.java @@ -0,0 +1,320 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.connection; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import java.io.Closeable; +import java.io.IOException; +import java.util.List; +import java.util.Map; +import lombok.AccessLevel; +import lombok.Getter; +import lombok.Setter; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.exception.RetriableAuthenticationException; +import com.linkedin.dil.factory.HttpClientFactory; +import com.linkedin.dil.keys.ExtractorKeys; +import com.linkedin.dil.keys.HttpKeys; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.util.HttpRequestMethod; +import com.linkedin.dil.util.JsonUtils; +import com.linkedin.dil.util.WorkUnitStatus; +import org.apache.http.Header; +import org.apache.http.HeaderElement; +import org.apache.http.HttpResponse; +import org.apache.http.client.HttpClient; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.util.EntityUtils; + +import static com.linkedin.dil.configuration.StaticConstants.*; + + +/** + * HttpConnection creates transmission channel with HTTP data provider or HTTP data receiver, + * and it executes commands per Extractor calls. + * + * @author Chris Li + */ +@Slf4j +public class HttpConnection extends MultistageConnection { + @Getter (AccessLevel.PACKAGE) + final private HttpKeys httpSourceKeys; + + @Setter (AccessLevel.PACKAGE) + private HttpClient httpClient; + @Setter (AccessLevel.PACKAGE) + private CloseableHttpResponse response; + + public HttpConnection(State state, JobKeys jobKeys, ExtractorKeys extractorKeys) { + super(state, jobKeys, extractorKeys); + httpClient = getHttpClient(state); + assert jobKeys instanceof HttpKeys; + httpSourceKeys = (HttpKeys) jobKeys; + } + + @Override + public WorkUnitStatus execute(WorkUnitStatus status) throws RetriableAuthenticationException { + return execute(HttpRequestMethod.valueOf(httpSourceKeys.getHttpRequestMethod()), status); + } + + /** + * Thread-safely create HttpClient as needed. This connection object + * is mostly going to be initialized in separate threads; therefore, + * this is more of a precaution. 
+ */ + synchronized HttpClient getHttpClient(State state) { + if (httpClient == null) { + try { + Class factoryClass = Class.forName( + MultistageProperties.MSTAGE_HTTP_CLIENT_FACTORY.getValidNonblankWithDefault(state)); + HttpClientFactory factory = (HttpClientFactory) factoryClass.newInstance(); + httpClient = factory.get(state); + } catch (Exception e) { + log.error("Error creating HttpClient: {}", e.getMessage()); + } + } + return httpClient; + } + + @Override + public WorkUnitStatus executeFirst(WorkUnitStatus workUnitStatus) throws RetriableAuthenticationException { + WorkUnitStatus status = super.executeFirst(workUnitStatus); + return execute(status); + } + + @Override + public WorkUnitStatus executeNext(WorkUnitStatus workUnitStatus) throws RetriableAuthenticationException { + WorkUnitStatus status = super.executeNext(workUnitStatus); + return execute(status); + } + + @VisibleForTesting + @SneakyThrows + WorkUnitStatus execute(HttpRequestMethod command, WorkUnitStatus status) throws RetriableAuthenticationException { + Preconditions.checkNotNull(status, "WorkUnitStatus is not initialized."); + try { + response = retryExecuteHttpRequest(command, + getExtractorKeys().getDynamicParameters()); + } catch (RetriableAuthenticationException e) { + throw e; + } catch (Exception e) { + log.error(e.getMessage(), e); + return null; + } + + // if no exception (error), but warnings, return work unit status as it was + // this will be treated as "request was successful but don't process data records" + if (response == null) { + return status; + } + + // even no error, no warning, we still need to process potential silent failures + try { + status.getMessages().put("contentType", getResponseContentType(response)); + status.getMessages().put("headers", getResponseHeaders(response).toString()); + if (response.getEntity() != null) { + status.setBuffer(response.getEntity().getContent()); + } + } catch (Exception e) { + // Log but ignore errors when getting content and content type + // These errors will lead to a NULL buffer in work unit status + // And that situation will be handled in extractor accordingly + log.error(e.getMessage()); + } + + return status; + } + + @SneakyThrows + private CloseableHttpResponse retryExecuteHttpRequest( + final HttpRequestMethod command, + final JsonObject parameters + ) throws RetriableAuthenticationException { + log.debug("Execute Http {} with parameters:", command.toString()); + for (Map.Entry entry: parameters.entrySet()) { + if (!entry.getKey().equalsIgnoreCase(KEY_WORD_PAYLOAD)) { + log.debug("parameter: {} value: {}", entry.getKey(), entry.getValue()); + } + } + Pair response = executeHttpRequest(command, + httpSourceKeys.getSourceUri(), + parameters, + httpSourceKeys.getHttpRequestHeadersWithAuthentication()); + + if (response.getLeft().equalsIgnoreCase(KEY_WORD_HTTP_OK)) { + log.info("Request was successful, return HTTP response"); + return response.getRight(); + } + + Integer status = response.getRight().getStatusLine().getStatusCode(); + + // treat as warning if: + // status is < 400, and not in error list + // or status is in warning list + // by returning NULL, the task will complete without failure + if (status < 400 && !httpSourceKeys.getHttpStatuses().getOrDefault("error", Lists.newArrayList()).contains(status) + || httpSourceKeys.getHttpStatuses().getOrDefault("warning", Lists.newArrayList()).contains(status)) { + log.warn("Request was successful with warnings, return NULL response"); + return null; + } + + // checks if there is an error related to 
retrieving the access token or + // whether it has expired between pagination + List paginationErrors = httpSourceKeys.getHttpStatuses().getOrDefault( + "pagination_error", Lists.newArrayList()); + if (getJobKeys().getIsSecondaryAuthenticationEnabled() && paginationErrors.contains(status)) { + log.info("Request was unsuccessful, and needed retry with new authentication credentials"); + log.info("Sleep {} seconds, waiting for credentials to refresh", getJobKeys().getRetryDelayInSec()); + throw new RetriableAuthenticationException("Stale authentication token."); + } + + // every other error that should fail the job + throw new RuntimeException("Error in executing HttpRequest: " + status.toString()); + } + + /** + * Execute the request and return the response when everything goes OK, or null when + * there are warnings, or raising runtime exception if any error. + * + * Successful if the response status code is one of the codes in ms.http.statuses.success and + * the response status reason is not one of the codes in ms.http.status.reasons.error. + * + * Warning means the response cannot be process by the Extractor, and the task need to + * terminate, but it should not fail the job. Status codes below 400 are considered as warnings + * in general, but exceptions can be made by putting 4XX or 5XX codes in ms.http.statuses.warning + * configuration. + * + * Error means the response cannot be process by the Extractor, and the task need to be terminated, + * and the job should fail. Status codes 400 and above are considered as errors in general, but + * exceptions can be made by putting 4XX or 5XX codes in ms.http.statuses.success or ms.http.statuses.warning, + * or by putting 2XX and 3XX codes in ms.http.statuses.error. + * + * @param command the HttpRequestMethod object + * @param httpUriTemplate the Uri template + * @param parameters Http Request parameters + * @param headers additional Http Request headers + * @return a overall status and response pair, the overall status will be OK if status code is one of the + * success status codes, anything else, including warnings, are considered as NOT OK + */ + private Pair executeHttpRequest(final HttpRequestMethod command, + final String httpUriTemplate, final JsonObject parameters, final Map headers) { + // trying to make a Http request, capture the client side error and + // fail the task if any encoding exception or IO exception + CloseableHttpResponse response; + try { + JsonObject payloads = new JsonObject(); + JsonObject queryParameters = new JsonObject(); + for (Map.Entry entry: parameters.entrySet()) { + if (entry.getKey().equalsIgnoreCase(KEY_WORD_PAYLOAD)) { + payloads = JsonUtils.deepCopy(entry.getValue()).getAsJsonObject(); + } else { + queryParameters.add(entry.getKey(), entry.getValue()); + } + } + response = (CloseableHttpResponse) httpClient.execute( + command.getHttpRequest(httpUriTemplate, queryParameters, headers, payloads)); + } catch (Exception e) { + throw new RuntimeException(e.getMessage(), e); + } + + // fail the task if response object is null + Preconditions.checkNotNull(response, "Error in executing HttpRequest: response is null"); + + // only pass the response stream to extractor when the status code and reason code all + // indicate a success or there is a pagination error i.e. token has expired in between the pagination calls (in that + // it will retry accessing the token by passing the response object back). 
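      // Editor's illustration, not part of the original change: assuming a hypothetical setting of
      //   ms.http.statuses={"success": [200, 201], "warning": [404], "error": [403]}
      // a 200 or 201 response is handed to the extractor below, retryExecuteHttpRequest() turns a
      // 404 into a quiet completion with no records, and a 403 fails the job; any reason phrase
      // listed under ms.http.status.reasons.error turns an otherwise successful status code into a
      // failure, as described in the javadoc above.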
+ Integer status = response.getStatusLine().getStatusCode(); + String reason = response.getStatusLine().getReasonPhrase(); + log.info("processing status: {} and reason: {}", status, reason); + if (httpSourceKeys.getHttpStatuses().getOrDefault("success", Lists.newArrayList()).contains(status) + && !httpSourceKeys.getHttpStatusReasons().getOrDefault("error", Lists.newArrayList()).contains(reason)) { + log.info("Request was successful, returning OK and HTTP response."); + return Pair.of(KEY_WORD_HTTP_OK, response); + } + + // trying to consume the response stream and close it, + // and fail the job if IOException happened during the process + if (null != response.getEntity()) { + try { + reason += StringUtils.LF + EntityUtils.toString(response.getEntity()); + log.error("Status code: {}, reason: {}", status, reason); + response.close(); + } catch (IOException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + log.warn("Request was unsuccessful, returning NOTOK and HTTP response"); + return Pair.of(KEY_WORD_HTTP_NOTOK, response); + } + + /** + * Get the content type string from response + * @param response HttpResponse + * @return the content type if available, otherwise, an empty string + */ + private String getResponseContentType(HttpResponse response) { + if (response.getEntity() != null + && response.getEntity().getContentType() != null) { + HeaderElement[] headerElements = response.getEntity().getContentType().getElements(); + if (headerElements.length > 0) { + return headerElements[0].getName(); + } + } + return StringUtils.EMPTY; + } + + /** + * Get all headers from response + * @param response HttpResponse + * @return the headers in a JsonObject format, otherwise, an empty JsonObject + */ + private JsonObject getResponseHeaders(HttpResponse response) { + JsonObject headers = new JsonObject(); + if (response.getAllHeaders() != null) { + for (Header header : response.getAllHeaders()) { + headers.addProperty(header.getName(), header.getValue()); + } + } + return headers; + } + + @Override + public boolean closeStream() { + log.info("Closing InputStream for {}", getExtractorKeys().getSignature()); + try { + if (response != null) { + response.close(); + } + } catch (Exception e) { + log.warn("Error closing the input stream", e); + return false; + } + return true; + } + + + @Override + public boolean closeAll(String message) { + try { + if (this.httpClient instanceof Closeable) { + ((Closeable) this.httpClient).close(); + httpClient = null; + } + } catch (IOException e) { + log.error("error closing HttpSource {}", e.getMessage()); + return false; + } + return true; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/connection/JdbcConnection.java b/dil/src/main/java/com/linkedin/dil/connection/JdbcConnection.java new file mode 100644 index 0000000..9d24103 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/connection/JdbcConnection.java @@ -0,0 +1,270 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
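Both HttpConnection above and JdbcConnection below obtain their low-level clients from a factory class that is named in job configuration and instantiated by reflection. A minimal sketch of a custom HTTP factory follows, assuming HttpClientFactory is an interface with the single get(State) method used above; the class name and property key are illustrative assumptions.

import org.apache.gobblin.configuration.State;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import com.linkedin.dil.factory.HttpClientFactory;

// Hypothetical example factory; not part of this change.
public class ExampleHttpClientFactory implements HttpClientFactory {
  @Override
  public HttpClient get(State state) {
    // a real factory could configure proxies, timeouts, or retries from the job State here
    return HttpClientBuilder.create().build();
  }
}

A job would then point the MSTAGE_HTTP_CLIENT_FACTORY property at com.example.ExampleHttpClientFactory; the exact property key string is not shown in this change.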
+ +package com.linkedin.dil.connection; + +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; +import java.io.ByteArrayInputStream; +import java.nio.charset.StandardCharsets; +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.ResultSetMetaData; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import lombok.AccessLevel; +import lombok.Getter; +import lombok.NonNull; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.StringEscapeUtils; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.exception.RetriableAuthenticationException; +import com.linkedin.dil.factory.JdbcClientFactory; +import com.linkedin.dil.keys.ExtractorKeys; +import com.linkedin.dil.keys.JdbcKeys; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.util.JdbcUtils; +import com.linkedin.dil.util.ParameterTypes; +import com.linkedin.dil.util.SchemaBuilder; +import com.linkedin.dil.util.WorkUnitStatus; + +/** + * JdbcConnection creates transmission channel with JDBC data provider or JDBC data receiver, + * and it executes commands per Extractor calls. + * + * @author Chris Li + */ +@Slf4j +public class JdbcConnection extends MultistageConnection { + @Getter(AccessLevel.PACKAGE) + @Setter(AccessLevel.PACKAGE) + private JdbcKeys jdbcSourceKeys; + @Getter(AccessLevel.PACKAGE) + @Setter(AccessLevel.PACKAGE) + private Connection jdbcConnection; + + public JdbcConnection(State state, JobKeys jobKeys, ExtractorKeys extractorKeys) { + super(state, jobKeys, extractorKeys); + assert jobKeys instanceof JdbcKeys; + jdbcSourceKeys = (JdbcKeys) jobKeys; + } + + @Override + public WorkUnitStatus execute(WorkUnitStatus status) { + try { + return executeStatement( + getWorkUnitSpecificString(jdbcSourceKeys.getJdbcStatement(), getExtractorKeys().getDynamicParameters()), + status); + } catch (Exception e) { + log.error(e.getMessage(), e); + return null; + } + } + + @Override + public boolean closeAll(String message) { + try { + if (jdbcConnection != null) { + jdbcConnection.close(); + jdbcConnection = null; + } + } catch (Exception e) { + log.error("Error closing the input stream", e); + return false; + } + return true; + } + + @Override + public WorkUnitStatus executeFirst(WorkUnitStatus workUnitStatus) throws RetriableAuthenticationException { + WorkUnitStatus status = super.executeFirst(workUnitStatus); + jdbcConnection = getJdbcConnection(getState()); + return jdbcConnection != null ? execute(status) : null; + } + + @Override + public WorkUnitStatus executeNext(WorkUnitStatus workUnitStatus) throws RetriableAuthenticationException { + WorkUnitStatus status = super.executeNext(workUnitStatus); + jdbcConnection = jdbcConnection == null ? getJdbcConnection(getState()) : jdbcConnection; + return jdbcConnection != null ? 
execute(status) : null; + } + + /** + * Create jdbcConnection for work unit in thread-safe mode + */ + private synchronized Connection getJdbcConnection(State state) { + try { + Class factoryClass = Class.forName(MultistageProperties.MSTAGE_JDBC_CLIENT_FACTORY.getValidNonblankWithDefault(state)); + JdbcClientFactory factory = (JdbcClientFactory) factoryClass.newInstance(); + + return factory.getConnection( + jdbcSourceKeys.getSourceUri(), + MultistageProperties.SOURCE_CONN_USERNAME.getValidNonblankWithDefault(state), + MultistageProperties.SOURCE_CONN_PASSWORD.getValidNonblankWithDefault(state), + state); + } catch (Exception e) { + log.error("Error creating Jdbc connection: {}", e.getMessage()); + } + return null; + } + + /** + * Execute the user provided statement and put the result as an InputStream in WorkUnitStatus + * + * The content of the InputStream can be either a JsonArray or a CSV buffer, depends on the extractor + * configuration. The reason it is devised so is that we would use JsonExtractor to handle nested data, + * and use CsvExtractor to handle larger data volume. + * + * Use case developers should decide on which option to use based on the payload by setting + * ms.extractor.class accordingly + * + * Pagination is one way to control the batch size, here we fetch the page in once. + * This control only makes sense when Limit clause is not present in the SQL statement. + * + * When Limit Offset is used in the SQL statement, page size = result set size. + * + * For better performance optimization, please use: + * 1. time watermark partitioning if an date time index is available on the table + * 2. unit watermarks if any attributes can be used effectively to breakdown data ingestion to smaller chunks + * see ms.watermarks and go/dil-doc for details + * + * @param query the query to be executed + * @param wuStatus the input work unit status + * @return the updated work unit status object + * @throws SQLException extractor shall handle this exception and fail the work unit + */ + @SuppressFBWarnings + private WorkUnitStatus executeStatement( + String query, + WorkUnitStatus wuStatus) throws SQLException { + + log.info("Executing SQL statement: {}", query); + Statement stmt = jdbcConnection.createStatement(); + + if (jdbcSourceKeys.isPaginationEnabled()) { + try { + stmt.setFetchSize(jdbcSourceKeys.getPaginationInitValues().get(ParameterTypes.PAGESIZE).intValue()); + } catch (SQLException e) { + log.warn("not able to set fetch size"); + } + } + + if (stmt.execute(query)) { + ResultSet resultSet = stmt.getResultSet(); + if (MultistageProperties.MSTAGE_EXTRACTOR_CLASS.getValidNonblankWithDefault(getState()).toString() + .matches(".*JsonExtractor.*")) { + wuStatus.setBuffer(new ByteArrayInputStream(toJson(resultSet, + resultSet.getMetaData()).toString().getBytes(StandardCharsets.UTF_8))); + } else if (MultistageProperties.MSTAGE_EXTRACTOR_CLASS.getValidNonblankWithDefault(getState()).toString() + .matches(".*CsvExtractor.*")) { + wuStatus.setBuffer(new ByteArrayInputStream(toCsv(resultSet, + resultSet.getMetaData()).getBytes(StandardCharsets.UTF_8))); + } else { + stmt.close(); + throw new UnsupportedOperationException(); + } + // if source schema is not present, try retrieving the source schema and store in the work unit message + // this also prevents from processing source schema repeatedly in the pagination scenario + if (!jdbcSourceKeys.hasSourceSchema()) { + wuStatus.getMessages().put("schema", retrieveSchema(resultSet.getMetaData()).toString()); + } + } + stmt.close(); + 
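    // Editor's illustration, not part of the original change: the branch above keys off the
    // configured extractor class. A hypothetical job reading nested data might set
    //   ms.extractor.class=com.linkedin.dil.extractor.JsonExtractor   (buffer holds a JsonArray)
    // while a high-volume job might instead use
    //   ms.extractor.class=com.linkedin.dil.extractor.CsvExtractor    (buffer holds CSV text)
    // The fully qualified class names are assumptions for illustration; when pagination is
    // enabled, the configured initial page size is also applied above as the JDBC fetch size.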
return wuStatus; + } + + /** + * Converts a ResultSet to a JsonArray + * + * for nested dataset, this is more preferred + * + * @param resultSet the input result set + * @param resultSetMetadata the result set metadata + * @return the converted JsonArray + * @throws SQLException SQL Exception from processing ResultSet + */ + private JsonArray toJson(final ResultSet resultSet, final ResultSetMetaData resultSetMetadata) throws SQLException { + JsonArray jsonArray = new JsonArray(); + while (resultSet.next()) { + JsonObject jsonObject = new JsonObject(); + for (int i = 0; i < resultSetMetadata.getColumnCount(); i++) { + jsonObject.addProperty(getColumnName(resultSetMetadata, i + 1), JdbcUtils.parseColumnAsString(resultSet, resultSetMetadata, i + 1)); + } + jsonArray.add(jsonObject); + } + return jsonArray; + } + + /** + * Converts a ResultSet to CSV + * + * for large dataset, this is more preferred + * + * @param resultSet the input result set + * @param resultSetMetadata the result set metadata + * @return a 2-dimensional string matrix representing a CSV file + * @throws SQLException SQL Exception from processing ResultSet + */ + @NonNull + private String toCsv(final ResultSet resultSet, final ResultSetMetaData resultSetMetadata) throws SQLException { + StringBuilder builder = new StringBuilder(); + + while (resultSet.next()) { + for (int i = 0; i < resultSetMetadata.getColumnCount(); i++) { + builder.append(StringEscapeUtils.escapeCsv(JdbcUtils.parseColumnAsString(resultSet, resultSetMetadata, i + 1))); + if (i < resultSetMetadata.getColumnCount() - 1) { + builder.append(jdbcSourceKeys.getSeparator()); + } else { + builder.append(System.lineSeparator()); + } + } + } + return builder.toString(); + } + + /** + * Retrieve schema info from metadata + * @param resultSetMetadata result set metadata + * @return schema in JsonArray format + * @throws SQLException SQL Exception from processing ResultSet + */ + private JsonArray retrieveSchema(final ResultSetMetaData resultSetMetadata) throws SQLException { + List columns = new ArrayList<>(); + for (int i = 0; i < resultSetMetadata.getColumnCount(); i++) { + boolean nullable = resultSetMetadata.isNullable(i + 1) == ResultSetMetaData.columnNullable; + columns.add(new SchemaBuilder(getColumnName(resultSetMetadata, i + 1), + SchemaBuilder.PRIMITIVE, nullable, new ArrayList<>()).setPrimitiveType( + JdbcUtils.parseColumnType(resultSetMetadata.getColumnType(i + 1), nullable).getAltName())); + } + return new SchemaBuilder(SchemaBuilder.RECORD, false, columns) + .buildAltSchema(new HashMap<>(), + getJobKeys().isEnableCleansing(), + getJobKeys().getSchemaCleansingPattern(), + getJobKeys().getSchemaCleansingReplacement(), + getJobKeys().getSchemaCleansingNullable()) + .getAsJsonArray(); + } + + /** + * convert column names if required + * @param resultSetMetadata the result set schema + * @param index1 the 1 based index of the column + * @return the column name after conversion + * @throws SQLException + */ + private String getColumnName(final ResultSetMetaData resultSetMetadata, int index1) throws SQLException { + if (jdbcSourceKeys.getSchemaRefactorFunction().equalsIgnoreCase("toupper")) { + return resultSetMetadata.getColumnName(index1).toUpperCase(); + } else if (jdbcSourceKeys.getSchemaRefactorFunction().equalsIgnoreCase("tolower")) { + return resultSetMetadata.getColumnName(index1).toLowerCase(); + } + return resultSetMetadata.getColumnName(index1); + } +} diff --git a/dil/src/main/java/com/linkedin/dil/connection/MultistageConnection.java 
b/dil/src/main/java/com/linkedin/dil/connection/MultistageConnection.java new file mode 100644 index 0000000..3275b9a --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/connection/MultistageConnection.java @@ -0,0 +1,115 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.connection; + +import com.google.gson.JsonObject; +import lombok.Getter; +import lombok.Setter; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.exception.RetriableAuthenticationException; +import com.linkedin.dil.keys.ExtractorKeys; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.util.VariableUtils; +import com.linkedin.dil.util.WorkUnitStatus; + +/** + * MultistageConnection is a basic implementation of the Connection interface. + * + * @author Chris Li + */ +@Slf4j +public class MultistageConnection implements Connection { + @Getter @Setter private State state = null; + @Getter @Setter private JobKeys jobKeys = null; + @Getter @Setter private ExtractorKeys extractorKeys = null; + + public MultistageConnection(State state, JobKeys jobKeys, ExtractorKeys extractorKeys) { + this.setJobKeys(jobKeys); + this.setState(state); + this.setExtractorKeys(extractorKeys); + } + + /** + * Default execute method + * @param status prior work unit status + * @return new work unit status + */ + @Override + public WorkUnitStatus execute(final WorkUnitStatus status) throws RetriableAuthenticationException { + return status.toBuilder().build(); + } + + /** + * Close the connection and pool of connections if applicable, default + * implementation does nothing. + * @param message the message to send to the other end of connection upon closing + * @return true (default) + */ + @Override + public boolean closeAll(final String message) { + return true; + } + + /** + * Close the current cursor or stream if applicable, default + * implementation does nothing.
+ * @return true (default) + */ + @Override + public boolean closeStream() { + return true; + } + + public JsonObject getWorkUnitParameters() { + return null; + } + + /** + * Default implementation of a multistage read connection + * @param workUnitStatus prior work unit status + * @return new work unit status + */ + @SneakyThrows + public WorkUnitStatus executeFirst(final WorkUnitStatus workUnitStatus) throws RetriableAuthenticationException { + return WorkUnitStatus.builder().build(); + } + + public WorkUnitStatus executeNext(final WorkUnitStatus workUnitStatus) throws RetriableAuthenticationException { + try { + Thread.sleep(jobKeys.getCallInterval()); + } catch (Exception e) { + log.warn(e.getMessage()); + } + log.info("Starting a new request to the source, work unit = {}", extractorKeys.getSignature()); + log.debug("Prior parameters: {}", extractorKeys.getDynamicParameters().toString()); + log.debug("Prior work unit status: {}", workUnitStatus.toString()); + return workUnitStatus; + } + + /** + * This method applies the work unit parameters to string template, and + * then return a work unit specific string + * + * @param template the template string + * @param parameters the parameters with all variables substituted + * @return work unit specific string + */ + protected String getWorkUnitSpecificString(String template, JsonObject parameters) { + String finalString = template; + try { + // substitute with parameters defined in ms.parameters and activation parameters + finalString = VariableUtils.replaceWithTracking( + finalString, + parameters, + false).getKey(); + } catch (Exception e) { + log.error("Error getting work unit specific string " + e); + } + log.info("Final work unit specific string: {}", finalString); + return finalString; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/connection/S3Connection.java b/dil/src/main/java/com/linkedin/dil/connection/S3Connection.java new file mode 100644 index 0000000..0db233d --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/connection/S3Connection.java @@ -0,0 +1,198 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
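The getWorkUnitSpecificString() helper above substitutes work-unit variables into templates such as ms.source.uri or, in the S3Connection that follows, the object prefix. A minimal sketch, assuming the placeholder form accepted by VariableUtils.replaceWithTracking(); the variable name, values, and the {{...}} syntax are all assumptions for illustration.

// Hypothetical fragment; would live inside a MultistageConnection subclass.
JsonObject parameters = new JsonObject();
parameters.addProperty("dateString", "2021/04/01");  // "dateString" is an invented variable name
String resolved = getWorkUnitSpecificString("landing/{{dateString}}/", parameters);
// resolved would then read "landing/2021/04/01/" and is what gets listed or downloaded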
+ +package com.linkedin.dil.connection; + +import com.google.common.collect.Lists; +import java.net.URI; +import java.time.Duration; +import java.util.List; +import java.util.stream.Collectors; +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang.StringUtils; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.exception.RetriableAuthenticationException; +import com.linkedin.dil.factory.S3ClientFactory; +import com.linkedin.dil.keys.ExtractorKeys; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.keys.S3Keys; +import com.linkedin.dil.util.EncryptionUtils; +import com.linkedin.dil.util.InputStreamUtils; +import com.linkedin.dil.util.WorkUnitStatus; +import software.amazon.awssdk.auth.credentials.AnonymousCredentialsProvider; +import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; +import software.amazon.awssdk.auth.credentials.AwsCredentials; +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider; +import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; +import software.amazon.awssdk.core.ResponseInputStream; +import software.amazon.awssdk.core.sync.ResponseTransformer; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.GetObjectRequest; +import software.amazon.awssdk.services.s3.model.GetObjectResponse; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Response; +import software.amazon.awssdk.utils.AttributeMap; + +import static software.amazon.awssdk.http.SdkHttpConfigurationOption.*; + +/** + * S3Connection creates transmission channel with AWS S3 data provider or AWS S3 data receiver, + * and it executes commands per Extractor calls. + * + * @author Chris Li + */ +@Slf4j +public class S3Connection extends MultistageConnection { + @Getter final private S3Keys s3SourceV2Keys; + @Setter private S3Client s3Client = null; + + public S3Connection(State state, JobKeys jobKeys, ExtractorKeys extractorKeys) { + super(state, jobKeys, extractorKeys); + assert jobKeys instanceof S3Keys; + s3SourceV2Keys = (S3Keys) jobKeys; + } + + @Override + public WorkUnitStatus execute(WorkUnitStatus status) { + return null; + } + + @Override + public boolean closeAll(String message) { + return true; + } + + /* + Below is the logic of when to download a file and when to list similar files based on the uri and pattern + ms.source.file.pattern + if Is not blank: + List the S3 keys and output as CSV + + if Is blank: + ms.extract.target.file.name? 
+ If is blank: + List the S3 keys and output as CSV + If is not blank: + If ms.source.uri prefix produces only 1 file: + dump the S3 object into the given output file name + If ms.source.uir prefix produces more than 1 file: + dump only the file where prefix = object key, and ignore all other objects + */ + @Override + public WorkUnitStatus executeFirst(WorkUnitStatus workUnitStatus) throws RetriableAuthenticationException { + WorkUnitStatus status = super.executeFirst(workUnitStatus); + s3Client = getS3HttpClient(getState()); + + String finalPrefix = getWorkUnitSpecificString(s3SourceV2Keys.getPrefix(), getExtractorKeys().getDynamicParameters()); + log.debug("Final Prefix to get files list: {}", finalPrefix); + try { + List files = getFilesList(finalPrefix); + boolean isObjectWithPrefixExist = files.stream().anyMatch(objectKey -> objectKey.equals(finalPrefix)); + log.debug("Number of files identified: {}", files.size()); + + if (StringUtils.isNotBlank(s3SourceV2Keys.getFilesPattern())) { + List filteredFiles = files.stream() + .filter(fileName -> fileName.matches(s3SourceV2Keys.getFilesPattern())) + .collect(Collectors.toList()); + status.setBuffer(InputStreamUtils.convertListToInputStream(filteredFiles)); + } else { + if (StringUtils.isBlank(s3SourceV2Keys.getTargetFilePattern())) { + status.setBuffer(InputStreamUtils.convertListToInputStream(files)); + } else { + String fileToDownload = ""; + if (files.size() == 1) { + fileToDownload = files.get(0); + } else if (isObjectWithPrefixExist) { + fileToDownload = finalPrefix; + } + if (StringUtils.isNotBlank(fileToDownload)) { + log.debug("Downloading file: {}", files.get(0)); + GetObjectRequest getObjectRequest = + GetObjectRequest.builder().bucket(s3SourceV2Keys.getBucket()).key(files.get(0)).build(); + ResponseInputStream response = + s3Client.getObject(getObjectRequest, ResponseTransformer.toInputStream()); + status.setBuffer(response); + } else { + log.warn("Invalid set of parameters. To list down files from a bucket, pattern " + + "parameter is needed and to get object from s3 source target file name is needed."); + } + } + } + } catch (Exception e) { + log.error("Unexpected Exception", e); + return null; + } + return status; + } + + /** + * Thread-safely create S3Client as needed + */ + synchronized S3Client getS3HttpClient(State state) { + if (s3Client == null) { + try { + Class factoryClass = Class.forName(MultistageProperties.MSTAGE_S3_CLIENT_FACTORY.getValidNonblankWithDefault(state)); + S3ClientFactory factory = (S3ClientFactory) factoryClass.newInstance(); + + Integer connectionTimeout = s3SourceV2Keys.getConnectionTimeout(); + AttributeMap config = connectionTimeout == null ? 
GLOBAL_HTTP_DEFAULTS + : GLOBAL_HTTP_DEFAULTS.toBuilder() + .put(CONNECTION_TIMEOUT, Duration.ofSeconds(connectionTimeout)) + .build(); + + s3Client = S3Client.builder() + .region(this.s3SourceV2Keys.getRegion()) + .endpointOverride(URI.create(s3SourceV2Keys.getEndpoint())) + .httpClient(factory.getHttpClient(state, config)) + .credentialsProvider(getCredentialsProvider(state)) + .build(); + } catch (Exception e) { + log.error("Error creating S3 Client: {}", e.getMessage()); + } + } + return s3Client; + } + + /** + * retrieve a list of objects given a bucket name and a prefix + * @return list of object keys + */ + private List getFilesList(String finalPrefix) { + List files = Lists.newArrayList(); + ListObjectsV2Request.Builder builder = + ListObjectsV2Request.builder().bucket(s3SourceV2Keys.getBucket()).maxKeys(s3SourceV2Keys.getMaxKeys()); + + if (!finalPrefix.isEmpty()) { + builder.prefix(finalPrefix); + } + ListObjectsV2Request request = builder.build(); + ListObjectsV2Response listObjectsV2Response = null; + + log.debug("Listing object by prefix: {}", finalPrefix); + do { + if (listObjectsV2Response != null) { + request = builder.continuationToken(listObjectsV2Response.continuationToken()).build(); + } + listObjectsV2Response = s3Client.listObjectsV2(request); + listObjectsV2Response.contents().forEach(f -> { + files.add(f.key()); + }); + } while (listObjectsV2Response.isTruncated()); + return files; + } + + public AwsCredentialsProvider getCredentialsProvider(State state) { + AwsCredentialsProvider credentialsProvider = AnonymousCredentialsProvider.create(); + if (StringUtils.isNotBlank(s3SourceV2Keys.getAccessKey()) || StringUtils.isNotEmpty(s3SourceV2Keys.getSecretId())) { + AwsCredentials credentials = + AwsBasicCredentials.create(EncryptionUtils.decryptGobblin(s3SourceV2Keys.getAccessKey(), state), + EncryptionUtils.decryptGobblin(s3SourceV2Keys.getSecretId(), state)); + credentialsProvider = StaticCredentialsProvider.create(credentials); + } + return credentialsProvider; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/converter/AvroNormalizerConverter.java b/dil/src/main/java/com/linkedin/dil/converter/AvroNormalizerConverter.java new file mode 100644 index 0000000..c3dc585 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/converter/AvroNormalizerConverter.java @@ -0,0 +1,208 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.converter; + +import com.google.common.base.Optional; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.gobblin.configuration.WorkUnitState; +import org.apache.gobblin.converter.Converter; +import org.apache.gobblin.converter.SchemaConversionException; +import org.apache.gobblin.converter.SingleRecordIterable; +import org.apache.gobblin.converter.avro.UnsupportedDateTypeException; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.util.AvroSchemaUtils; +import org.apache.gobblin.util.AvroUtils; +import org.apache.gobblin.util.EmptyIterable; + +import static com.linkedin.dil.configuration.StaticConstants.*; + + +/** + * AvroNormalizerConverter normalizes records in GenericRecord format. 
The source + * is expected to be an array of GenericRecord. And the converter is + * fed with GenericRecord one by one. And at the end, an explicit EOF is + * expected so that the Normalizer can write the normalized data to downstream. + * + * The converter depends on Target Schema, which can share some common fields with + * the source (input) schema, and one (1) Normalized Field, which is the first + * field in the target (output) schema but not in the source schema. All common fields + * between source schema and target schema will have values copied from the very first + * record of a conversion series. + * + * At the end of a conversion series, the normalized records are added to output record + * as an Avro array under the Normalized Field. The normalized records include all + * fields that are not in Target Schema. + * + * We can control the batch size of normalized record using ms.normalizer.max.records.per.batch. + * By default the batch size is 500. + */ +public class AvroNormalizerConverter extends Converter { + final private Set outputFields = new HashSet<>(); + private GenericData.Array normalized; + private String normalizedField; + private int maxRecordsPerBatch; + private JsonArray targetSchema; + private GenericRecord firstRecord; + // schema of a normalizing field record, which are fields not in the output schema + // and need to be pushed into the normalized field + private Schema normalizingFieldsRecordSchema; + // schema of the array containing all normalizing field records, i.e. array[GenericRecord] + private Schema normalizingFieldsArraySchema; + // schema of the final normalized record, e.g. {asIs:string, normalized:array[GenericRecord]} + private Schema normalizedRecordSchema; + private boolean haveIntermediateSchemas = false; + + + @Override + public Converter init(WorkUnitState workUnit) { + // Avro Array's max capacity is max int. In case of overflow, use the default value 500. 
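    // Editor's illustration, not part of the original change: ms.normalizer.max.records.per.batch
    // caps how many incoming GenericRecords are folded into a single output record under the
    // normalized field. A hypothetical setting of 100 would emit one normalized record per 100
    // inputs, with any remainder flushed when the explicit EOF record arrives; left unset, the
    // default of 500 documented above applies.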
+ try { + maxRecordsPerBatch = + Math.toIntExact(MultistageProperties.MSTAGE_NORMALIZER_BATCH_SIZE.getValidNonblankWithDefault(workUnit)); + } catch (ArithmeticException e) { + maxRecordsPerBatch = 500; + } + + targetSchema = MultistageProperties.MSTAGE_TARGET_SCHEMA.getValidNonblankWithDefault(workUnit); + return this; + } + + @Override + public Schema convertSchema(Schema schema, WorkUnitState workUnitState) throws SchemaConversionException { + Schema finalSchema = null; + for (JsonElement element : targetSchema) { + String columnName = element.getAsJsonObject().get(KEY_WORD_COLUMN_NAME).getAsString(); + outputFields.add(columnName); + if (normalizedField == null && !schemaSearch(schema, columnName)) { + normalizedField = columnName; + } + } + + if (!haveIntermediateSchemas) { + buildIntermediateSchemas(schema); + } + + try { + finalSchema = AvroSchemaUtils.fromJsonSchema(targetSchema, workUnitState); + } catch (UnsupportedDateTypeException e) { + throw new SchemaConversionException(e); + } + return finalSchema; + } + + @Override + public Iterable convertRecord(Schema schema, GenericRecord inputRecord, WorkUnitState workUnitState) { + Optional eof = AvroUtils.getFieldValue(inputRecord, KEY_WORD_EOF); + if (eof.isPresent() && eof.get().toString().equals(KEY_WORD_EOF)) { + // only output when there's at least one record + return outputIterable(1); + } + // note: the common fields among records will have the same value, so we only need to retain one record + if (firstRecord == null) { + firstRecord = inputRecord; + } + normalized.add(getNormalizingFields(inputRecord)); + return outputIterable(maxRecordsPerBatch); + } + + /** + * Output a single record iterable when the size of the normalized array has reached the threshold + * and empty iterable otherwise. 
+ * @param threshold the threshold to output + * @return iterable of generic record + */ + private Iterable outputIterable(int threshold) { + if (normalized.size() >= threshold) { + return new SingleRecordIterable<>(buildNormalizedRecord()); + } else { + return new EmptyIterable<>(); + } + } + + /** + * Utility method to build all intermediate schemas + * @param schema inputSchema + */ + private void buildIntermediateSchemas(Schema schema) { + // build normalizing fields' schema + normalizingFieldsRecordSchema = + Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), false); + // build normalized record's schema + normalizedRecordSchema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), false); + + // populate fields for the record containing normalized fields and the normalized record + List normalizingFields = new ArrayList<>(); + List normalizedRecordsFields = new ArrayList<>(); + for (Schema.Field field : schema.getFields()) { + if (outputFields.contains(field.name())) { + normalizedRecordsFields.add(AvroSchemaUtils.deepCopySchemaField(field)); + } else { + normalizingFields.add(AvroSchemaUtils.deepCopySchemaField(field)); + } + } + normalizingFieldsRecordSchema.setFields(normalizingFields); + // create the normalized field array + normalizingFieldsArraySchema = Schema.createArray(normalizingFieldsRecordSchema); + // add the normalized field to final schema + normalizedRecordsFields.add( + new Schema.Field(normalizedField, normalizingFieldsArraySchema, normalizedField, null)); + normalizedRecordSchema.setFields(normalizedRecordsFields); + normalized = new GenericData.Array<>(maxRecordsPerBatch, normalizingFieldsArraySchema); + + haveIntermediateSchemas = true; + } + + /** + * Distill those fields that are to be normalized + * @param inputRecord the input record + * @return the part of fields to be normalized + */ + private GenericRecord getNormalizingFields(GenericRecord inputRecord) { + GenericRecord normalizingFieldsRecord = new GenericData.Record(normalizingFieldsRecordSchema); + // copy values from input record to normalizing fields record + // fields not found in the input record are padded with null + for (String fieldName : AvroSchemaUtils.getSchemaFieldNames(normalizingFieldsRecordSchema)) { + Optional fieldValue = AvroUtils.getFieldValue(inputRecord, fieldName); + normalizingFieldsRecord.put(fieldName, fieldValue.isPresent() ? fieldValue.get() : null); + } + return normalizingFieldsRecord; + } + + /** + * Build a final normalized record + * @return the normalized record + */ + private GenericRecord buildNormalizedRecord() { + GenericRecord normalizedRecord = new GenericData.Record(normalizedRecordSchema); + for (String fieldName : AvroSchemaUtils.getSchemaFieldNames(firstRecord.getSchema())) { + if (outputFields.contains(fieldName)) { + Optional fieldValue = AvroUtils.getFieldValue(firstRecord, fieldName); + normalizedRecord.put(fieldName, fieldValue.isPresent() ? 
fieldValue.get() : null); + } + } + normalizedRecord.put(normalizedField, normalized); + // reset the buffer + normalized = new GenericData.Array<>(maxRecordsPerBatch, normalizingFieldsArraySchema); + return normalizedRecord; + } + + /** + * search if a given column is in the schema + * @param schema the schema in Avro Schema format + * @param name the column name + * @return true if the column is in the schema, otherwise false + */ + private boolean schemaSearch(Schema schema, String name) { + return AvroSchemaUtils.getSchemaFieldNames(schema).contains(name); + } +} diff --git a/dil/src/main/java/com/linkedin/dil/converter/JsonNormalizerConverter.java b/dil/src/main/java/com/linkedin/dil/converter/JsonNormalizerConverter.java new file mode 100644 index 0000000..ee97722 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/converter/JsonNormalizerConverter.java @@ -0,0 +1,168 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.converter; + +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import org.apache.gobblin.configuration.WorkUnitState; +import org.apache.gobblin.converter.Converter; +import org.apache.gobblin.converter.SingleRecordIterable; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.util.JsonUtils; +import org.apache.gobblin.util.EmptyIterable; + +import static com.linkedin.dil.configuration.StaticConstants.*; + + +/** + * JsonNormalizerConverter normalizes records in JsonObject format. The source + * is expected to be an array of records, or JsonArray. And the converter is + * fed with JsonObject objects one by one. And at the end, an explicit EOF is + * expected so that the Normalizer can write the normalized data to downstream. + * + * The converter depends on Target Schema, which can share some common fields with + * the source (input) schema, and one (1) Normalized Field, which is the first + * field in the target (output) schema but not in the source schema. All common fields + * between source schema and target schema will have values copied from the very first + * record of a conversion series. + * + * At the end of a conversion series, the normalized records are added to output record + * as an JsonArray under the Normalized Field. The normalized records include all + * fields that are not in Target Schema. + * + * We can control the batch size of normalized record using ms.normalizer.max.records.per.batch. + * By default the batch size is 500. 
+ */ +public class JsonNormalizerConverter extends Converter { + final private Set outputFields = new HashSet<>(); + private JsonArray normalized = new JsonArray(); + private String normalizedField; + private JsonObject firstRecord; + private long maxRecordsPerBatch; + private JsonArray targetSchema; + + @Override + public Converter init(WorkUnitState workUnit) { + maxRecordsPerBatch = MultistageProperties.MSTAGE_NORMALIZER_BATCH_SIZE.getValidNonblankWithDefault(workUnit); + targetSchema = MultistageProperties.MSTAGE_TARGET_SCHEMA.getValidNonblankWithDefault(workUnit); + return this; + } + + @Override + public JsonArray convertSchema(JsonArray inputSchema, WorkUnitState workUnit) { + for (JsonElement element : targetSchema) { + String columnName = element.getAsJsonObject().get(KEY_WORD_COLUMN_NAME).getAsString(); + outputFields.add(columnName); + if (normalizedField == null && !schemaSearch(inputSchema, columnName)) { + normalizedField = columnName; + } + } + assert normalizedField != null; + JsonObject dataType = JsonUtils.get(KEY_WORD_COLUMN_NAME, + normalizedField, KEY_WORD_DATA_TYPE, targetSchema).getAsJsonObject(); + String trueType = JsonUtils.get(KEY_WORD_TYPE, dataType).getAsString(); + JsonElement values = JsonUtils.get(KEY_WORD_VALUES, dataType); + if (trueType.equalsIgnoreCase(KEY_WORD_RECORD) && values.isJsonNull()) { + values = new JsonArray(); + for (JsonElement element: inputSchema) { + String columnName = element.getAsJsonObject().get(KEY_WORD_COLUMN_NAME).getAsString(); + if (!schemaSearch(targetSchema, columnName)) { + values.getAsJsonArray().add(element); + } + } + dataType.add(KEY_WORD_VALUES, values); + } + return targetSchema; + } + + @Override + public Iterable convertRecord(JsonArray outputSchema, JsonObject inputRecord, WorkUnitState workUnit) { + if (inputRecord.has(KEY_WORD_EOF) && inputRecord.get(KEY_WORD_EOF).getAsString().equals(KEY_WORD_EOF)) { + // only output when there's at least one record + return outputIterable(1); + } + // note: the common fields among records will have the same value, so we only need to retain one record + if (firstRecord == null) { + firstRecord = inputRecord; + } + normalized.add(getNormalizingFields(inputRecord)); + return outputIterable(maxRecordsPerBatch); + } + + /** + * Output a single record iterable when the size of the normalized array has reached the threshold + * and empty iterable otherwise. 
+ * @param threshold the threshold to output + * @return iterable of JsonObject + */ + private Iterable outputIterable(long threshold) { + if (normalized.size() >= threshold) { + return new SingleRecordIterable<>(buildNormalizedRecord()); + } else { + return new EmptyIterable<>(); + } + } + + /** + * Distill those fields that are to be normalized + * @param record the input record + * @return the part of fields to be normalized + */ + private JsonObject getNormalizingFields(JsonObject record) { + JsonObject newRecord = new JsonObject(); + for (Map.Entry entry : record.entrySet()) { + if (!outputFields.contains(entry.getKey())) { + newRecord.add(entry.getKey(), entry.getValue()); + } + } + return newRecord; + } + + /** + * Build a final normalized record + * @return the normalized record + */ + private JsonObject buildNormalizedRecord() { + JsonObject newRecord = new JsonObject(); + for (Map.Entry entry : firstRecord.entrySet()) { + if (outputFields.contains(entry.getKey())) { + newRecord.add(entry.getKey(), entry.getValue()); + } + } + + String columnType = JsonUtils.get(KEY_WORD_COLUMN_NAME, + normalizedField, KEY_WORD_DATA_TYPE_TYPE, targetSchema).getAsString(); + if (columnType.equalsIgnoreCase(KEY_WORD_MAP) || columnType.equalsIgnoreCase(KEY_WORD_RECORD)) { + newRecord.add(normalizedField, normalized.get(0)); + } else { + newRecord.add(normalizedField, normalized); + } + // reset the buffer + normalized = new JsonArray(); + return newRecord; + } + + /** + * search if a given column is in the schema + * @param schema the schema in JasonArray format + * @param name the column name + * @return true if the column is in the schema, otherwise false + */ + private boolean schemaSearch(JsonArray schema, String name) { + for (JsonElement element : schema) { + if (!element.isJsonObject()) { + return false; + } + if (element.getAsJsonObject().get(KEY_WORD_COLUMN_NAME).getAsString().equals(name)) { + return true; + } + } + return false; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/exception/RetriableAuthenticationException.java b/dil/src/main/java/com/linkedin/dil/exception/RetriableAuthenticationException.java new file mode 100644 index 0000000..fb086ba --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/exception/RetriableAuthenticationException.java @@ -0,0 +1,14 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.exception; + +/** + * An {@link Exception} thrown when it can be retried + */ +public class RetriableAuthenticationException extends Exception { + public RetriableAuthenticationException(String message) { + super(message); + } +} \ No newline at end of file diff --git a/dil/src/main/java/com/linkedin/dil/extractor/AvroExtractor.java b/dil/src/main/java/com/linkedin/dil/extractor/AvroExtractor.java new file mode 100644 index 0000000..6dbe5d3 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/extractor/AvroExtractor.java @@ -0,0 +1,352 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
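To make the two normalizer converters above concrete, here is a small hypothetical conversion series; every field name and value is invented. Assume a target schema of two columns, id (shared with the source schema) and events (the normalized field):

    input records : {"id": 1, "action": "view"}, {"id": 1, "action": "click"}, then the EOF marker
    output record : {"id": 1, "events": [{"action": "view"}, {"action": "click"}]}

The id value is copied from the first record of the series, every field absent from the target schema is pushed into the events array, and a batch is cut whenever the configured batch size is reached or EOF is seen.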
+ +package com.linkedin.dil.extractor; + +import com.google.common.annotations.VisibleForTesting; +import com.google.gson.JsonArray; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import javax.annotation.Nullable; +import lombok.Getter; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileStream; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericRecord; +import org.apache.commons.lang3.StringUtils; +import org.apache.gobblin.configuration.WorkUnitState; +import org.apache.gobblin.converter.avro.UnsupportedDateTypeException; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.filter.AvroSchemaBasedFilter; +import com.linkedin.dil.keys.AvroExtractorKeys; +import com.linkedin.dil.keys.ExtractorKeys; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.util.AvroSchemaUtils; +import com.linkedin.dil.util.JsonIntermediateSchema; +import com.linkedin.dil.util.SchemaUtils; +import org.apache.gobblin.util.AvroUtils; +import org.testng.Assert; + +import static org.apache.avro.Schema.Type.*; + + +/** + * AvroExtractor reads Avro formatted files from HDFS locations. + * + * This extractor will output schema in Avro Schema format. + * + * The rows will be pass output to converters in the form of GenericRecord, which represent + * rows. + * + * This extractor can be used to feed into a AvroToJsonConvertor to get json data in the end. + * + * @author esong + */ +@Slf4j +public class AvroExtractor extends MultistageExtractor { + @Getter + private AvroExtractorKeys avroExtractorKeys = new AvroExtractorKeys(); + + public AvroExtractor(WorkUnitState state, JobKeys jobKeys) { + super(state, jobKeys); + super.initialize(avroExtractorKeys); + initialize(avroExtractorKeys); + } + + @Override + protected void initialize(ExtractorKeys keys) { + avroExtractorKeys.logUsage(state); + avroExtractorKeys.logDebugAll(state.getWorkunit()); + } + /** + * Utility function to do a double assignment + * @param avroExtractorKeys the extractor key + */ + @VisibleForTesting + protected void setAvroExtractorKeys(AvroExtractorKeys avroExtractorKeys) { + this.extractorKeys = avroExtractorKeys; + this.avroExtractorKeys = avroExtractorKeys; + } + + /** + * getSchema will be called by Gobblin to retrieve the schema of the output of this extract. + * The returned schema will be used in subsequent converters. The alternative schema here suites + * JsonIntermediateToAvroConverter. Future development can support other converter by making + * the schema conversion configurable. + * + * + * @return the schema of the extracted record set in AvroSchema + */ + @SneakyThrows + @Override + public Schema getSchema() { + Schema avroSchema; + log.debug("Retrieving schema definition"); + if (this.jobKeys.hasOutputSchema()) { + // take pre-defined fixed schema + JsonArray schemaArray = jobKeys.getOutputSchema(); + setRowFilter(schemaArray); + avroSchema = fromJsonSchema(schemaArray); + } else { + avroSchema = processInputStream(0) ? 
avroExtractorKeys.getAvroOutputSchema() + : fromJsonSchema(createMinimumSchema()); + } + Assert.assertNotNull(avroSchema); + return addDerivedFieldsToSchema(avroSchema); + } + + /** + * Initialize row filter + * @param schemaArray schema array + */ + @Override + protected void setRowFilter(JsonArray schemaArray) { + if (rowFilter == null) { + if (MultistageProperties.MSTAGE_ENABLE_SCHEMA_BASED_FILTERING.getValidNonblankWithDefault(state)) { + rowFilter = new AvroSchemaBasedFilter(new JsonIntermediateSchema(jobKeys.getOutputSchema()), + avroExtractorKeys, state); + } + } + } + + /** + * if pagination is not enabled, this method will iterate through the iterator and send records one + * by one, each row formatted as a GenericRecord. + * + * if pagination is enabled, the method will try to get a new set of data from the Source after + * the iterator is exhausted. + * + * @param reuse not used, just to match interface + * @return a row of avro data in GenericRecord format + */ + @Nullable + @Override + public GenericRecord readRecord(GenericRecord reuse) { + if (avroExtractorKeys.getAvroRecordIterator() == null + && !processInputStream(0)) { + return null; + } + + DataFileStream avroRecordIterator = avroExtractorKeys.getAvroRecordIterator(); + + if (hasNext()) { + avroExtractorKeys.incrProcessedCount(); + // update work unit status along the way, since we are using iterators + workUnitStatus.setPageStart(avroExtractorKeys.getProcessedCount()); + workUnitStatus.setPageNumber(avroExtractorKeys.getCurrentPageNumber()); + GenericRecord row = avroRecordIterator.next(); + AvroSchemaBasedFilter avroSchemaBasedFilter = (AvroSchemaBasedFilter) rowFilter; + if (avroSchemaBasedFilter != null) { + row = avroSchemaBasedFilter.filter(row); + } + return addDerivedFields(row); + } + + if (!this.eof && extractorKeys.getExplictEof()) { + eof = true; + return AvroSchemaUtils.createEOF(state); + } + return null; + } + + /** + * This is the main method in this extractor, it extracts data from source and perform essential checks. 
+ * + * @param starting [0, +INF), points to the last count of record processed, 0 means it's the first of a series of requests + * @return true if Successful + */ + @Override + protected boolean processInputStream(long starting) { + if (!super.processInputStream(starting)) { + return false; + } + + DataFileStream avroRecordIterator; + try { + avroRecordIterator = new DataFileStream<>(workUnitStatus.getBuffer(), + new GenericDatumReader<>()); + avroExtractorKeys.setAvroRecordIterator(avroRecordIterator); + // store the original schema for further processing + if (hasNext() && avroExtractorKeys.getAvroOutputSchema() == null) { + avroExtractorKeys.setAvroOutputSchema(avroRecordIterator.getSchema()); + } + if (jobKeys.hasOutputSchema()) { + List schemaColumns = new ArrayList<>(new JsonIntermediateSchema(jobKeys.getOutputSchema()) + .getColumns().keySet()); + List fieldNames = AvroSchemaUtils.getSchemaFieldNames(avroExtractorKeys.getAvroOutputSchema()); + avroExtractorKeys.setIsValidOutputSchema(SchemaUtils.isValidOutputSchema(schemaColumns, fieldNames)); + } + } catch (Exception e) { + log.error("Source Error: {}", e.getMessage()); + state.setWorkingState(WorkUnitState.WorkingState.FAILED); + return false; + } + + // return false to stop the job under these situations + if (workUnitStatus.getBuffer() == null + || avroExtractorKeys.getAvroRecordIterator() == null) { + return false; + } + avroExtractorKeys.incrCurrentPageNumber(); + + avroExtractorKeys.logDebugAll(state.getWorkunit()); + workUnitStatus.logDebugAll(); + extractorKeys.logDebugAll(state.getWorkunit()); + return hasNext(); + } + + /** + * If the iterator is null, then it must be the first request + * @param starting the starting position of the request + * @return true if the iterator is null, otherwise false + */ + @Override + protected boolean isFirst(long starting) { + return avroExtractorKeys.getAvroRecordIterator() == null; + } + + /** + * Helper function that indicates if there are any records left to read + * @return true if there are more records and false otherwise + */ + protected boolean hasNext() { + DataFileStream avroRecordIterator = avroExtractorKeys.getAvroRecordIterator(); + return avroRecordIterator != null && avroRecordIterator.hasNext(); + } + + /** + * Append the derived field definition to the output schema + * @param schema current schema + * @return modified schema + */ + private Schema addDerivedFieldsToSchema(Schema schema) { + Set>> derivedFields = jobKeys.getDerivedFields().entrySet(); + if (derivedFields.size() == 0) { + return schema; + } + // create the new schema with original fields and derived fields + Schema newSchema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), false); + List fields = AvroUtils.deepCopySchemaFields(schema); + for (Map.Entry> derivedField: derivedFields) { + String name = derivedField.getKey(); + String type = derivedField.getValue().get("type"); + switch (type) { + case "epoc": + fields.add(new Schema.Field(name, Schema.create(LONG), name, null)); + break; + case "string": + case "regexp": + fields.add(new Schema.Field(name, Schema.create(STRING), name, null)); + break; + case "boolean": + fields.add(new Schema.Field(name, Schema.create(BOOLEAN), name, null)); + break; + case "integer": + fields.add(new Schema.Field(name, Schema.create(INT), name, null)); + break; + case "number": + fields.add(new Schema.Field(name, Schema.create(DOUBLE), name, null)); + break; + default: + failWorkUnit("Unsupported type for derived fields: " + type); + break; + 
} + } + // create a new record with the new schema + newSchema.setFields(fields); + return newSchema; + } + + /** + * calculate and add derived fields, + * derivedFields map in this in structure {name1 : {type: type1, source: source1, format: format1}} + * @param row original record + * @return modified record + */ + private GenericRecord addDerivedFields(GenericRecord row) { + Set>> derivedFields = jobKeys.getDerivedFields().entrySet(); + int numDerivedFields = derivedFields.size(); + if (numDerivedFields == 0) { + return row; + } + Schema schema = row.getSchema(); + Schema newSchema = addDerivedFieldsToSchema(schema); + // Create the new record and copy over old fields + GenericRecord rowWithDerivedFields = new GenericData.Record(newSchema); + schema.getFields().forEach(field -> { + String fieldName = field.name(); + rowWithDerivedFields.put(fieldName, row.get(fieldName)); + }); + // process derived fields and add to the new record + for (Map.Entry> derivedField: derivedFields) { + String name = derivedField.getKey(); + Map derivedFieldDef = derivedField.getValue(); + String strValue = processDerivedFieldSource(row, derivedFieldDef); + String type = derivedField.getValue().get("type"); + switch (type) { + case "epoc": + if (strValue.length() > 0) { + rowWithDerivedFields.put(name, Long.parseLong(strValue)); + } + break; + case "string": + case "regexp": + rowWithDerivedFields.put(name, strValue); + break; + case "boolean": + rowWithDerivedFields.put(name, Boolean.parseBoolean(strValue)); + break; + case "integer": + rowWithDerivedFields.put(name, Integer.parseInt(strValue)); + break; + case "number": + rowWithDerivedFields.put(name, Double.parseDouble(strValue)); + break; + default: + failWorkUnit("Unsupported type for derived fields: " + type); + break; + } + + } + return rowWithDerivedFields; + } + + /** + * Process the derived field source to get intermediate value + * @param row current row being processed + * @param derivedFieldDef map {type: type1, source: source1, format: format1} + * @return String value of the derived field + */ + private String processDerivedFieldSource(GenericRecord row, Map derivedFieldDef) { + String source = derivedFieldDef.getOrDefault("source", StringUtils.EMPTY); + String inputValue = derivedFieldDef.getOrDefault("value", StringUtils.EMPTY); + boolean isInputValueFromSource = false; + + // get the base value from the source row if present + if (isInputValueFromSource(source)) { + Object ele = row.get(source); + if (ele != null) { + inputValue = ele.toString(); + isInputValueFromSource = true; + } + } + + return generateDerivedFieldValue(derivedFieldDef, inputValue, isInputValueFromSource); + } + + /** + * Utility method to convert JsonArray schema to avro schema + * @param schema of JsonArray type + * @return avro schema + * @throws UnsupportedDateTypeException + */ + private Schema fromJsonSchema(JsonArray schema) throws UnsupportedDateTypeException { + return AvroSchemaUtils.fromJsonSchema(schema, state); + } +} diff --git a/dil/src/main/java/com/linkedin/dil/extractor/CsvExtractor.java b/dil/src/main/java/com/linkedin/dil/extractor/CsvExtractor.java new file mode 100644 index 0000000..d78b4c1 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/extractor/CsvExtractor.java @@ -0,0 +1,514 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.extractor; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.primitives.Doubles; +import com.google.common.primitives.Floats; +import com.google.common.primitives.Ints; +import com.google.common.primitives.Longs; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.opencsv.CSVParser; +import com.opencsv.CSVParserBuilder; +import com.opencsv.CSVReader; +import com.opencsv.CSVReaderBuilder; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.Charset; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import javax.annotation.Nullable; +import lombok.Getter; +import lombok.extern.slf4j.Slf4j; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.filter.CsvSchemaBasedFilter; +import com.linkedin.dil.keys.CsvExtractorKeys; +import com.linkedin.dil.keys.ExtractorKeys; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.preprocessor.InputStreamProcessor; +import com.linkedin.dil.preprocessor.StreamProcessor; +import com.linkedin.dil.util.CsvUtils; +import com.linkedin.dil.util.JsonIntermediateSchema; +import com.linkedin.dil.util.SchemaBuilder; +import com.linkedin.dil.util.SchemaUtils; +import com.linkedin.dil.util.VariableUtils; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.joda.time.Period; +import org.testng.Assert; + + +/** + * CSV Extractor extracts CSV formatted data from an InputStream passed from a Source. + * + * The extractor accepts configurable preprocessors that transforms one InputStream to another + * InputStream. Those preprocessors include Gunzip and GPG decryption steps + * + * @author chrli, esong + */ +@Slf4j +public class CsvExtractor extends MultistageExtractor { + private final static Long SCHEMA_INFER_MAX_SAMPLE_SIZE = 100L; + @Getter + private CsvExtractorKeys csvExtractorKeys = new CsvExtractorKeys(); + + public CsvExtractor(WorkUnitState state, JobKeys jobKeys) { + super(state, jobKeys); + super.initialize(csvExtractorKeys); + initialize(csvExtractorKeys); + } + + @Override + protected void initialize(ExtractorKeys keys) { + csvExtractorKeys.logUsage(state); + csvExtractorKeys.setColumnHeader( + MultistageProperties.MSTAGE_CSV_COLUMN_HEADER.validateNonblank(state) ? 
MultistageProperties.MSTAGE_CSV_COLUMN_HEADER.getProp( + state) : false); + csvExtractorKeys.setRowsToSkip(MultistageProperties.MSTAGE_CSV_SKIP_LINES.getValidNonblankWithDefault(state)); + if (csvExtractorKeys.getColumnHeader() && csvExtractorKeys.getRowsToSkip() == 0) { + csvExtractorKeys.setRowsToSkip(1); + } + csvExtractorKeys.setSeparator( + CsvUtils.unescape(MultistageProperties.MSTAGE_CSV_SEPARATOR.getValidNonblankWithDefault(state))); + csvExtractorKeys.setQuoteCharacter( + CsvUtils.unescape(MultistageProperties.MSTAGE_CSV_QUOTE_CHARACTER.getValidNonblankWithDefault(state))); + csvExtractorKeys.setEscapeCharacter( + CsvUtils.unescape(MultistageProperties.MSTAGE_CSV_ESCAPE_CHARACTER.getValidNonblankWithDefault(state))); + csvExtractorKeys.setSampleRows(new ArrayDeque<>()); + + // check if user has defined the output schema + if (jobKeys.hasOutputSchema()) { + JsonArray outputSchema = jobKeys.getOutputSchema(); + csvExtractorKeys.setColumnProjection(expandColumnProjection(MultistageProperties.MSTAGE_CSV_COLUMN_PROJECTION + .getValidNonblankWithDefault(state), outputSchema.size())); + // initialize the column name to index map based on the schema when derived fields are present + if (jobKeys.getDerivedFields().entrySet().size() > 0) { + buildColumnToIndexMap(outputSchema); + } + } + csvExtractorKeys.logDebugAll(state.getWorkunit()); + } + + /** + * Utility function to do a double assignment + * @param csvExtractorKeys the extractor key + */ + @VisibleForTesting + protected void setCsvExtractorKeys(CsvExtractorKeys csvExtractorKeys) { + this.extractorKeys = csvExtractorKeys; + this.csvExtractorKeys = csvExtractorKeys; + } + + /** + * This method rely on the parent class to get a JsonArray formatted schema, and pass it out as + * a string. Typically we expect the downstream is a CsvToJsonConverter. + * + * @return schema that is structured as a JsonArray but formatted as a String + */ + @Override + public String getSchema() { + log.debug("Retrieving schema definition"); + JsonArray schemaArray = super.getOrInferSchema(); + Assert.assertNotNull(schemaArray); + if (jobKeys.getDerivedFields().size() > 0) { + schemaArray.addAll(addDerivedFieldsToAltSchema()); + } + return schemaArray.toString(); + } + + /** + * if pagination is not enabled, this method will iterate through the iterator and send records one + * by one, each row formatted as a String[]. + * + * if pagination is enabled, the method will try to get a new set of data from the Source after + * the iterator is exhausted. 
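 *
 * Note: rows buffered in the sample queue during schema inference are drained first, and only then is
 * the live CSV iterator read, so sampled rows are still returned to the caller.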
+ * + * @param reuse not used, just to match interface + * @return a row of CSV data in String[] format + */ + @Nullable + @Override + public String[] readRecord(String[] reuse) { + if (csvExtractorKeys.getCsvIterator() == null && !processInputStream(0)) { + return null; + } + + Iterator readerIterator = csvExtractorKeys.getCsvIterator(); + if (csvExtractorKeys.getSampleRows().size() > 0) { + csvExtractorKeys.incrProcessedCount(); + // update work unit status along the way, since we are using iterators + workUnitStatus.setPageStart(csvExtractorKeys.getProcessedCount()); + workUnitStatus.setPageNumber(csvExtractorKeys.getCurrentPageNumber()); + String[] row = csvExtractorKeys.getSampleRows().pollFirst(); + return addDerivedFields(row); + } else if (readerIterator.hasNext()) { + csvExtractorKeys.incrProcessedCount(); + // update work unit status along the way, since we are using iterators + workUnitStatus.setPageStart(csvExtractorKeys.getProcessedCount()); + workUnitStatus.setPageNumber(csvExtractorKeys.getCurrentPageNumber()); + // filtering is only required when schema is defined + String[] row = readerIterator.next(); + CsvSchemaBasedFilter csvSchemaBasedFilter = (CsvSchemaBasedFilter) rowFilter; + if (csvSchemaBasedFilter != null) { + row = csvSchemaBasedFilter.filter(row); + // when column projection is specified, the filter data should be the same size as the column projection + if (csvExtractorKeys.getColumnProjection().size() > 0 && row.length != csvExtractorKeys.getColumnProjection() + .size()) { + failWorkUnit("Some indicies in column projection are out of bound"); + } + } + return addDerivedFields(row); + } else { + connection.closeStream(); + if (hasNextPage() && processInputStream(csvExtractorKeys.getProcessedCount())) { + return readRecord(reuse); + } + } + return null; + } + + /** + * This is the main method in this extractor, it extracts data from source and perform essential checks. 
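 *
 * In outline (as implemented below): the response Content-Type is checked against the expected set
 * (text/csv or application/gzip unless overridden), preprocessors are applied to the raw InputStream,
 * a CSVReader is built with the configured separator, quote and escape characters, and on the first
 * call the schema may be inferred from sample rows when no output schema is defined.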
+ * + * @param starting the initial record count, indicating if it is the first of a series of requests + * @return true if Successful + */ + @Override + protected boolean processInputStream(long starting) { + if (!super.processInputStream(starting)) { + return false; + } + + // if Content-Type is provided, but not text/csv, the response can have + // useful error information + JsonObject expectedContentType = MultistageProperties.MSTAGE_HTTP_RESPONSE_TYPE.getValidNonblankWithDefault(state); + HashSet expectedContentTypeSet = new LinkedHashSet<>(Arrays.asList("text/csv", "application/gzip")); + if (expectedContentType.has(CONTENT_TYPE_KEY) || expectedContentType.has(CONTENT_TYPE_KEY.toLowerCase())) { + for (Map.Entry entry: expectedContentType.entrySet()) { + expectedContentTypeSet.add(entry.getValue().getAsString()); + } + } + + if (!checkContentType(workUnitStatus, expectedContentTypeSet)) { + return false; + } + + if (workUnitStatus.getBuffer() != null) { + try { + InputStream input = workUnitStatus.getBuffer(); + for (StreamProcessor transformer : extractorKeys.getPreprocessors()) { + if (transformer instanceof InputStreamProcessor) { + input = ((InputStreamProcessor) transformer).process(input); + } + } + + CSVParser parser = new CSVParserBuilder().withSeparator(csvExtractorKeys.getSeparator().charAt(0)) + .withQuoteChar(csvExtractorKeys.getQuoteCharacter().charAt(0)) + .withEscapeChar(csvExtractorKeys.getEscapeCharacter().charAt(0)) + .build(); + CSVReader reader = new CSVReaderBuilder(new InputStreamReader(input, Charset.forName( + MultistageProperties.MSTAGE_SOURCE_DATA_CHARACTER_SET.getValidNonblankWithDefault(state)))).withCSVParser(parser) + .build(); + Iterator readerIterator = reader.iterator(); + // convert some sample data to json to infer the schema + if (!jobKeys.hasOutputSchema() && starting == 0) { + // initialize a reader without skipping lines since header might be used + JsonArray inferredSchema = inferSchemaWithSample(readerIterator); + extractorKeys.setInferredSchema(inferredSchema); + // build the columnToIndexMap for derived fields based on the inferred schema + if (jobKeys.getDerivedFields().entrySet().size() != 0) { + buildColumnToIndexMap(inferredSchema); + } + } else { + skipRowAndSaveHeader(readerIterator); + } + csvExtractorKeys.setCsvIterator(readerIterator); + } catch (Exception e) { + log.error("Error reading the input stream: {}", e.getMessage()); + return false; + } + } + + // return false to stop the job under these situations + if (workUnitStatus.getBuffer() == null || csvExtractorKeys.getCsvIterator() == null) { + return false; + } + csvExtractorKeys.incrCurrentPageNumber(); + + csvExtractorKeys.logDebugAll(state.getWorkunit()); + workUnitStatus.logDebugAll(); + extractorKeys.logDebugAll(state.getWorkunit()); + + return hasNext(); + } + + /** + * Initialize row filter + * @param schemaArray schema array + */ + @Override + protected void setRowFilter(JsonArray schemaArray) { + if (rowFilter == null) { + if (MultistageProperties.MSTAGE_ENABLE_SCHEMA_BASED_FILTERING.getValidNonblankWithDefault(state)) { + rowFilter = new CsvSchemaBasedFilter(new JsonIntermediateSchema(schemaArray), csvExtractorKeys); + } + } + } + + /** + * Expand a column projection input string + * @param columnProjection columns to project + * @param numColumnsInPredefinedSchema number of columns + * @return a set of column indices + */ + private Set expandColumnProjection(String columnProjection, int numColumnsInPredefinedSchema) { + Set expandedColumnProjection = new HashSet<>(); + 
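    // Example (illustrative values): a projection string of "0,2,4-6" expands to the index set
    // {0, 2, 4, 5, 6}; ranges are inclusive on both ends, and the expanded set must match the size
    // of the predefined schema.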
if (columnProjection != null && columnProjection.length() > 0) { + for (String val : columnProjection.split(",")) { + if (val.matches("^(\\d+)-(\\d+)$")) { // range + int left = Integer.parseInt(val.split("-")[0]); + int right = Integer.parseInt(val.split("-")[1]); + if (left < 0 || right < 0 || left >= right) { + failWorkUnit(String.format("Invalid range in column projection input %s", val)); + break; + } else { + for (int i = left; i <= right; i++) { + expandedColumnProjection.add(i); + } + } + } else if (val.matches("^\\d+$")) { // single number + int col = Integer.parseInt(val); + if (col < 0) { + failWorkUnit(String.format("Invalid index in column projection input %s", val)); + break; + } else { + expandedColumnProjection.add(col); + } + } else { // unknown patterns + failWorkUnit(String.format("Invalid value in column projection input %s", val)); + break; + } + } + + if (expandedColumnProjection.size() != numColumnsInPredefinedSchema) { + failWorkUnit("The number of columns in column projection does not match the size of the predefined schema"); + } + } + return expandedColumnProjection; + } + + /** + * Helper function that builds the column name to index map + * @param schema the Avro-flavor schema + */ + private void buildColumnToIndexMap(JsonArray schema) { + Map columnToIndexMap = new HashMap<>(); + int index = 0; + for (JsonElement column : schema) { + String columnName = column.getAsJsonObject().get("columnName").getAsString(); + columnToIndexMap.put(columnName, index++); + } + csvExtractorKeys.setColumnToIndexMap(columnToIndexMap); + } + + /** + * Process the derived field source to get intermediate value + * @param row current row being processed + * @param derivedFieldDef map {type: type1, source: source1, format: format1} + * @return String value of the derived field + */ + private String processDerivedFieldSource(String[] row, Map derivedFieldDef) { + String source = derivedFieldDef.get("source"); + String strValue = ""; + DateTimeZone timeZone = DateTimeZone.forID(timezone.isEmpty() ? 
DEFAULT_TIMEZONE : timezone); + + // get the base value from various sources + if (source.equalsIgnoreCase("currentdate")) { + strValue = String.valueOf(DateTime.now().getMillis()); + } else if (source.matches("P\\d+D")) { + Period period = Period.parse(source); + strValue = + String.valueOf(DateTime.now().withZone(timeZone).minus(period).dayOfMonth().roundFloorCopy().getMillis()); + } else if (csvExtractorKeys.getColumnToIndexMap().containsKey(source)) { + int sourceIndex = csvExtractorKeys.getColumnToIndexMap().get(source); + strValue = row[sourceIndex]; + } else if (VariableUtils.PATTERN.matcher(source).matches()) { + strValue = replaceVariable(source); + } else { + failWorkUnit("Unsupported source for derived fields: " + source); + } + + // further processing required for specific types + String type = derivedFieldDef.get("type"); + if (type.equals("epoc") && !(source.equalsIgnoreCase(CURRENT_DATE) || source.matches(PXD)) + && derivedFieldDef.containsKey("format")) { + strValue = deriveEpoc(derivedFieldDef.get("format"), strValue); + } + return strValue; + } + + /** + * calculate and add derived fields, + * derivedFields map in this in structure {name1 : {type: type1, source: source1, format: format1}} + * @param row original record + * @return modified record + */ + private String[] addDerivedFields(String[] row) { + Set>> derivedFields = jobKeys.getDerivedFields().entrySet(); + int numDerivedFields = derivedFields.size(); + if (numDerivedFields == 0) { + return row; + } + // allocate a larger array to accommodate the derived fields + int originalLength = row.length; + row = Arrays.copyOf(row, originalLength + numDerivedFields); + + int index = originalLength; + for (Map.Entry> derivedField : derivedFields) { + Map derivedFieldDef = derivedField.getValue(); + String strValue = processDerivedFieldSource(row, derivedFieldDef); + String type = derivedFieldDef.get("type"); + if (SUPPORTED_DERIVED_FIELD_TYPES.contains(type)) { + row[index] = strValue; + } else { + failWorkUnit("Unsupported type for derived fields: " + type); + } + index++; // increment index so the next derived field is written to a new column + } + return row; + } + + /** + * Save the header row if present, and skip rows + * @param readerIterator iterator of input stream + */ + private void skipRowAndSaveHeader(Iterator readerIterator) { + int linesRead = 0; + while (readerIterator.hasNext() && linesRead < csvExtractorKeys.getRowsToSkip()) { + String[] line = getNextLineWithCleansing(readerIterator); + if (linesRead == 0 && csvExtractorKeys.getColumnHeader()) { + // if header is present, the first row will be used as header + csvExtractorKeys.setHeaderRow(line); + // check if header has all columns in schema + if (jobKeys.hasOutputSchema()) { + List schemaColumns = new ArrayList<>(new JsonIntermediateSchema(jobKeys.getOutputSchema()) + .getColumns().keySet()); + List headerRow = Arrays.asList(csvExtractorKeys.getHeaderRow()); + csvExtractorKeys.setIsValidOutputSchema(SchemaUtils.isValidOutputSchema(schemaColumns, headerRow)); + } + linesRead++; + } + } + } + + /** + * Perform limited cleansing so that data can be processed by converters + * + * @param input the input data to be cleansed + * @return the cleansed data + */ + private void limitedCleanse(String[] input) { + for (int i = 0; i < input.length; i++) { + input[i] = input[i].replaceAll("(\\s|\\$)", "_"); + } + } + + /** + * Read next row and cleanse the data if enabled + * @param readerIterator iterator of input stream + * @return next line + */ + private String[] 
getNextLineWithCleansing(Iterator readerIterator) { + String[] line = readerIterator.next(); + if (jobKeys.isEnableCleansing()) { + limitedCleanse(line); + } + return line; + } + + /** + * Infer schema based on sample data. Rows read while preparing the sample is saved in a queue to be read again later. + * @param readerIterator iterator of input stream + * @return inferred schema + */ + private JsonArray inferSchemaWithSample(Iterator readerIterator) { + skipRowAndSaveHeader(readerIterator); + String[] header = csvExtractorKeys.getHeaderRow(); + JsonArray sample = new JsonArray(); + int linesRead = 0; + // read record until iterator is empty or enough lines have been read for the sample + while (readerIterator.hasNext() && linesRead < SCHEMA_INFER_MAX_SAMPLE_SIZE) { + String[] line = readerIterator.next(); + // save the new line to the end of queue + csvExtractorKeys.getSampleRows().offerLast(line); + // add the current row data to the sample json + JsonObject row = new JsonObject(); + for (int i = 0; i < line.length; i++) { + // do not use headers as keys if the header row and data have different lengths + String key = header != null && header.length == line.length ? header[i] : "col" + i; + addParsedCSVData(key, line[i], row); + } + sample.add(row); + linesRead++; + } + return SchemaBuilder.fromJsonData(sample).buildAltSchema(jobKeys.getDefaultFieldTypes(), + jobKeys.isEnableCleansing(), + jobKeys.getSchemaCleansingPattern(), + jobKeys.getSchemaCleansingReplacement(), + jobKeys.getSchemaCleansingNullable()).getAsJsonArray(); + } + + /** + * Helper function for creating sample json data for schema inference + * Type conversion is required as all data will be parsed as string otherwise + * @param key name of the column + * @param data original data from a column + * @param row json form of the row + */ + private void addParsedCSVData(String key, String data, JsonObject row) { + if (Ints.tryParse(data) != null) { + row.addProperty(key, Integer.valueOf(data)); + } else if (Longs.tryParse(data) != null) { + row.addProperty(key, Long.valueOf(data)); + } else if (Doubles.tryParse(data) != null) { + row.addProperty(key, Double.valueOf(data)); + } else if (data.toLowerCase().matches("(true|false)")) { + row.addProperty(key, Boolean.valueOf(data)); + } else if (Floats.tryParse(data) != null) { + row.addProperty(key, Float.valueOf(data)); + } else { + row.addProperty(key, data); + } + } + + /** + * Helper function that indicates if there are any records left to read + * @return true if there are more records and false otherwise + */ + protected boolean hasNext() { + return csvExtractorKeys.getCsvIterator().hasNext() || csvExtractorKeys.getSampleRows().size() > 0; + } + + /** + * If the iterator is null, then it must be the first request + * @param starting the starting position of the request + * @return true if the iterator is null, otherwise false + */ + @Override + protected boolean isFirst(long starting) { + return csvExtractorKeys.getCsvIterator() == null; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/extractor/FileDumpExtractor.java b/dil/src/main/java/com/linkedin/dil/extractor/FileDumpExtractor.java new file mode 100644 index 0000000..d030b7d --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/extractor/FileDumpExtractor.java @@ -0,0 +1,248 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.extractor; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Joiner; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import com.google.gson.JsonObject; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.List; +import javax.annotation.Nullable; +import lombok.Getter; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.StringUtils; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.keys.ExtractorKeys; +import com.linkedin.dil.keys.FileDumpExtractorKeys; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.preprocessor.InputStreamProcessor; +import com.linkedin.dil.preprocessor.OutputStreamProcessor; +import com.linkedin.dil.preprocessor.StreamProcessor; +import com.linkedin.dil.util.ParameterTypes; +import com.linkedin.dil.util.VariableUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.permission.FsPermission; + + +/** + * FileDumpExtractor takes an InputStream, applies proper preprocessors, and saves the InputStream + * to a file. + */ +@Slf4j +public class FileDumpExtractor extends MultistageExtractor { + private final static int HADOOP_DEFAULT_FILE_LENGTH_LIMIT = 255; + @Getter + private FileDumpExtractorKeys fileDumpExtractorKeys = new FileDumpExtractorKeys(); + + public FileDumpExtractor(WorkUnitState state, JobKeys jobKeys) { + super(state, jobKeys); + super.initialize(fileDumpExtractorKeys); + initialize(fileDumpExtractorKeys); + } + + @Override + protected void initialize(ExtractorKeys keys) { + fileDumpExtractorKeys.logUsage(state); + // initialize FileDumpExtractor keys + // Extractors follow the pattern of initializing in constructor to avoid forgetting initialization + // in sub-classes + if (MultistageProperties.DATA_PUBLISHER_FINAL_DIR.validateNonblank(state)) { + fileDumpExtractorKeys.setFileDumpLocation(MultistageProperties.DATA_PUBLISHER_FINAL_DIR.getProp(state)); + } else { + throw new RuntimeException("data publisher final dir is empty or null"); + } + + // file permission is required, but a default value is given in MultistageProperties + fileDumpExtractorKeys.setFileWritePermissions( + MultistageProperties.MSTAGE_EXTRACTOR_TARGET_FILE_PERMISSION.getValidNonblankWithDefault(state)); + + // work unit file name is based on a template that is defined by ms.extractor.target.file.name + // and then substituted with activation parameters + // TODO to allow substitution of variables defined in ms.parameters + fileDumpExtractorKeys.setFileName(getFileName(state)); + + fileDumpExtractorKeys.logDebugAll(state.getWorkunit()); + } + + /** + * Utility function to do a double assignment + * @param fileDumpExtractorKeys the extractor key + */ + @VisibleForTesting + protected void setFileDumpExtractorKeys(FileDumpExtractorKeys fileDumpExtractorKeys) { + this.extractorKeys = fileDumpExtractorKeys; + this.fileDumpExtractorKeys = fileDumpExtractorKeys; + } + + /** + * This method rely on the parent class to get a JsonArray formatted schema, and pass it out as + * a string. Typically we expect the downstream is a CsvToJsonConverter. 
+ * + * @return schema that is structured as a JsonArray but formatted as a String + */ + @Override + public String getSchema() { + return super.getOrInferSchema().toString(); + } + + + /** + * TODO return 1 record of the file metadata like path, size, and timestamp, etc. + * For dumping files on hdfs we don't need to return a specific record but just save file on hdfs and return null. + */ + @Nullable + @Override + public String readRecord(String reuse) { + workUnitStatus.setPageStart(fileDumpExtractorKeys.getCurrentFileNumber() + * jobKeys.getPaginationInitValues().getOrDefault(ParameterTypes.PAGESIZE, 1L)); + workUnitStatus.setPageNumber(fileDumpExtractorKeys.getCurrentFileNumber() + 1); + workUnitStatus.setPageSize(jobKeys.getPaginationInitValues().getOrDefault(ParameterTypes.PAGESIZE, 1L)); + + if (processInputStream(this.fileDumpExtractorKeys.getCurrentFileNumber()) + && jobKeys.isPaginationEnabled()) { + this.fileDumpExtractorKeys.incrCurrentFileNumber(); + return readRecord(reuse); + } + return null; + } + + /** + * This is the main method in this extractor, it extracts data from source and perform essential checks. + * + * @param starting the initial record count, indicating if it is the first of a series of requests + * @return true if Successful + */ + @Override + protected boolean processInputStream(long starting) { + if (!super.processInputStream(starting)) { + return false; + } + + if (StringUtils.isBlank(fileDumpExtractorKeys.getFileName())) { + log.error("File name is empty so cannot dump onto the file system."); + this.state.setWorkingState(WorkUnitState.WorkingState.FAILED); + return false; + } + + if (workUnitStatus.getBuffer() == null) { + log.info("Received a NULL InputStream, end the work unit"); + return false; + } + + try { + + // apply preprocessors + InputStream input = workUnitStatus.getBuffer(); + for (StreamProcessor transformer : extractorKeys.getPreprocessors()) { + if (transformer instanceof InputStreamProcessor) { + input = ((InputStreamProcessor) transformer).process(input); + } + } + + String fileName = fileDumpExtractorKeys.getFileDumpLocation() + "/" + + fileDumpExtractorKeys.getFileName(); + if (jobKeys.isPaginationEnabled()) { + fileName += "_"; + fileName += this.fileDumpExtractorKeys.getCurrentFileNumber(); + } + writeToFileSystem(input, fileName); + } catch (Exception e) { + log.error("Error while extracting from source or writing to target", e); + this.state.setWorkingState(WorkUnitState.WorkingState.FAILED); + return false; + } + return true; + } + + /** + * write an input stream at the dump location. 
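 * If an OutputStreamProcessor (for example a GPG encryptor) is configured, the target file name is
 * first rewritten with the appropriate extension and the output stream is wrapped before bytes are copied.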
+ */ + private void writeToFileSystem(InputStream is, String dumplocation) { + Preconditions.checkNotNull(is, "InputStream"); + try { + FileSystem fs = FileSystem.get(new Configuration()); + FsPermission logPermission = new FsPermission(fileDumpExtractorKeys.getFileWritePermissions()); + + // handle file name extensions + String path = dumplocation; + for (StreamProcessor transformer : extractorKeys.getPreprocessors()) { + if (transformer instanceof OutputStreamProcessor) { + path = ((OutputStreamProcessor) transformer).convertFileName(path); + } + } + + // create output stream after renaming the file with proper extensions if needed + // if there is a output preprocessor, like GPG encryptor, specified + OutputStream os = FileSystem.create(fs, new Path(path), logPermission);; + for (StreamProcessor transformer : extractorKeys.getPreprocessors()) { + if (transformer instanceof OutputStreamProcessor) { + os = ((OutputStreamProcessor) transformer).process(os); + } + } + + byte[] buffer = new byte[8192]; + long totalBytes = 0; + int len = 0; + while ((len = is.read(buffer)) != -1) { + os.write(buffer, 0, len); + totalBytes += len; + } + is.close(); + os.flush(); + os.close(); + log.info("FileDumpExtractor: written {} bytes to file {}", totalBytes, dumplocation); + } catch (IOException e) { + throw new RuntimeException("Unable to dump file at specified location from FileDumpExtractor", e); + } + } + + /** + * TODO allow ms.extractor.target.file.name to use variables defined in ms.parameters + * TODO encode or remove restricted characters from file name + * Figure out what the file name should be based on the file name template and activation parameters + * @param state work unit state contains key configuration + * @return the file name + */ + private String getFileName(WorkUnitState state) { + String fileNameTemplate = MultistageProperties.MSTAGE_EXTRACTOR_TARGET_FILE_NAME.getValidNonblankWithDefault(state); + JsonObject activationParameters = extractorKeys.getActivationParameters(); + try { + String filePath = VariableUtils.replaceWithTracking(fileNameTemplate, activationParameters).getKey(); + List segments = Lists.newArrayList(filePath.split(Path.SEPARATOR)); + String fileName = segments.get(segments.size() - 1); + if (fileName.length() > HADOOP_DEFAULT_FILE_LENGTH_LIMIT) { + log.warn("File name is truncated to {} characters", HADOOP_DEFAULT_FILE_LENGTH_LIMIT); + fileName = fileName.substring(0, HADOOP_DEFAULT_FILE_LENGTH_LIMIT - 1); + } + segments.remove(segments.size() - 1); + segments.add(fileName); + return Joiner.on(Path.SEPARATOR_CHAR).join(segments); + } catch (Exception e) { + log.error("Error resolving placeholders in {}", MultistageProperties.MSTAGE_EXTRACTOR_TARGET_FILE_NAME.toString()); + log.error("The value \"{}\" will be used as is", fileNameTemplate); + return fileNameTemplate; + } + } + + /* + TODO : Support reading file from hdfs again to apply transformation with GZIPInputStream + private InputStream readFromFileSystem(String location) { + InputStream in = null; + try { + FileSystem fs = FileSystem.get(new Configuration()); + in = fs.open(new Path(location)); + } catch (Exception e) { + e.printStackTrace(); + } + return in; + } */ +} diff --git a/dil/src/main/java/com/linkedin/dil/extractor/JsonExtractor.java b/dil/src/main/java/com/linkedin/dil/extractor/JsonExtractor.java new file mode 100644 index 0000000..1e0c7b8 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/extractor/JsonExtractor.java @@ -0,0 +1,617 @@ +// Copyright 2021 LinkedIn Corporation. 
All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.extractor; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Splitter; +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; +import com.google.gson.JsonPrimitive; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.Charset; +import java.nio.charset.UnsupportedCharsetException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.Map; +import javax.annotation.Nullable; +import lombok.Getter; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang.StringUtils; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.filter.JsonSchemaBasedFilter; +import com.linkedin.dil.keys.ExtractorKeys; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.keys.JsonExtractorKeys; +import com.linkedin.dil.util.EncryptionUtils; +import com.linkedin.dil.util.JsonUtils; +import com.linkedin.dil.util.ParameterTypes; +import com.linkedin.dil.util.SchemaBuilder; +import org.testng.Assert; + + +/** + * JsonExtractor reads Json formatted responses from HTTP sources, like Rest API source. + * + * This extractor will output schema in JsonArray format, such as + * [{"columnName": "id", "type": "string"},{"columnName": "duration", "type": "integer"}] + * + * The rows will be pass output to converters in the form of JsonObjects, which represent + * rows. + * + * This extractor can used to feed into a JsonIntermediateToAvroConverter and sink data into Avro. + * + * @author chrli + */ +@Slf4j +public class JsonExtractor extends MultistageExtractor { + final private static JsonObject EOF = new Gson().fromJson("{\"EOF\": \"EOF\"}", JsonObject.class); + + private final static String JSON_MEMBER_SEPARATOR = "."; + private final static Long SCHEMA_INFER_MAX_SAMPLE_SIZE = 100L; + + @Getter + private JsonExtractorKeys jsonExtractorKeys = new JsonExtractorKeys(); + + public JsonExtractor(WorkUnitState state, JobKeys jobKeys) { + super(state, jobKeys); + super.initialize(jsonExtractorKeys); + initialize(jsonExtractorKeys); + } + + @Override + protected void initialize(ExtractorKeys keys) { + jsonExtractorKeys.logUsage(state); + jsonExtractorKeys.logDebugAll(state.getWorkunit()); + } + + /** + * Utility function to do a double assignment + * @param jsonExtractorKeys the extractor key + */ + @VisibleForTesting + protected void setJsonExtractorKeys(JsonExtractorKeys jsonExtractorKeys) { + this.extractorKeys = jsonExtractorKeys; + this.jsonExtractorKeys = jsonExtractorKeys; + } + + /** + * getSchema will be called by Gobblin to retrieve the schema of the output of this extract. + * The returned schema will be used in subsequent converters. The alternative schema here suites + * JsonIntermediateToAvroConverter. Future development can support other converter by making + * the schema conversion configurable. + * + * typically a json schema would be like following with nesting structures + *

+ * {
+ *   "type": "array",
+ *   "items": {
+ *     "id": {
+ *       "type": "string"
+ *     },
+ *     "emailAddress": {
+ *       "type": "string"
+ *     },
+ *     "emailAliases": {
+ *       "type": "array",
+ *       "items": {
+ *         "type": ["string", "null"]
+ *       }
+ *     },
+ *     "personalMeetingUrls": {
+ *       "type": "string",
+ *       "items": {
+ *         "type": "null"
+ *       }
+ *     },
+ *     "settings": {
+ *       "type": "object",
+ *       "properties": {
+ *         "webConferencesRecorded": {
+ *           "type": "boolean"
+ *         },
+ *         "preventWebConferenceRecording": {
+ *           "type": "boolean"
+ *         },
+ *         "preventEmailImport": {
+ *           "type": "boolean"
+ *         }
+ *       }
+ *     }
+ *   }
+ * }
+ *
+ * However an alternative or intermediate way of writing the schema
+ * {"emailAddress": {"type": "string"}}
+ * is:
+ * {"columnName": "emailAddress", "dataType": {"type": "string"}}
+ * + * @return the schema of the extracted record set in a JasonArray String + */ + @Override + public JsonArray getSchema() { + log.debug("Retrieving schema definition"); + JsonArray schemaArray = super.getOrInferSchema(); + Assert.assertNotNull(schemaArray); + if (jobKeys.getDerivedFields().size() > 0) { + schemaArray.addAll(addDerivedFieldsToAltSchema()); + } + return schemaArray; + } + + @Nullable + @Override + public JsonObject readRecord(JsonObject reuse) { + if (jsonExtractorKeys.getJsonElementIterator() == null && !processInputStream(0)) { + return null; + } + + if (jsonExtractorKeys.getJsonElementIterator().hasNext()) { + jsonExtractorKeys.setProcessedCount(1 + jsonExtractorKeys.getProcessedCount()); + JsonObject row = jsonExtractorKeys.getJsonElementIterator().next().getAsJsonObject(); + if (jobKeys.getEncryptionField() != null && jobKeys.getEncryptionField().size() > 0) { + row = encryptJsonFields("", row); + } + if (jobKeys.isEnableCleansing()) { + row = limitedCleanse(row).getAsJsonObject(); + } + JsonSchemaBasedFilter jsonSchemaBasedFilter = (JsonSchemaBasedFilter) rowFilter; + return addDerivedFields(jsonSchemaBasedFilter != null ? jsonSchemaBasedFilter.filter(row) : row); + } else { + connection.closeStream(); + if (hasNextPage() && processInputStream(jsonExtractorKeys.getProcessedCount())) { + return readRecord(reuse); + } + } + if (!this.eof && extractorKeys.getExplictEof()) { + eof = true; + return EOF; + } + return null; + } + + /** + * This is the main method in this extractor, it extracts data from source and perform essential checks. + * + * @param starting [0, +INF), points to the last count of record processed, 0 means it's the first of a series of requests + * @return true if Successful + */ + @Override + protected boolean processInputStream(long starting) { + if (!super.processInputStream(starting)) { + return false; + } + + // if Content-Type is provided, but not application/json, the response can have + // useful error information + JsonObject expectedContentType = MultistageProperties.MSTAGE_HTTP_RESPONSE_TYPE.getValidNonblankWithDefault(state); + HashSet expectedContentTypeSet = new LinkedHashSet<>(Arrays.asList("application/json")); + if (expectedContentType.has(CONTENT_TYPE_KEY)) { + for (Map.Entry entry: expectedContentType.entrySet()) { + expectedContentTypeSet.add(entry.getValue().getAsString()); + } + } + if (!checkContentType(workUnitStatus, expectedContentTypeSet)) { + return false; + } + + JsonElement data; + try { + data = extractJson(workUnitStatus.getBuffer()); + // return false to stop the job under these situations + if (data == null || data.isJsonNull() || data.isJsonPrimitive()) { + return false; + } + } catch (Exception e) { + log.error("Source Error: {}", e.getMessage()); + state.setWorkingState(WorkUnitState.WorkingState.FAILED); + return false; + } + + log.debug("Checking parsed Json object"); + + JsonArray coreData = new JsonArray(); + JsonElement payload; + if (StringUtils.isNotBlank(jobKeys.getDataField())) { + payload = JsonUtils.get(data.getAsJsonObject(), jobKeys.getDataField()); + if (payload.isJsonNull()) { + log.info("Terminate the ingestion because no actual payload in the response"); + return false; + } + } else { + payload = data; + } + + if (payload.isJsonArray()) { + coreData = payload.getAsJsonArray(); + } else { + log.info("Payload is not a Json Array, therefore add the whole payload a one single entry"); + coreData.add(payload); + } + + // get basic profile of the returned data + 
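      // (total record count, derived-field push downs, the session key value, and the current page number)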
jsonExtractorKeys.setTotalCount(getTotalCountValue(data)); + jsonExtractorKeys.setPushDowns(retrievePushDowns(data, jobKeys.getDerivedFields())); + extractorKeys.setSessionKeyValue(retrieveSessionKeyValue(data)); + jsonExtractorKeys.setCurrentPageNumber(jsonExtractorKeys.getCurrentPageNumber() + 1); + + // get profile of the payload + if (!jobKeys.hasOutputSchema() && starting == 0 && coreData.size() > 0) { + JsonArray sample = new JsonArray(); + for (int i = 0; i < Long.min(coreData.size(), SCHEMA_INFER_MAX_SAMPLE_SIZE); i++) { + sample.add(JsonUtils.deepCopy(coreData.get(i))); + } + extractorKeys.setInferredSchema(SchemaBuilder.fromJsonData(sample).buildAltSchema( + jobKeys.getDefaultFieldTypes(), + jobKeys.isEnableCleansing(), + jobKeys.getSchemaCleansingPattern(), + jobKeys.getSchemaCleansingReplacement(), + jobKeys.getSchemaCleansingNullable()).getAsJsonArray()); + } + + // update work unit status for next Source call + workUnitStatus.setSetCount(coreData.size()); + workUnitStatus.setTotalCount(jsonExtractorKeys.getTotalCount()); + workUnitStatus.setSessionKey(extractorKeys.getSessionKeyValue()); + updatePaginationStatus(data); + + jsonExtractorKeys.logDebugAll(state.getWorkunit()); + workUnitStatus.logDebugAll(); + extractorKeys.logDebugAll(state.getWorkunit()); + + jsonExtractorKeys.setJsonElementIterator(coreData.getAsJsonArray().iterator()); + return coreData.getAsJsonArray().size() > 0; + } + + /** + * Process the derived field source to get intermediate value + * @param row current row being processed + * @param name derived field's name + * @param derivedFieldDef map {type: type1, source: source1, format: format1} + * @return String value of the derived field + */ + private String processDerivedFieldSource(JsonObject row, String name, Map derivedFieldDef) { + String source = derivedFieldDef.getOrDefault("source", StringUtils.EMPTY); + String inputValue = derivedFieldDef.getOrDefault("value", StringUtils.EMPTY); + boolean isInputValueFromSource = false; + + // get the base value from the source row or push down if present + if (jsonExtractorKeys.getPushDowns().entrySet().size() > 0 && jsonExtractorKeys.getPushDowns().has(name)) { + inputValue = jsonExtractorKeys.getPushDowns().get(name).getAsString(); + isInputValueFromSource = true; + } else if (isInputValueFromSource(source)) { + JsonElement ele = JsonUtils.get(row, source); + if (ele != null && !ele.isJsonNull()) { + inputValue = ele.getAsString(); + isInputValueFromSource = true; + } + } + + return generateDerivedFieldValue(derivedFieldDef, inputValue, isInputValueFromSource); + } + + /** + * calculate and add derived fields + * derivedFields map in this in structure {name1 : {type: type1, source: source1, format: format1}} + * @param row original record + * @return modified record + */ + private JsonObject addDerivedFields(JsonObject row) { + for (Map.Entry> derivedField : jobKeys.getDerivedFields().entrySet()) { + String name = derivedField.getKey(); + Map derivedFieldDef = derivedField.getValue(); + String strValue = processDerivedFieldSource(row, name, derivedFieldDef); + String type = derivedField.getValue().get("type"); + switch (type) { + case "epoc": + if (strValue.length() > 0) { + row.addProperty(name, Long.parseLong(strValue)); + } + break; + case "string": + case "regexp": + row.addProperty(name, strValue); + break; + case "boolean": + row.addProperty(name, Boolean.parseBoolean(strValue)); + break; + case "integer": + row.addProperty(name, Integer.parseInt(strValue)); + break; + case "number": + 
row.addProperty(name, Double.parseDouble(strValue)); + break; + default: + failWorkUnit("Unsupported type for derived fields: " + type); + break; + } + } + return row; + } + + /** + * Update pagination parameters + * @param data response from the source, can be JsonArray or JsonObject + */ + private Map getNextPaginationValues(JsonElement data) { + Map paginationKeys = jobKeys.getPaginationFields(); + Map paginationValues = new HashMap<>(); + + if (data.isJsonObject()) { + JsonElement pageStartElement = null; + JsonElement pageSizeElement = null; + JsonElement pageNumberElement = null; + + if (paginationKeys.containsKey(ParameterTypes.PAGESTART)) { + pageStartElement = JsonUtils.get(data.getAsJsonObject(), paginationKeys.get(ParameterTypes.PAGESTART)); + } else { + // update page start directly to rows processed as Next page start + paginationValues.put(ParameterTypes.PAGESTART, + jsonExtractorKeys.getProcessedCount() + workUnitStatus.getSetCount()); + } + + if (paginationKeys.containsKey(ParameterTypes.PAGESIZE)) { + pageSizeElement = JsonUtils.get(data.getAsJsonObject(), paginationKeys.get(ParameterTypes.PAGESIZE)); + } else { + paginationValues.put(ParameterTypes.PAGESIZE, + jobKeys.getPaginationInitValues().getOrDefault(ParameterTypes.PAGESIZE, 0L)); + } + + if (paginationKeys.containsKey(ParameterTypes.PAGENO)) { + pageNumberElement = JsonUtils.get(data.getAsJsonObject(), paginationKeys.get(ParameterTypes.PAGENO)); + } else { + paginationValues.put(ParameterTypes.PAGENO, jsonExtractorKeys.getCurrentPageNumber()); + } + + if (pageStartElement != null && pageSizeElement != null && !pageStartElement.isJsonNull() + && !pageSizeElement.isJsonNull()) { + paginationValues.put(ParameterTypes.PAGESTART, pageStartElement.getAsLong() + pageSizeElement.getAsLong()); + paginationValues.put(ParameterTypes.PAGESIZE, pageSizeElement.getAsLong()); + } + if (pageNumberElement != null && !pageNumberElement.isJsonNull()) { + paginationValues.put(ParameterTypes.PAGENO, pageNumberElement.getAsLong() + 1); + } + } else if (data.isJsonArray()) { + paginationValues.put(ParameterTypes.PAGESTART, + jsonExtractorKeys.getProcessedCount() + data.getAsJsonArray().size()); + paginationValues.put(ParameterTypes.PAGESIZE, + jobKeys.getPaginationInitValues().getOrDefault(ParameterTypes.PAGESIZE, 0L)); + paginationValues.put(ParameterTypes.PAGENO, jsonExtractorKeys.getCurrentPageNumber()); + } + return paginationValues; + } + + /** + * retrieveSessionKey() parses the response JSON and extract the session key value + * + * @param input the Json payload + * @return the session key if the property is available + */ + private String retrieveSessionKeyValue(JsonElement input) { + if (jobKeys.getSessionKeyField().entrySet().size() == 0 || !input.isJsonObject()) { + return StringUtils.EMPTY; + } + + JsonObject data = input.getAsJsonObject(); + + Iterator members = Splitter.on(JSON_MEMBER_SEPARATOR) + .omitEmptyStrings() + .trimResults() + .split(jobKeys.getSessionKeyField().get("name").getAsString()) + .iterator(); + + JsonElement e = data; + while (members.hasNext()) { + String member = members.next(); + if (e.getAsJsonObject().has(member)) { + e = e.getAsJsonObject().get(member); + if (!members.hasNext()) { + return e.getAsString(); + } + } + } + return extractorKeys.getSessionKeyValue() == null ? StringUtils.EMPTY : extractorKeys.getSessionKeyValue(); + } + + /** + * + * Retrieves the total row count member if it is expected. 
Without a total row count, + * this request is considered completed after the first call or when the session + * completion criteria is met, see {@link JsonExtractor#readRecord(JsonObject)} ()} + * + * @param data HTTP response JSON + * @return the expected total record count if the property is available + */ + private Long getTotalCountValue(JsonElement data) { + if (StringUtils.isBlank(jobKeys.getTotalCountField())) { + if (data.isJsonObject()) { + if (StringUtils.isNotBlank(jobKeys.getDataField())) { + JsonElement payload = JsonUtils.get(data.getAsJsonObject(), jobKeys.getDataField()); + if (payload.isJsonNull()) { + log.info("Expected payload at JsonPath={} doesn't exist", jobKeys.getDataField()); + return jsonExtractorKeys.getTotalCount(); + } else if (payload.isJsonArray()) { + return jsonExtractorKeys.getTotalCount() + payload.getAsJsonArray().size(); + } else { + throw new RuntimeException("Payload is not a JsonArray, only array payload is supported"); + } + } else { + // no total count field and no data field + return jsonExtractorKeys.getTotalCount() + 1; + } + } else if (data.isJsonArray()) { + return jsonExtractorKeys.getTotalCount() + data.getAsJsonArray().size(); + } else { + return jsonExtractorKeys.getTotalCount(); + } + } + + Iterator members = Splitter.on(JSON_MEMBER_SEPARATOR) + .omitEmptyStrings() + .trimResults() + .split(jobKeys.getTotalCountField()) + .iterator(); + + JsonElement e = data; + while (members.hasNext()) { + String member = members.next(); + if (e.getAsJsonObject().has(member)) { + e = e.getAsJsonObject().get(member); + if (!members.hasNext()) { + return e.getAsLong(); + } + } + } + return jsonExtractorKeys.getTotalCount(); + } + + private void updatePaginationStatus(JsonElement data) { + // update work unit status, and get ready for next calls, these steps are possible only + // when data is a JsonObject + Map pagination = getNextPaginationValues(data); + workUnitStatus.setPageStart(pagination.getOrDefault(ParameterTypes.PAGESTART, 0L)); + workUnitStatus.setPageSize(pagination.getOrDefault(ParameterTypes.PAGESIZE, 0L)); + workUnitStatus.setPageNumber(pagination.getOrDefault(ParameterTypes.PAGENO, 0L)); + } + + /** + * Perform limited cleansing so that data can be processed by converters + * + * TODO: make a dummy value for Null values + * @param input the input data to be cleansed + * @return the cleansed data + */ + private JsonElement limitedCleanse(JsonElement input) { + JsonElement output; + + if (input.isJsonObject()) { + output = new JsonObject(); + for (Map.Entry entry : input.getAsJsonObject().entrySet()) { + ((JsonObject) output).add(entry.getKey().replaceAll(jobKeys.getSchemaCleansingPattern(), + jobKeys.getSchemaCleansingReplacement()), limitedCleanse(entry.getValue())); + } + } else if (input.isJsonArray()) { + output = new JsonArray(); + for (JsonElement ele : input.getAsJsonArray()) { + ((JsonArray) output).add(limitedCleanse(ele)); + } + } else { + output = input; + } + return output; + } + + /** + * Function which iterates through the fields in a row and encrypts the particular field defined in the + * ms.encrypted.field property. + * @param input parentKey, JsonElement + * parentKey -> holds the key name in case of nested structures + * e.g. 
settings.webprocessor (parentkey = settings) + * @return row with the field encrypted through the Gobblin Utility + */ + private JsonObject encryptJsonFields(String parentKey, JsonElement input) { + JsonObject output = new JsonObject(); + JsonArray encryptionFields = jobKeys.getEncryptionField(); + for (Map.Entry entry : input.getAsJsonObject().entrySet()) { + JsonElement value = entry.getValue(); + String key = entry.getKey(); + // absolutekey holds the complete path of the key for matching with the encryptedfield + String absoluteKey = (parentKey.length() == 0) ? key : (parentKey + "." + key); + // this function assumes that the final value to be encrypted will always be a JsonPrimitive object and in case of + // of JsonObject it will iterate recursively. + if (value.isJsonPrimitive() && encryptionFields.contains(new JsonPrimitive(absoluteKey))) { + String valStr = EncryptionUtils.encryptGobblin(value.isJsonNull() ? "" : value.getAsString(), state); + output.add(key, new JsonPrimitive(valStr)); + } else if (value.isJsonObject()) { + output.add(key, encryptJsonFields(absoluteKey, value)); + } else { + output.add(key, value); + } + } + return output; + } + + /** + * Save values that are not in the "data" payload, but will be used in de-normalization. + * Values are saved by their derived field name. + * + * TODO: push down non-string values (low priority) + * + * @param response the Json response from source + * @param derivedFields list of derived fields + * @return list of values to be used in derived fields + */ + private JsonObject retrievePushDowns(JsonElement response, Map> derivedFields) { + if (response == null || response.isJsonNull() || response.isJsonArray()) { + return new JsonObject(); + } + JsonObject data = response.getAsJsonObject(); + JsonObject pushDowns = new JsonObject(); + for (Map.Entry> entry : derivedFields.entrySet()) { + String source = entry.getValue().get("source"); + if (data.has(source)) { + pushDowns.addProperty(entry.getKey(), data.get(source).getAsString()); + log.info("Identified push down value: {}", pushDowns); + } + } + return pushDowns; + } + + /** + * Convert the input stream buffer to a Json object + * @param input the InputStream buffer + * @return a Json object of type JsonElement + */ + private JsonElement extractJson(InputStream input) throws UnsupportedCharsetException { + log.debug("Parsing response InputStream as Json"); + JsonElement data = null; + if (input != null) { + data = new JsonParser().parse(new InputStreamReader(input, + Charset.forName(MultistageProperties.MSTAGE_SOURCE_DATA_CHARACTER_SET.getValidNonblankWithDefault(state)))); + connection.closeStream(); + } + return data; + } + + /** + * Terminate the extraction if: + * 1. total count has been initialized + * 2. 
all expected rows are fetched + * + * @param starting the current position, or starting position of next request + * @return true if all rows retrieve + */ + @Override + protected boolean isWorkUnitCompleted(long starting) { + return super.isWorkUnitCompleted(starting) || starting != 0 && StringUtils.isNotBlank(jobKeys.getTotalCountField()) + && starting >= jsonExtractorKeys.getTotalCount(); + } + + /** + * If the iterator is null, then it must be the first request + * @param starting the starting position of the request + * @return true if the iterator is null, otherwise false + */ + @Override + protected boolean isFirst(long starting) { + return jsonExtractorKeys.getJsonElementIterator() == null; + } + + /** + * Add condition to allow total row count can be used to control pagination. + * + * @return true if a new page request is needed + */ + @Override + protected boolean hasNextPage() { + return super.hasNextPage() || jsonExtractorKeys.getProcessedCount() < jsonExtractorKeys.getTotalCount(); + } +} diff --git a/dil/src/main/java/com/linkedin/dil/extractor/MultistageExtractor.java b/dil/src/main/java/com/linkedin/dil/extractor/MultistageExtractor.java new file mode 100644 index 0000000..bb9defc --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/extractor/MultistageExtractor.java @@ -0,0 +1,876 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.extractor; + +import com.google.common.collect.ImmutableList; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import javax.annotation.Nullable; +import lombok.AccessLevel; +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.commons.validator.routines.LongValidator; +import org.apache.gobblin.configuration.ConfigurationKeys; +import org.apache.gobblin.configuration.State; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.connection.MultistageConnection; +import com.linkedin.dil.exception.RetriableAuthenticationException; +import com.linkedin.dil.filter.JsonSchemaBasedFilter; +import com.linkedin.dil.filter.MultistageSchemaBasedFilter; +import com.linkedin.dil.keys.ExtractorKeys; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.preprocessor.StreamProcessor; +import com.linkedin.dil.util.DateTimeUtils; +import com.linkedin.dil.util.EncryptionUtils; +import com.linkedin.dil.util.HdfsReader; +import com.linkedin.dil.util.InputStreamUtils; +import com.linkedin.dil.util.JsonIntermediateSchema; +import com.linkedin.dil.util.JsonParameter; +import com.linkedin.dil.util.JsonUtils; +import com.linkedin.dil.util.ParameterTypes; +import com.linkedin.dil.util.SchemaBuilder; +import com.linkedin.dil.util.VariableUtils; +import com.linkedin.dil.util.WorkUnitStatus; +import org.apache.gobblin.source.extractor.Extractor; +import org.apache.gobblin.source.extractor.extract.LongWatermark; +import org.joda.time.DateTime; +import 
org.joda.time.DateTimeZone; +import org.joda.time.Period; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; +import org.testng.Assert; + +import static com.linkedin.dil.configuration.StaticConstants.*; + + +/** + * MulistageExtractor is the base class of other format specific Extractors. + * + * The base class only defines function to deal with work units and activation parameters + * + * @author chrli + * @param The schema class + * @param The data class + */ +@Slf4j +public class MultistageExtractor implements Extractor { + protected final static String CURRENT_DATE = "currentdate"; + protected final static String PXD = "P\\d+D"; + protected final static String CONTENT_TYPE_KEY = "Content-Type"; + protected final static List SUPPORTED_DERIVED_FIELD_TYPES = + Arrays.asList("epoc", "string", "integer", "number"); + protected static final String COMMA_STR = ","; + protected final static String DEFAULT_TIMEZONE = "America/Los_Angeles"; + + @Setter + protected String timezone = ""; + @Getter(AccessLevel.PUBLIC) + @Setter(AccessLevel.PACKAGE) + protected WorkUnitStatus workUnitStatus = WorkUnitStatus.builder().build(); + + @Getter(AccessLevel.PUBLIC) + protected WorkUnitState state = null; + protected MultistageSchemaBasedFilter rowFilter = null; + protected Boolean eof = false; + // subclass might override this to decide whether to do record + // level pagination + protected Iterator payloadIterator = null; + @Getter + ExtractorKeys extractorKeys = new ExtractorKeys(); + @Getter + JsonObject currentParameters = null; + @Getter + @Setter + MultistageConnection connection = null; + @Getter + @Setter + JobKeys jobKeys; + + public MultistageExtractor(WorkUnitState state, JobKeys jobKeys) { + this.state = state; + this.jobKeys = jobKeys; + } + + protected void initialize(ExtractorKeys keys) { + extractorKeys = keys; + extractorKeys.setActivationParameters(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.getValidNonblankWithDefault(state)); + extractorKeys.setDelayStartTime(MultistageProperties.MSTAGE_WORKUNIT_STARTTIME_KEY.getProp(state)); + extractorKeys.setExplictEof(MultistageProperties.MSTAGE_DATA_EXPLICIT_EOF.getValidNonblankWithDefault(state)); + extractorKeys.setSignature(MultistageProperties.DATASET_URN_KEY.getProp(state)); + extractorKeys.setPreprocessors(getPreprocessors(state)); + extractorKeys.setPayloads(getPayloads(state)); + payloadIterator = extractorKeys.getPayloads().iterator(); + extractorKeys.logDebugAll(state.getWorkunit()); + } + + @Override + public S getSchema() { + return null; + } + + @Override + public long getExpectedRecordCount() { + return 0; + } + + @Override + public long getHighWatermark() { + return 0; + } + + @Nullable + @Override + public D readRecord(D reuse) { + return null; + } + + @Override + public void close() { + if (state.getWorkingState().equals(WorkUnitState.WorkingState.SUCCESSFUL)) { + state.setActualHighWatermark(state.getWorkunit().getExpectedHighWatermark(LongWatermark.class)); + } + if (connection != null) { + connection.closeAll(StringUtils.EMPTY); + } + } + + /** + * Core data extract function that calls the Source to obtain an InputStream and then + * decode the InputStream to records. 
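[Editor's note, not part of the diff] A minimal sketch of how a format-specific subclass might drive processInputStream() from its readRecord() loop; the helper names and the processedCount counter below are assumptions for illustration only, not defined in this change:

    // illustrative sketch -- helper names are assumed
    @Override
    public D readRecord(D reuse) {
      if (!hasMoreInCurrentBuffer() && hasNextPage()) {            // assumed helper
        if (!processInputStream(processedCount)) {                 // records extracted so far
          return null;                                             // nothing more to pull
        }
      }
      return parseNextRecord(workUnitStatus.getBuffer());          // assumed helper
    }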
+ * + * @param starting the starting position of this extract, which mostly means the actual records + * that have been extracted previously + * @return false if no more data to be pulled or a significant error that requires early job termination + */ + protected boolean processInputStream(long starting) { + holdExecutionUnitPresetStartTime(); + + if (isWorkUnitCompleted(starting)) { + return false; + } + + currentParameters = isFirst(starting) ? getInitialWorkUnitParameters() : getCurrentWorkUnitParameters(); + extractorKeys.setDynamicParameters(currentParameters); + + WorkUnitStatus updatedStatus = null; + long retries = Math.max(jobKeys.getRetryCount(), 1); + while (retries > 0) { + try { + updatedStatus = connection == null ? null : isFirst(starting) ? connection.executeFirst(this.workUnitStatus) + : connection.executeNext(this.workUnitStatus); + retries = 0; + } catch (RetriableAuthenticationException e) { + // TODO update sourceKeys + retries--; + } + } + + if (updatedStatus == null) { + this.failWorkUnit("Received a NULL WorkUnitStatus, fail the work unit"); + return false; + } + // update work unit status + workUnitStatus.setBuffer(updatedStatus.getBuffer()); + workUnitStatus.setMessages(updatedStatus.getMessages()); + workUnitStatus.setSessionKey(getSessionKey(updatedStatus)); + + // update extractor key + extractorKeys.setSessionKeyValue(workUnitStatus.getSessionKey()); + + // read source schema from the message if available + if (jobKeys != null && !jobKeys.hasSourceSchema() && !jobKeys.hasOutputSchema() && workUnitStatus.getMessages() + .containsKey("schema")) { + jobKeys.setSourceSchema(workUnitStatus.getSchema()); + } + return true; + } + + /** + * Initialize row filter; by default, json schema based filter is used + * @param schemaArray schema array + */ + protected void setRowFilter(JsonArray schemaArray) { + if (rowFilter == null) { + if (MultistageProperties.MSTAGE_ENABLE_SCHEMA_BASED_FILTERING.getValidNonblankWithDefault(state)) { + rowFilter = new JsonSchemaBasedFilter(new JsonIntermediateSchema(schemaArray)); + } + } + } + + /** + * returns the schema definition from properties, or, if no definition is present, returns the inferred schema + * + * @return the schema definition in a JsonArray + */ + JsonArray getOrInferSchema() { + if (!this.jobKeys.hasOutputSchema()) { + if (!processInputStream(0)) { + return createMinimumSchema(); + } + } + + JsonArray schemaArray = new JsonArray(); + if (this.jobKeys.hasOutputSchema()) { + // take pre-defined fixed schema + schemaArray = jobKeys.getOutputSchema(); + setRowFilter(schemaArray); + } else { + if (this.jobKeys.hasSourceSchema()) { + schemaArray = this.jobKeys.getSourceSchema(); + schemaArray = JsonUtils.deepCopy(schemaArray).getAsJsonArray(); + log.info("Source provided schema: {}", schemaArray.toString()); + } else if (extractorKeys.getInferredSchema() != null) { + schemaArray = JsonUtils.deepCopy(extractorKeys.getInferredSchema()).getAsJsonArray(); + log.info("Inferred schema: {}", schemaArray.toString()); + } + } + + return schemaArray; + } + + /** + * Get the work unit watermarks from the work unit state + * + * the return value will have a format like + * + * {"low": 123, "high": 456} + * + * @return the specified low and expected high watermark in a JsonObject format + */ + public JsonObject getWorkUnitWaterMarks() { + Long lowWatermark = state.getWorkunit().getLowWatermark(LongWatermark.class).getValue(); + Long highWatermark = state.getWorkunit().getExpectedHighWatermark(LongWatermark.class).getValue(); + 
JsonObject watermark = new JsonObject(); + watermark.addProperty("low", lowWatermark); + watermark.addProperty("high", highWatermark); + return watermark; + } + + /** + * a utility method to wait for its turn when multiple work units were started at the same time + */ + protected void holdExecutionUnitPresetStartTime() { + if (extractorKeys.getDelayStartTime() != 0) { + while (DateTime.now().getMillis() < extractorKeys.getDelayStartTime()) { + try { + Thread.sleep(100L); + } catch (Exception e) { + log.warn(e.getMessage()); + } + } + } + } + + /** + * read preprocessor configuration and break it into an array of strings, and then + * dynamically load each class and instantiate preprocessors. + * + * @param state the work unit state + * @return a list of preprocessors + */ + List> getPreprocessors(State state) { + ImmutableList.Builder> builder = ImmutableList.builder(); + JsonObject preprocessorsParams = + MultistageProperties.MSTAGE_EXTRACT_PREPROCESSORS_PARAMETERS.getValidNonblankWithDefault(state); + String preprocessors = MultistageProperties.MSTAGE_EXTRACT_PREPROCESSORS.getValidNonblankWithDefault(state); + JsonObject preprocessorParams; + for (String preprocessor : preprocessors.split(COMMA_STR)) { + String p = preprocessor.trim(); + if (!p.isEmpty()) { + try { + preprocessorParams = new JsonObject(); + if (preprocessorsParams.has(p)) { + // Get the parameters for the given processor class + preprocessorParams = preprocessorsParams.getAsJsonObject(p); + + // backward compatibility, by default create a decryption preprocessor + if (p.contains("GpgProcessor")) { + if (preprocessorParams.has("action") && preprocessorParams.get("action") + .getAsString() + .equalsIgnoreCase("encrypt")) { + p = p.replaceAll("GpgProcessor", "GpgEncryptProcessor"); + } else { + p = p.replaceAll("GpgProcessor", "GpgDecryptProcessor"); + } + } + + // Decrypt if any credential is encrypted + for (Map.Entry entry : preprocessorParams.entrySet()) { + String key = entry.getKey(); + String value = preprocessorParams.get(key).getAsString(); + String decryptedValue = EncryptionUtils.decryptGobblin(value, state); + preprocessorParams.addProperty(key, decryptedValue); + } + } + Class clazz = Class.forName(p); + StreamProcessor instance = + (StreamProcessor) clazz.getConstructor(JsonObject.class).newInstance(preprocessorParams); + builder.add(instance); + } catch (Exception e) { + log.error("Error creating preprocessor: {}, Exception: {}", p, e.getMessage()); + } + } + } + return builder.build(); + } + + /** + * set work unit state to fail and log an error message as failure reason + * @param error failure reason + */ + protected void failWorkUnit(String error) { + if (!StringUtils.isEmpty(error)) { + log.error(error); + } + this.state.setWorkingState(WorkUnitState.WorkingState.FAILED); + } + + /** + * read the source and derive epoc from an existing field + * @param format specified format of datetime string + * @param strValue pre-fetched value from the data source + * @return the epoc string: empty if failed to format strValue in the specified way + */ + protected String deriveEpoc(String format, String strValue) { + String epoc = ""; + // the source value must be a datetime string in the specified format + try { + DateTimeFormatter datetimeFormatter = DateTimeFormat.forPattern(format); + DateTime dateTime = datetimeFormatter.parseDateTime( + strValue.length() > format.length() ? 
strValue.substring(0, format.length()) : strValue); + epoc = String.valueOf(dateTime.getMillis()); + } catch (IllegalArgumentException e) { + try { + epoc = String.valueOf(DateTimeUtils.parse(strValue).getMillis()); + } catch (Exception e1) { + failWorkUnit(e1.getMessage() + e.getMessage()); + } + } + return epoc; + } + + /*** + * Append the derived field definition to the output schema + * + * @return output schema with the added derived field + */ + protected JsonArray addDerivedFieldsToAltSchema() { + JsonArray columns = new JsonArray(); + for (Map.Entry> entry : jobKeys.getDerivedFields().entrySet()) { + JsonObject column = new JsonObject(); + column.addProperty("columnName", entry.getKey()); + JsonObject dataType = new JsonObject(); + switch (entry.getValue().get(KEY_WORD_TYPE)) { + case "epoc": + dataType.addProperty(KEY_WORD_TYPE, "long"); + break; + case KEY_WORD_STRING: + case KEY_WORD_INTEGER: + case KEY_WORD_NUMBER: + case KEY_WORD_BOOLEAN: + dataType.addProperty(KEY_WORD_TYPE, entry.getValue().get(KEY_WORD_TYPE)); + break; + case "regexp": + dataType.addProperty(KEY_WORD_TYPE, KEY_WORD_STRING); + break; + default: + // by default take the source types + JsonElement source = JsonUtils.get(entry.getValue().get(KEY_WORD_SOURCE), jobKeys.getOutputSchema()); + dataType.addProperty(KEY_WORD_TYPE, source.isJsonNull() ? KEY_WORD_STRING + : source.getAsJsonObject().get(KEY_WORD_TYPE).getAsString()); + break; + } + column.add("dataType", dataType); + columns.add(column); + } + return columns; + } + + protected boolean isInputValueFromSource(String source) { + return !(StringUtils.isEmpty(source) || source.equalsIgnoreCase(CURRENT_DATE) || source.matches(PXD) + || VariableUtils.PATTERN.matcher(source).matches()); + } + + protected String generateDerivedFieldValue(Map derivedFieldDef, + final String inputValue, boolean isStrValueFromSource) { + String strValue = StringUtils.EMPTY; + long longValue = Long.MIN_VALUE; + String source = derivedFieldDef.getOrDefault("source", StringUtils.EMPTY); + String type = derivedFieldDef.get("type"); + String format = derivedFieldDef.getOrDefault("format", StringUtils.EMPTY); + DateTimeZone timeZone = DateTimeZone.forID(timezone.isEmpty() ? DEFAULT_TIMEZONE : timezone); + + // get the base value from various sources + if (source.equalsIgnoreCase(CURRENT_DATE)) { + longValue = DateTime.now().getMillis(); + } else if (source.matches(PXD)) { + Period period = Period.parse(source); + longValue = DateTime.now().withZone(timeZone).minus(period).dayOfMonth().roundFloorCopy().getMillis(); + } else if (VariableUtils.PATTERN.matcher(source).matches()) { + strValue = replaceVariable(source); + } else if (!StringUtils.isEmpty(source) && !isStrValueFromSource) { + failWorkUnit("Unsupported source for derived fields: " + source); + } + + // further processing required for specific types + switch (type) { + case "epoc": + if (longValue != Long.MIN_VALUE) { + strValue = String.valueOf(longValue); + } else if (!format.equals(StringUtils.EMPTY)) { + strValue = deriveEpoc(format, inputValue); + } else { + // Otherwise, the strValue should be a LONG string derived from a dynamic variable source + Assert.assertNotNull(LongValidator.getInstance().validate(strValue)); + } + break; + case "regexp": + Pattern pattern = Pattern.compile(!format.equals(StringUtils.EMPTY) ? 
format : "(.*)"); + Matcher matcher = pattern.matcher(inputValue); + if (matcher.find()) { + strValue = matcher.group(1); + } else { + log.error("Regular expression finds no match!"); + strValue = "no match"; + } + break; + case "boolean": + if (!StringUtils.isEmpty(inputValue)) { + strValue = inputValue; + } + break; + default: + break; + } + return strValue; + } + + /** + * Extract the text from input stream for scenarios where an error page is returned as successful response + * @param input the InputStream, which most likely is from an HttpResponse + * @return the String extracted from InputStream, if the InputStream cannot be converted to a String + * then an exception should be logged in debug mode, and an empty string returned. + */ + protected String extractText(InputStream input) { + log.debug("Parsing response InputStream as Text"); + String data = ""; + if (input != null) { + try { + data = InputStreamUtils.extractText(input, + MultistageProperties.MSTAGE_SOURCE_DATA_CHARACTER_SET.getValidNonblankWithDefault(state)); + } catch (Exception e) { + log.debug(e.toString()); + } + } + return data; + } + + /** + * If Content-Type is provided, but not as expected, the response can have + * useful error information + * + * @param wuStatus work unit status + * @param expectedContentType expected content type + * @return false if content type is present but not as expected otherwise true + */ + @Deprecated + protected boolean checkContentType(WorkUnitStatus wuStatus, String expectedContentType) { + if (wuStatus.getMessages() != null && wuStatus.getMessages().containsKey("contentType")) { + String contentType = wuStatus.getMessages().get("contentType"); + if (!contentType.equalsIgnoreCase(expectedContentType)) { + log.info("Content is {}, expecting {}", contentType, expectedContentType); + log.debug(extractText(wuStatus.getBuffer())); + return false; + } + } + return true; + } + + protected boolean checkContentType(WorkUnitStatus wuStatus, HashSet expectedContentType) { + if (wuStatus.getMessages() != null && wuStatus.getMessages().containsKey("contentType")) { + String contentType = wuStatus.getMessages().get("contentType"); + if (!expectedContentType.contains(contentType.toLowerCase())) { + log.info("Content is {}, expecting {}", contentType, expectedContentType.toString()); + log.debug(extractText(wuStatus.getBuffer())); + return false; + } + } + return true; + } + + /** + * Retrieve session keys from the payload or header + * @param wuStatus + * @return the session key in the headers + */ + protected String getSessionKey(WorkUnitStatus wuStatus) { + if (wuStatus.getMessages() != null && wuStatus.getMessages().containsKey("headers") + && jobKeys.getSessionKeyField() != null && jobKeys.getSessionKeyField().has("name")) { + JsonObject headers = GSON.fromJson(wuStatus.getMessages().get("headers"), JsonObject.class); + if (headers.has(this.jobKeys.getSessionKeyField().get("name").getAsString())) { + return headers.get(this.jobKeys.getSessionKeyField().get("name").getAsString()).getAsString(); + } + } + return StringUtils.EMPTY; + } + + /** + * Check if the work unit is completed. + * + * When there is no payload data from the secondary input, it returns + * false by default and lets the sub-classes to decide whether complete + * the work unit becuase sub-classes can parse the incoming data. + * + * When a payload is configured, the records in a payload will be + * processed one by one through the pagination mechanism. 
If a payload + * dataset has many records, and the records should not be sent out + * one by one, but rather in batches, the preprocessor should group the + * records into batches. + * + * @param starting the starting position of the request + * @return true by default when the payload iterator has no more entries + */ + protected boolean isWorkUnitCompleted(long starting) { + if (extractorKeys.getPayloads().size() == 0) { + // let the sub-class decide + return false; + } + + // sub-class can override this by reassigning a different iterator + // to payloadIterator. This statement reflects the default record + // by record pagination. + return !payloadIterator.hasNext(); + } + + /** + * If the position is 0, then it must be the first request + * @param starting the starting position of the request + * @return true if the starting position is 0, otherwise false + */ + protected boolean isFirst(long starting) { + return starting == 0; + } + + /** + * check if the stop condition has been met or if the session should time out; + * when no condition is present, we assume no wait is needed + * + * @return true if the stop condition is met or the session should time out + */ + protected boolean waitingBySessionKeyWithTimeout() { + if (!jobKeys.isSessionStateEnabled() || isSessionStateMatch()) { + return true; + } + + // Fail if the session failCondition is met + if (isSessionStateFailed()) { + String message = String.format("Session fail condition is met: %s", jobKeys.getSessionStateFailCondition()); + log.warn(message); + throw new RuntimeException(message); + } + + // if stop condition is present but the condition has not been met, we + // will check if the session should time out + if (DateTime.now().getMillis() > extractorKeys.getStartTime() + jobKeys.getSessionTimeout()) { + log.warn("Session timed out after {} seconds", jobKeys.getSessionTimeout() / 1000); + throw new RuntimeException("Session timed out before ending condition is met"); + } + + // return false to indicate wait should continue + return false; + } + + /** + * Check if session state is enabled and session stop condition is met + * + * @return true if session state is enabled and session stop condition is met + * otherwise return false + */ + protected boolean isSessionStateMatch() { + return jobKeys.isSessionStateEnabled() && extractorKeys.getSessionKeyValue() + .matches(jobKeys.getSessionStateCondition()); + } + + /** + * Check if session state is enabled and session fail condition is met + * + * @return true if session state is enabled and session fail condition is met + * otherwise return false + */ + protected boolean isSessionStateFailed() { + return jobKeys.isSessionStateEnabled() && extractorKeys.getSessionKeyValue() + .matches(jobKeys.getSessionStateFailCondition()); + } + + /** + * This helper function determines whether to send a new pagination request. A new page + * should be requested if: + * 1. if session state control is enabled, then check if the session stop condition is met or the session has timed out + * 2. otherwise, check if pagination is enabled + * + * Sub-classes should further refine the new page condition. 
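[Editor's note, not part of the diff] As a concrete refinement, the JSON-format extractor earlier in this change also compares its processed count against the expected total:

    @Override
    protected boolean hasNextPage() {
      return super.hasNextPage()
          || jsonExtractorKeys.getProcessedCount() < jsonExtractorKeys.getTotalCount();
    }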
+ * + * @return true if a new page should be requested + */ + protected boolean hasNextPage() { + try { + if (jobKeys.isSessionStateEnabled()) { + return !waitingBySessionKeyWithTimeout(); + } else { + return jobKeys.isPaginationEnabled(); + } + } catch (Exception e) { + failWorkUnit(String.format("Timeout waiting for next page: %s", e.getMessage())); + return false; + } + } + + /** + * Utility function in the extractor to replace a variable + * @param variableString variable string + * @return actual value of a variable; empty string if variable not found + */ + protected String replaceVariable(String variableString) { + String finalString = ""; + try { + finalString = VariableUtils.replaceWithTracking(variableString, currentParameters, false).getKey(); + } catch (IOException e) { + failWorkUnit("Invalid parameter " + variableString); + } + return finalString; + } + + /** + * When there is no data return from the source, schema inferring will fail; however, Gobblin + * will always call schema converter before record converter. When it does so in the event of + * empty data, schema converter will fail. + * + * This function creates a dummy schema with primary keys and delta key to cheat converter + * + * @return the dummy schema with primary keys and delta keys + */ + protected JsonArray createMinimumSchema() { + List elements = new ArrayList<>(); + + if (state.contains(ConfigurationKeys.EXTRACT_PRIMARY_KEY_FIELDS_KEY)) { + String[] primaryKeys = + state.getProp(ConfigurationKeys.EXTRACT_PRIMARY_KEY_FIELDS_KEY, + StringUtils.EMPTY).split(COMMA_STR); + for (String key: primaryKeys) { + if (!key.isEmpty()) { + elements.add(new SchemaBuilder(key, SchemaBuilder.PRIMITIVE, true, new ArrayList<>()) + .setPrimitiveType(KEY_WORD_STRING)); + } + } + } + if (state.contains(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY)) { + String[] deltaKeys = + state.getProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY, + StringUtils.EMPTY).split(COMMA_STR); + for (String key: deltaKeys) { + if (!key.isEmpty()) { + elements.add(new SchemaBuilder(key, SchemaBuilder.PRIMITIVE, true, new ArrayList<>()) + .setPrimitiveType(KEY_WORD_TIMESTAMP)); + } + } + } + return new SchemaBuilder(SchemaBuilder.RECORD, true, elements).buildAltSchema().getAsJsonArray(); + } + + public boolean closeConnection() { + if (connection != null) { + connection.closeAll(StringUtils.EMPTY); + } + return true; + } + + /** + * ms.parameters have variables. For the initial execution of each work unit, we substitute those + * variables with initial work unit variable values. + * + * @return the substituted parameters + */ + protected JsonObject getInitialWorkUnitParameters() { + JsonObject definedParameters = + JsonParameter.getParametersAsJson(jobKeys.getSourceParameters().toString(), getInitialWorkUnitVariableValues(), + this.state); + JsonObject initialParameters = replaceVariablesInParameters(appendActivationParameter(definedParameters)); + if (this.payloadIterator.hasNext()) { + initialParameters.add("payload", payloadIterator.next()); + } + return initialParameters; + } + + /** + * Initial variable values are not specific to protocols, moving this method here + * so that it can be shared among protocols. 
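[Editor's note, not part of the diff] Assuming ParameterTypes.WATERMARK and ParameterTypes.PAGESTART print as "watermark" and "pagestart", and a page-start initial value of 0 is configured, the object built by getInitialWorkUnitVariableValues() would look roughly like {"watermark": {"low": 123, "high": 456}, "pagestart": 0}.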
+ * + * Initial work unit variable values include + * - watermarks defined for each work unit + * - initial pagination defined at the source level + * + * @return work unit specific initial parameters for the first request to source + */ + private JsonObject getInitialWorkUnitVariableValues() { + JsonObject variableValues = new JsonObject(); + + variableValues.add(ParameterTypes.WATERMARK.toString(), getWorkUnitWaterMarks()); + for (Map.Entry entry : jobKeys.getPaginationInitValues().entrySet()) { + variableValues.addProperty(entry.getKey().toString(), entry.getValue()); + } + return variableValues; + } + + /** + * Replace variables in the parameters itself, so that ms.parameters can accept variables. + * @param parameters the JsonObject with parameters + * @return the replaced parameter object + */ + JsonObject replaceVariablesInParameters(final JsonObject parameters) { + JsonObject parametersCopy = JsonUtils.deepCopy(parameters).getAsJsonObject(); + JsonObject finalParameter = JsonUtils.deepCopy(parameters).getAsJsonObject(); + try { + Pair replaced = + VariableUtils.replaceWithTracking(parameters.toString(), parametersCopy, false); + finalParameter = GSON.fromJson(replaced.getKey(), JsonObject.class); + + // for each parameter in the original parameter list, if the name of the parameter + // name starts with "tmp" and the parameter was used once in this substitution operation, + // then it shall be removed from the final list + for (Map.Entry entry : parameters.entrySet()) { + if (entry.getKey().matches("tmp.*") && !replaced.getRight().has(entry.getKey())) { + finalParameter.remove(entry.getKey()); + } + } + } catch (Exception e) { + log.error("Encoding error is not expected, but : {}", e.getMessage()); + } + log.debug("Final parameters: {}", finalParameter.toString()); + return finalParameter; + } + + /** + * Add activation parameters to work unit parameters + * @param parameters the defined parameters + * @return the set of parameters including activation parameters + */ + private JsonObject appendActivationParameter(JsonObject parameters) { + JsonObject activationParameters = extractorKeys.getActivationParameters(); + if (activationParameters.entrySet().size() > 0) { + for (Map.Entry entry : activationParameters.entrySet()) { + String key = entry.getKey(); + parameters.add(key, activationParameters.get(key)); + } + } + return JsonUtils.deepCopy(parameters).getAsJsonObject(); + } + + protected JsonObject getCurrentWorkUnitParameters() { + JsonObject definedParameters = JsonParameter.getParametersAsJson(jobKeys.getSourceParameters().toString(), + getUpdatedWorkUnitVariableValues(getInitialWorkUnitVariableValues()), state); + JsonObject currentParameters = replaceVariablesInParameters(appendActivationParameter(definedParameters)); + if (this.payloadIterator.hasNext()) { + currentParameters.add("payload", payloadIterator.next()); + } + return currentParameters; + } + + /** + * Update variable values based on work unit status + * + * Following variable values are updated: + * 1. session key value if the work unit status has session key + * 2. page start value if page start control is used + * 3. page size value if page size control is used + * 4. page number value if page number control is used + * + * Typically use case can use any of following pagination methods, some may use multiple: + * 1. use page start (offset) and page size to control pagination + * 2. use page number and page size to control pagination + * 3. 
use page number to control pagination, while page size can be fixed + * 4. use session key to control pagination, and the session key decides what to fetch next + * 5. not use any variables, the session just keep going until following conditions are met: + * a. return an empty page + * b. return a specific status, such as "complete", in response + * + * @param initialVariableValues initial variable values + * @return the updated variable values + */ + private JsonObject getUpdatedWorkUnitVariableValues(JsonObject initialVariableValues) { + JsonObject updatedVariableValues = JsonUtils.deepCopy(initialVariableValues).getAsJsonObject(); + WorkUnitStatus wuStatus = this.getWorkUnitStatus(); + + // if session key is used, the extractor has to provide it int its work unit status + // in order for this to work + if (updatedVariableValues.has(ParameterTypes.SESSION.toString())) { + updatedVariableValues.remove(ParameterTypes.SESSION.toString()); + } + updatedVariableValues.addProperty(ParameterTypes.SESSION.toString(), wuStatus.getSessionKey()); + + // if page start is used, the extractor has to provide it int its work unit status + // in order for this to work + if (updatedVariableValues.has(ParameterTypes.PAGESTART.toString())) { + updatedVariableValues.remove(ParameterTypes.PAGESTART.toString()); + } + updatedVariableValues.addProperty(ParameterTypes.PAGESTART.toString(), wuStatus.getPageStart()); + + // page size doesn't change much often, if extractor doesn't provide + // a page size, then assume it is the same as initial value + if (updatedVariableValues.has(ParameterTypes.PAGESIZE.toString()) && wuStatus.getPageSize() > 0) { + updatedVariableValues.remove(ParameterTypes.PAGESIZE.toString()); + } + if (wuStatus.getPageSize() > 0) { + updatedVariableValues.addProperty(ParameterTypes.PAGESIZE.toString(), wuStatus.getPageSize()); + } + + // if page number is used, the extractor has to provide it in its work unit status + // in order for this to work + if (updatedVariableValues.has(ParameterTypes.PAGENO.toString())) { + updatedVariableValues.remove(ParameterTypes.PAGENO.toString()); + } + updatedVariableValues.addProperty(ParameterTypes.PAGENO.toString(), wuStatus.getPageNumber()); + + return updatedVariableValues; + } + + protected void logUsage(State state) { + log.info("Checking essential (not always mandatory) parameters..."); + log.info("Values can be default values for the specific type if the property is not configured"); + for (MultistageProperties p : JobKeys.ESSENTIAL_PARAMETERS) { + log.info("Property {} ({}) has value {} ", p.toString(), p.getClassName(), p.getValidNonblankWithDefault(state)); + } + } + + /** + * Read payload records from secondary input location. Subclasses might + * override this to process payload differently. + * + * @param state WorkUnitState + * @return the payload records + */ + protected JsonArray getPayloads(State state) { + JsonArray payloads = MultistageProperties.MSTAGE_PAYLOAD_PROPERTY.getValidNonblankWithDefault(state); + JsonArray records = new JsonArray(); + for (JsonElement entry : payloads) { + records.addAll(new HdfsReader(state).readSecondary(entry.getAsJsonObject())); + } + return records; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/factory/ApacheHttpClientFactory.java b/dil/src/main/java/com/linkedin/dil/factory/ApacheHttpClientFactory.java new file mode 100644 index 0000000..3b08e2d --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/factory/ApacheHttpClientFactory.java @@ -0,0 +1,19 @@ +// Copyright 2021 LinkedIn Corporation. 
All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.factory; + +import org.apache.gobblin.configuration.State; +import org.apache.http.client.HttpClient; +import org.apache.http.impl.client.HttpClientBuilder; + + +/** + * A vehicle to produce an Apache HttpClient + */ +public class ApacheHttpClientFactory implements HttpClientFactory { + public HttpClient get(State state) { + return HttpClientBuilder.create().build(); + } +} diff --git a/dil/src/main/java/com/linkedin/dil/factory/DefaultJdbcClientFactory.java b/dil/src/main/java/com/linkedin/dil/factory/DefaultJdbcClientFactory.java new file mode 100644 index 0000000..8af4a8b --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/factory/DefaultJdbcClientFactory.java @@ -0,0 +1,27 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.factory; + +import java.sql.Connection; +import java.sql.DriverManager; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.util.EncryptionUtils; + + +/** + * An implementation to create an default JDBC connection + */ +public class DefaultJdbcClientFactory implements JdbcClientFactory { + public Connection getConnection(String jdbcUrl, String userId, String cryptedPassword, State state) { + try { + return DriverManager.getConnection( + EncryptionUtils.decryptGobblin(jdbcUrl, state), + EncryptionUtils.decryptGobblin(userId, state), + EncryptionUtils.decryptGobblin(cryptedPassword, state)); + } catch (Exception e) { + throw new RuntimeException(e); + } + } +} diff --git a/dil/src/main/java/com/linkedin/dil/factory/DefaultS3ClientFactory.java b/dil/src/main/java/com/linkedin/dil/factory/DefaultS3ClientFactory.java new file mode 100644 index 0000000..f19d7a2 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/factory/DefaultS3ClientFactory.java @@ -0,0 +1,24 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.factory; + +import org.apache.gobblin.configuration.State; +import software.amazon.awssdk.http.SdkHttpClient; +import software.amazon.awssdk.http.apache.ApacheHttpClient; +import software.amazon.awssdk.utils.AttributeMap; + +import static software.amazon.awssdk.http.SdkHttpConfigurationOption.*; + + +/** + * An implementation to produce an Apache HttpClient + */ +public class DefaultS3ClientFactory implements S3ClientFactory { + public SdkHttpClient getHttpClient(State state, AttributeMap config) { + return ApacheHttpClient.builder() + .connectionTimeout(config.get(CONNECTION_TIMEOUT)) + .build(); + } +} diff --git a/dil/src/main/java/com/linkedin/dil/factory/HttpClientFactory.java b/dil/src/main/java/com/linkedin/dil/factory/HttpClientFactory.java new file mode 100644 index 0000000..ae1f698 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/factory/HttpClientFactory.java @@ -0,0 +1,16 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.factory; + +import org.apache.gobblin.configuration.State; +import org.apache.http.client.HttpClient; + + +/** + * The interface for dynamic HttpClient creation based on environment + */ +public interface HttpClientFactory { + HttpClient get(State state); +} diff --git a/dil/src/main/java/com/linkedin/dil/factory/JdbcClientFactory.java b/dil/src/main/java/com/linkedin/dil/factory/JdbcClientFactory.java new file mode 100644 index 0000000..384e385 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/factory/JdbcClientFactory.java @@ -0,0 +1,23 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.factory; + +import java.sql.Connection; +import org.apache.gobblin.configuration.State; + + +/** + * The interface for dynamic JDBC Connection creation based on environment. + */ +public interface JdbcClientFactory { + /** + * @param jdbcUrl plain or encrypted URL + * @param userId plain or encrypted user name + * @param cryptedPassword plain or encrypted password + * @param state source or work unit state that can provide the encryption master key location + * @return a JDBC connection + */ + Connection getConnection(String jdbcUrl, String userId, String cryptedPassword, State state); +} diff --git a/dil/src/main/java/com/linkedin/dil/factory/S3ClientFactory.java b/dil/src/main/java/com/linkedin/dil/factory/S3ClientFactory.java new file mode 100644 index 0000000..412653f --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/factory/S3ClientFactory.java @@ -0,0 +1,17 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.factory; + +import org.apache.gobblin.configuration.State; +import software.amazon.awssdk.http.SdkHttpClient; +import software.amazon.awssdk.utils.AttributeMap; + + +/** + * The interface for dynamic S3Client creation based on environment + */ +public interface S3ClientFactory { + SdkHttpClient getHttpClient(State state, AttributeMap config); +} diff --git a/dil/src/main/java/com/linkedin/dil/factory/SchemaReaderFactory.java b/dil/src/main/java/com/linkedin/dil/factory/SchemaReaderFactory.java new file mode 100644 index 0000000..ad949a7 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/factory/SchemaReaderFactory.java @@ -0,0 +1,32 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
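[Editor's note, not part of the diff] A hedged sketch of what a custom implementation of the HttpClientFactory interface above could look like, for example to set an explicit connect timeout; the class name and timeout value are illustrative only:

    // illustrative sketch of a pluggable factory; not part of this change
    import org.apache.gobblin.configuration.State;
    import org.apache.http.client.HttpClient;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.impl.client.HttpClientBuilder;

    public class TimeoutHttpClientFactory implements HttpClientFactory {
      @Override
      public HttpClient get(State state) {
        // 30 second connect timeout, otherwise the default Apache client settings
        RequestConfig config = RequestConfig.custom().setConnectTimeout(30000).build();
        return HttpClientBuilder.create().setDefaultRequestConfig(config).build();
      }
    }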
+ +package com.linkedin.dil.factory; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.factory.reader.SchemaReader; + + +/** + * The factory to create SchemaReader + */ +public interface SchemaReaderFactory { + /** + * Creating a schema reader, default reads from TMS + * @param state Gobblin configuration + * @return the reader factory + */ + @VisibleForTesting + static SchemaReader create(State state) { + try { + Class readerClass = Class.forName( + MultistageProperties.MSTAGE_SOURCE_SCHEMA_READER_FACTORY.getValidNonblankWithDefault(state)); + return (SchemaReader) readerClass.newInstance(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } +} diff --git a/dil/src/main/java/com/linkedin/dil/factory/reader/JsonFileReader.java b/dil/src/main/java/com/linkedin/dil/factory/reader/JsonFileReader.java new file mode 100644 index 0000000..7aede16 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/factory/reader/JsonFileReader.java @@ -0,0 +1,30 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.factory.reader; + +import com.google.gson.JsonElement; +import org.apache.gobblin.configuration.State; + + +/** + * TODO + * a utility class that implement a schema reader from a Json file on HDFS + */ +public class JsonFileReader implements SchemaReader { + /** + * @param state a Gobbline State object with needed properties + * @param urn the HDFS file path + * @return a JsonSchema object + */ + @Override + public JsonElement read(final State state, final String urn) { + return null; + } + + @Override + public void close() { + // do nothing + } +} diff --git a/dil/src/main/java/com/linkedin/dil/factory/reader/SchemaReader.java b/dil/src/main/java/com/linkedin/dil/factory/reader/SchemaReader.java new file mode 100644 index 0000000..41c9a62 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/factory/reader/SchemaReader.java @@ -0,0 +1,17 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.factory.reader; + +import com.google.gson.JsonElement; +import org.apache.gobblin.configuration.State; + + +/** + * The base class for dynamic schema reader based on environment. + */ +public interface SchemaReader { + JsonElement read(final State state, final String urn); + void close(); +} diff --git a/dil/src/main/java/com/linkedin/dil/filter/AvroSchemaBasedFilter.java b/dil/src/main/java/com/linkedin/dil/filter/AvroSchemaBasedFilter.java new file mode 100644 index 0000000..d6bf695 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/filter/AvroSchemaBasedFilter.java @@ -0,0 +1,46 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.filter; + +import com.google.common.base.Optional; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.keys.AvroExtractorKeys; +import com.linkedin.dil.util.AvroSchemaUtils; +import com.linkedin.dil.util.JsonIntermediateSchema; +import org.apache.gobblin.util.AvroUtils; + + +@Slf4j +public class AvroSchemaBasedFilter extends MultistageSchemaBasedFilter { + private AvroExtractorKeys avroExtractorKeys; + private WorkUnitState state; + + public AvroSchemaBasedFilter(JsonIntermediateSchema schema, AvroExtractorKeys avroExtractorKeys, + WorkUnitState state) { + super(schema); + this.avroExtractorKeys = avroExtractorKeys; + this.state = state; + } + + @SneakyThrows + @Override + public GenericRecord filter(GenericRecord input) { + Schema outputSchema = AvroSchemaUtils.fromJsonSchema(schema.toJson(), state); + GenericRecord filteredRow = new GenericData.Record(outputSchema); + if (avroExtractorKeys.getIsValidOutputSchema()) { + log.warn("Some columns from the schema are not present at source, padding with null value."); + } + for (String fieldName : AvroSchemaUtils.getSchemaFieldNames(outputSchema)) { + Optional fieldValue = AvroUtils.getFieldValue(input, fieldName); + filteredRow.put(fieldName, fieldValue.isPresent() ? fieldValue.get() : null); + } + return filteredRow; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/filter/CsvSchemaBasedFilter.java b/dil/src/main/java/com/linkedin/dil/filter/CsvSchemaBasedFilter.java new file mode 100644 index 0000000..d2cf2c5 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/filter/CsvSchemaBasedFilter.java @@ -0,0 +1,74 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.filter; + +import java.util.Arrays; +import java.util.Set; +import lombok.extern.slf4j.Slf4j; +import com.linkedin.dil.keys.CsvExtractorKeys; +import com.linkedin.dil.util.JsonIntermediateSchema; + + +/** + * Filter CSV records by Json Intermediate schema + * + * @author esong + * + */ +@Slf4j +public class CsvSchemaBasedFilter extends MultistageSchemaBasedFilter { + private CsvExtractorKeys csvExtractorKeys; + + public CsvSchemaBasedFilter(JsonIntermediateSchema schema, CsvExtractorKeys csvExtractorKeys) { + super(schema); + this.csvExtractorKeys = csvExtractorKeys; + } + + @Override + public String[] filter(String[] input) { + Set columnProjection = csvExtractorKeys.getColumnProjection(); + if (columnProjection.size() > 0) { + // use user-defined column projection to filter + return filter(input, columnProjection); + } else if (csvExtractorKeys.getHeaderRow() != null && csvExtractorKeys.getIsValidOutputSchema()) { + // use the header and schema to generate column projection, then filter + String[] headerRow = csvExtractorKeys.getHeaderRow(); + for (int i = 0; i < headerRow.length; i++) { + if (schema.getColumns().keySet().stream().anyMatch(headerRow[i]::equalsIgnoreCase)) { + columnProjection.add(i); + } + } + csvExtractorKeys.setColumnProjection(columnProjection); + return filter(input, columnProjection); + } else { + log.debug("Defaulting to project first N columns"); + // take first N column, where N is the number of columns in the schema + // if the schema's size larger than input, then the extra columns will be padded with null + return Arrays.copyOf(input, schema.getColumns().size()); + } + } + + /** + * shift the wanted fields to front in place, and then truncate the array + * @param input original row + * @param columnProjection column projection + * @return modified row + */ + private String[] filter(String[] input, Set columnProjection) { + int curr = 0; + for (int i = 0; i < input.length; i++) { + if (columnProjection.contains(i)) { + swap(input, i, curr++); + } + } + return Arrays.copyOf(input, curr); + } + + private void swap(String[] input, int i, int j) { + String temp = input[i]; + input[i] = input[j]; + input[j] = temp; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/filter/JsonSchemaBasedFilter.java b/dil/src/main/java/com/linkedin/dil/filter/JsonSchemaBasedFilter.java new file mode 100644 index 0000000..4facfb0 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/filter/JsonSchemaBasedFilter.java @@ -0,0 +1,81 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
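[Editor's note, not part of the diff] A worked example of the in-place column projection in CsvSchemaBasedFilter above: given input {"a", "b", "c", "d"} and columnProjection {1, 3}, the loop swaps "b" into position 0 and "d" into position 1, then truncates to two elements, returning {"b", "d"}.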
+ +package com.linkedin.dil.filter; + +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonPrimitive; +import java.util.Map; +import com.linkedin.dil.util.JsonElementTypes; +import com.linkedin.dil.util.JsonIntermediateSchema; +import com.linkedin.dil.util.JsonUtils; + + +/** + * Filter Json records by Json Intermediate schema + * + * TODO handle UNIONs + * + * @author kgoodhop, chrli + * + */ +public class JsonSchemaBasedFilter extends MultistageSchemaBasedFilter { + + public JsonSchemaBasedFilter(JsonIntermediateSchema schema) { + super(schema); + } + + /** + * top level filter function + * @param input the input row object + * @return the filtered row object + */ + @Override + public JsonObject filter(JsonObject input) { + return this.filter(schema, input); + } + + private JsonElement filter(JsonIntermediateSchema.JisDataType dataType, JsonElement input) { + if (dataType.isPrimitive()) { + return input.isJsonPrimitive() ? filter(dataType, input.getAsJsonPrimitive()) : null; + } else if (dataType.getType() == JsonElementTypes.RECORD) { + return filter(dataType.getChildRecord(), input.getAsJsonObject()); + } else if (dataType.getType() == JsonElementTypes.ARRAY) { + return filter(dataType.getItemType(), input.getAsJsonArray()); + } + return null; + } + + private JsonPrimitive filter(JsonIntermediateSchema.JisDataType dataType, JsonPrimitive input) { + return dataType.isPrimitive() ? JsonUtils.deepCopy(input).getAsJsonPrimitive() : null; + } + + /** + * process the JsonArray + * + * @param dataType should be the item type of the JsonArray + * @param input JsonArray object + * @return filtered JsonArray object + */ + private JsonArray filter(JsonIntermediateSchema.JisDataType dataType, JsonArray input) { + JsonArray output = new JsonArray(); + for (JsonElement element: input) { + output.add(filter(dataType, element)); + } + return output; + } + + private JsonObject filter(JsonIntermediateSchema schema, JsonObject input) { + JsonObject output = new JsonObject(); + for (Map.Entry entry: input.entrySet()) { + if (schema.getColumns().containsKey(entry.getKey())) { + output.add(entry.getKey(), + filter(schema.getColumns().get(entry.getKey()).getDataType(), entry.getValue())); + } + } + return output; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/filter/MultistageSchemaBasedFilter.java b/dil/src/main/java/com/linkedin/dil/filter/MultistageSchemaBasedFilter.java new file mode 100644 index 0000000..fdc15ec --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/filter/MultistageSchemaBasedFilter.java @@ -0,0 +1,26 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.filter; + +import com.linkedin.dil.util.JsonIntermediateSchema; + + +/** + * Base filter class + * + * Each extractor shall call a derived filter of this class to process its data + */ +public class MultistageSchemaBasedFilter implements SchemaBasedFilter { + protected JsonIntermediateSchema schema; + + public MultistageSchemaBasedFilter(JsonIntermediateSchema schema) { + this.schema = schema; + } + + @Override + public T filter(T input) { + return null; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/filter/SchemaBasedFilter.java b/dil/src/main/java/com/linkedin/dil/filter/SchemaBasedFilter.java new file mode 100644 index 0000000..8a3e673 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/filter/SchemaBasedFilter.java @@ -0,0 +1,14 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.filter; + + +/** + * Base filter interface + * + */ +public interface SchemaBasedFilter { + T filter(T input); +} diff --git a/dil/src/main/java/com/linkedin/dil/keys/AvroExtractorKeys.java b/dil/src/main/java/com/linkedin/dil/keys/AvroExtractorKeys.java new file mode 100644 index 0000000..5a0c6ef --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/keys/AvroExtractorKeys.java @@ -0,0 +1,67 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.keys; + +import com.google.common.collect.Lists; +import java.util.List; +import lombok.AccessLevel; +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileStream; +import org.apache.avro.generic.GenericRecord; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.configuration.MultistageProperties; +import org.apache.gobblin.source.workunit.WorkUnit; + + +/** + * These attributes are defined and maintained in AvroExtractor + * + * @author esong + */ +@Slf4j +@Getter(AccessLevel.PUBLIC) +@Setter +public class AvroExtractorKeys extends ExtractorKeys { + final private static List ESSENTIAL_PARAMETERS = Lists.newArrayList( + MultistageProperties.MSTAGE_DATA_FIELD, + MultistageProperties.MSTAGE_TOTAL_COUNT_FIELD); + + private DataFileStream avroRecordIterator = null; + private long processedCount; + private long totalCount; + // TODO: move this to ExtractorKeys if pagination is needed + private long currentPageNumber = 0; + private Schema avroOutputSchema = null; + private Boolean isValidOutputSchema = true; + + public void incrCurrentPageNumber() { + currentPageNumber++; + } + public void incrProcessedCount() { + processedCount++; + } + + + + @Override + public void logDebugAll(WorkUnit workUnit) { + super.logDebugAll(workUnit); + log.debug("These are values of JsonExtractor regarding to Work Unit: {}", + workUnit == null ? 
"testing" : workUnit.getProp(MultistageProperties.DATASET_URN_KEY.toString())); + log.debug("Total rows expected or processed: {}", totalCount); + log.debug("Total rows processed: {}", processedCount); + } + + @Override + public void logUsage(State state) { + super.logUsage(state); + for (MultistageProperties p: ESSENTIAL_PARAMETERS) { + log.info("Property {} ({}) has value {} ", p.toString(), p.getClassName(), p.getValidNonblankWithDefault(state)); + } + } +} diff --git a/dil/src/main/java/com/linkedin/dil/keys/CsvExtractorKeys.java b/dil/src/main/java/com/linkedin/dil/keys/CsvExtractorKeys.java new file mode 100644 index 0000000..62f8339 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/keys/CsvExtractorKeys.java @@ -0,0 +1,81 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.keys; + +import com.google.common.collect.Lists; +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import lombok.AccessLevel; +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.configuration.MultistageProperties; +import org.apache.gobblin.source.workunit.WorkUnit; + + +/** + * These attributes are defined and maintained in CsvExtractor + * + * @author chrli + */ +@Slf4j +@Getter(AccessLevel.PUBLIC) +@Setter +public class CsvExtractorKeys extends ExtractorKeys { + final private static List ESSENTIAL_PARAMETERS = Lists.newArrayList( + MultistageProperties.MSTAGE_CSV_COLUMN_HEADER, + MultistageProperties.MSTAGE_CSV_SEPARATOR, + MultistageProperties.MSTAGE_CSV_SKIP_LINES, + MultistageProperties.MSTAGE_CSV_QUOTE_CHARACTER, + MultistageProperties.MSTAGE_CSV_ESCAPE_CHARACTER); + + private Iterator csvIterator = null; + private long processedCount = 0; + private long currentPageNumber = 0; + private Boolean columnHeader = false; + private int rowsToSkip = 0; + private String separator = MultistageProperties.MSTAGE_CSV_SEPARATOR.getDefaultValue(); + private String quoteCharacter = MultistageProperties.MSTAGE_CSV_QUOTE_CHARACTER.getDefaultValue(); + private String escapeCharacter = MultistageProperties.MSTAGE_CSV_ESCAPE_CHARACTER.getDefaultValue(); + // column name --> index mapping created based on the output or inferred schema + private Map columnToIndexMap = new HashMap<>(); + // A queue that stores sample rows read in during schema inference + // This is necessary as the input stream can only be read once + private Deque sampleRows = new ArrayDeque<>(); + private String[] headerRow; + private Set columnProjection = new HashSet<>(); + private Boolean isValidOutputSchema = true; + + public void incrCurrentPageNumber() { + currentPageNumber++; + } + public void incrProcessedCount() { + processedCount++; + } + + @Override + public void logDebugAll(WorkUnit workUnit) { + super.logDebugAll(workUnit); + log.debug("These are values of CsvExtractor regarding to Work Unit: {}", + workUnit == null ? 
"testing" : workUnit.getProp(MultistageProperties.DATASET_URN_KEY.toString())); + log.debug("Is column header present: {}", columnHeader); + log.debug("Total rows to skip: {}", rowsToSkip); + } + + @Override + public void logUsage(State state) { + super.logUsage(state); + for (MultistageProperties p: ESSENTIAL_PARAMETERS) { + log.info("Property {} ({}) has value {} ", p.toString(), p.getClassName(), p.getValidNonblankWithDefault(state)); + } + } +} diff --git a/dil/src/main/java/com/linkedin/dil/keys/ExtractorKeys.java b/dil/src/main/java/com/linkedin/dil/keys/ExtractorKeys.java new file mode 100644 index 0000000..b2dcfe3 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/keys/ExtractorKeys.java @@ -0,0 +1,71 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.keys; + +import com.google.common.collect.Lists; +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import java.util.ArrayList; +import java.util.List; +import lombok.AccessLevel; +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.preprocessor.StreamProcessor; +import org.apache.gobblin.source.workunit.WorkUnit; +import org.joda.time.DateTime; + + +/** + * each of these keys provide information how to populate corresponding values + * + * each format Extractor is responsible for populating these key with proper values + * so that their those values can be pull by the Source + * + * @author chrli + */ +@Slf4j +@Getter (AccessLevel.PUBLIC) +@Setter +public class ExtractorKeys { + final static private List ESSENTIAL_PARAMETERS = Lists.newArrayList( + MultistageProperties.EXTRACT_TABLE_NAME_KEY, + MultistageProperties.MSTAGE_ACTIVATION_PROPERTY, + MultistageProperties.MSTAGE_PARAMETERS + ); + + private JsonObject activationParameters = new JsonObject(); + private long startTime = DateTime.now().getMillis(); + private long delayStartTime; + private String signature; + private JsonArray inferredSchema = null; + private String sessionKeyValue; + private List> preprocessors = new ArrayList<>(); + private JsonObject dynamicParameters = new JsonObject(); + private Boolean explictEof; + private JsonArray payloads = new JsonArray(); + + public void logDebugAll(WorkUnit workUnit) { + log.debug("These are values in MultistageExtractor regarding to Work Unit: {}", + workUnit == null ? 
"testing" : workUnit.getProp(MultistageProperties.DATASET_URN_KEY.toString())); + log.debug("Activation parameters: {}", activationParameters); + log.debug("Payload size: {}", payloads.size()); + log.debug("Starting time: {}", startTime); + log.debug("Signature of the work unit: {}", signature); + if (inferredSchema != null) { + log.info("Inferred schema: {}", inferredSchema.toString()); + log.info("Avro-flavor schema: {}", inferredSchema.toString()); + } + log.debug("Session Status: {}", sessionKeyValue); + } + + public void logUsage(State state) { + for (MultistageProperties p: ESSENTIAL_PARAMETERS) { + log.info("Property {} ({}) has value {} ", p.toString(), p.getClassName(), p.getValidNonblankWithDefault(state)); + } + } +} diff --git a/dil/src/main/java/com/linkedin/dil/keys/FileDumpExtractorKeys.java b/dil/src/main/java/com/linkedin/dil/keys/FileDumpExtractorKeys.java new file mode 100644 index 0000000..d7accf0 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/keys/FileDumpExtractorKeys.java @@ -0,0 +1,37 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.keys; + +import lombok.AccessLevel; +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.gobblin.source.workunit.WorkUnit; + + +@Slf4j +@Getter(AccessLevel.PUBLIC) +@Setter +public class FileDumpExtractorKeys extends ExtractorKeys { + String fileName; + String fileWritePermissions; + String fileDumpLocation; + @Getter + private long currentFileNumber = 0; + + public long incrCurrentFileNumber() { + return currentFileNumber++; + } + + @Override + public void logDebugAll(WorkUnit workUnit) { + super.logDebugAll(workUnit); + log.debug("These are values in FileDumpExtractor:"); + log.debug("Dumping data with file name - " + fileName); + log.debug("Dumping data with permissions - " + fileWritePermissions); + log.debug("Dumping data at location - " + fileDumpLocation); + log.debug("Current file number - {}", currentFileNumber); + } +} diff --git a/dil/src/main/java/com/linkedin/dil/keys/HdfsKeys.java b/dil/src/main/java/com/linkedin/dil/keys/HdfsKeys.java new file mode 100644 index 0000000..23873a8 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/keys/HdfsKeys.java @@ -0,0 +1,39 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.keys; + +import com.google.common.collect.Lists; +import java.util.List; +import lombok.AccessLevel; +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.configuration.MultistageProperties; + + +/** + * This structure holds HDFS related parameters that could used to read from + * or write to HDFS + * + * @author chrli + */ + +@Slf4j +@Getter(AccessLevel.PUBLIC) +@Setter(AccessLevel.PUBLIC) +public class HdfsKeys extends JobKeys { + final private static List ESSENTIAL_PARAMETERS = Lists.newArrayList( + // HDFS essential parameters + ); + + @Override + public void logUsage(State state) { + super.logUsage(state); + for (MultistageProperties p : ESSENTIAL_PARAMETERS) { + log.info("Property {} ({}) has value {} ", p.toString(), p.getClassName(), p.getValidNonblankWithDefault(state)); + } + } +} diff --git a/dil/src/main/java/com/linkedin/dil/keys/HttpKeys.java b/dil/src/main/java/com/linkedin/dil/keys/HttpKeys.java new file mode 100644 index 0000000..12ca627 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/keys/HttpKeys.java @@ -0,0 +1,64 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.keys; + +import com.google.common.collect.Lists; +import com.google.gson.JsonObject; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import lombok.AccessLevel; +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.util.HttpRequestMethod; + + +/** + * This structure holds static parameters that are commonly used in HTTP protocol. 
+ * + * @author chrli + */ +@Slf4j +@Getter (AccessLevel.PUBLIC) +@Setter(AccessLevel.PUBLIC) +public class HttpKeys extends JobKeys { + final private static List ESSENTIAL_PARAMETERS = Lists.newArrayList( + MultistageProperties.SOURCE_CONN_USERNAME, + MultistageProperties.SOURCE_CONN_PASSWORD, + MultistageProperties.MSTAGE_AUTHENTICATION, + MultistageProperties.MSTAGE_HTTP_REQUEST_METHOD, + MultistageProperties.MSTAGE_HTTP_REQUEST_HEADERS, + MultistageProperties.MSTAGE_SESSION_KEY_FIELD); + + private JsonObject authentication = new JsonObject(); + private JsonObject httpRequestHeaders = new JsonObject(); + private Map httpRequestHeadersWithAuthentication = new HashMap<>(); + private String httpRequestMethod = HttpRequestMethod.GET.toString(); + private JsonObject initialParameters = new JsonObject(); + private Map> httpStatuses = new HashMap<>(); + private Map> httpStatusReasons = new HashMap<>(); + + @Override + public void logDebugAll() { + super.logDebugAll(); + log.debug("These are values in HttpSource"); + log.debug("Http Request Headers: {}", httpRequestHeaders); + //log.debug("Http Request Headers with Authentication: {}", httpRequestHeadersWithAuthentication.toString()); + log.debug("Http Request Method: {}", httpRequestMethod); + log.debug("Http Statuses: {}", httpStatuses); + log.debug("Initial values of dynamic parameters: {}", initialParameters); + } + + @Override + public void logUsage(State state) { + super.logUsage(state); + for (MultistageProperties p: ESSENTIAL_PARAMETERS) { + log.info("Property {} ({}) has value {} ", p.toString(), p.getClassName(), p.getValidNonblankWithDefault(state)); + } + } +} diff --git a/dil/src/main/java/com/linkedin/dil/keys/JdbcKeys.java b/dil/src/main/java/com/linkedin/dil/keys/JdbcKeys.java new file mode 100644 index 0000000..e0d1d84 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/keys/JdbcKeys.java @@ -0,0 +1,54 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.keys; + +import com.google.common.collect.Lists; +import com.google.gson.JsonObject; +import java.util.List; +import lombok.AccessLevel; +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.configuration.MultistageProperties; + + +/** + * This structure holds static Source parameters that are commonly used in JDBC Sources. 
+ * + * @author chrli + */ +@Slf4j +@Getter (AccessLevel.PUBLIC) +@Setter(AccessLevel.PUBLIC) +public class JdbcKeys extends JobKeys { + final private static List ESSENTIAL_PARAMETERS = Lists.newArrayList( + MultistageProperties.MSTAGE_JDBC_STATEMENT, + MultistageProperties.SOURCE_CONN_USERNAME, + MultistageProperties.SOURCE_CONN_PASSWORD); + + private String jdbcStatement = null; + private JsonObject initialParameterValues = new JsonObject(); + private String separator = MultistageProperties.MSTAGE_CSV_SEPARATOR.getDefaultValue(); + private String quoteCharacter = MultistageProperties.MSTAGE_CSV_QUOTE_CHARACTER.getDefaultValue(); + private String escapeCharacter = MultistageProperties.MSTAGE_CSV_ESCAPE_CHARACTER.getDefaultValue(); + private String schemaRefactorFunction = MultistageProperties.MSTAGE_JDBC_SCHEMA_REFACTOR.getDefaultValue(); + + @Override + public void logDebugAll() { + super.logDebugAll(); + log.debug("These are values in JdbcSource"); + log.debug("JDBC statement: {}", jdbcStatement); + log.debug("Initial values of dynamic parameters: {}", initialParameterValues); + } + + @Override + public void logUsage(State state) { + super.logUsage(state); + for (MultistageProperties p: ESSENTIAL_PARAMETERS) { + log.info("Property {} ({}) has value {} ", p.toString(), p.getClassName(), p.getValidNonblankWithDefault(state)); + } + } +} diff --git a/dil/src/main/java/com/linkedin/dil/keys/JobKeys.java b/dil/src/main/java/com/linkedin/dil/keys/JobKeys.java new file mode 100644 index 0000000..73263af --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/keys/JobKeys.java @@ -0,0 +1,609 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.keys; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Lists; +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.reflect.TypeToken; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import lombok.AccessLevel; +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.StringUtils; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.factory.SchemaReaderFactory; +import com.linkedin.dil.factory.reader.SchemaReader; +import com.linkedin.dil.util.DateTimeUtils; +import com.linkedin.dil.util.HdfsReader; +import com.linkedin.dil.util.JsonUtils; +import com.linkedin.dil.util.ParameterTypes; +import com.linkedin.dil.util.WorkUnitPartitionTypes; +import org.joda.time.DateTime; + +import static com.linkedin.dil.configuration.StaticConstants.*; + + +/** + * This class holds static Job parameters and it is initialized in the Source as part of + * planning process, yet it can contain destination parameters as well in a egress scenario. + * + * Each of these keys provide information how to populate corresponding values in protocol + * sub-classes. Each protocol is responsible for proper usage of these keys. + * + * The JobKeys class has 3 categories of functions: + * 1. parsing: parse the complex job properties + * 2. validating: validate job properties + * 3. 
logging: log configurations + * + * @author chrli + */ + +@Slf4j +@Getter(AccessLevel.PUBLIC) +@Setter(AccessLevel.PUBLIC) +public class JobKeys { + final static public Gson GSON = new Gson(); + final static public List ESSENTIAL_PARAMETERS = Lists.newArrayList( + MultistageProperties.SOURCE_CLASS, + MultistageProperties.EXTRACTOR_CLASSES, + MultistageProperties.CONVERTER_CLASSES, + MultistageProperties.EXTRACT_IS_FULL, + MultistageProperties.EXTRACT_TABLE_TYPE_KEY, + MultistageProperties.STATE_STORE_ENABLED, + MultistageProperties.MSTAGE_ABSTINENT_PERIOD_DAYS, + MultistageProperties.MSTAGE_DERIVED_FIELDS, + MultistageProperties.MSTAGE_ENABLE_CLEANSING, + MultistageProperties.MSTAGE_ENABLE_DYNAMIC_FULL_LOAD, + MultistageProperties.MSTAGE_ENABLE_SCHEMA_BASED_FILTERING, + MultistageProperties.MSTAGE_ENCODING, + MultistageProperties.MSTAGE_ENCRYPTION_FIELDS, + MultistageProperties.MSTAGE_GRACE_PERIOD_DAYS, + MultistageProperties.MSTAGE_OUTPUT_SCHEMA, + MultistageProperties.MSTAGE_PAGINATION, + MultistageProperties.MSTAGE_PARAMETERS, + MultistageProperties.MSTAGE_RETENTION, + MultistageProperties.MSTAGE_SECONDARY_INPUT, + MultistageProperties.MSTAGE_SESSION_KEY_FIELD, + MultistageProperties.MSTAGE_SOURCE_DATA_CHARACTER_SET, + MultistageProperties.MSTAGE_SOURCE_SCHEMA_URN, + MultistageProperties.MSTAGE_SOURCE_URI, + MultistageProperties.MSTAGE_TOTAL_COUNT_FIELD, + MultistageProperties.MSTAGE_WAIT_TIMEOUT_SECONDS, + MultistageProperties.MSTAGE_WORK_UNIT_PACING_SECONDS, + MultistageProperties.MSTAGE_WORK_UNIT_PARALLELISM_MAX, + MultistageProperties.MSTAGE_WORK_UNIT_PARTIAL_PARTITION, + MultistageProperties.MSTAGE_WATERMARK); + final private static int RETRY_DELAY_IN_SEC_DEFAULT = 300; + final private static int RETRY_COUNT_DEFAULT = 3; + final private static String ITEMS_KEY = "items"; + + private Map> derivedFields = new HashMap<>(); + private Map defaultFieldTypes = new HashMap<>(); + + // sourceSchema is the schema provided or retrieved from source + private JsonArray sourceSchema = new JsonArray(); + + // outputSchema is the schema to be supplied to converters + private JsonArray outputSchema = new JsonArray(); + + // targetSchema is the schema to be supplied to writers + private JsonArray targetSchema = new JsonArray(); + + private JsonObject sessionKeyField = new JsonObject(); + private String totalCountField = StringUtils.EMPTY; + private JsonArray sourceParameters = new JsonArray(); + private Map paginationFields = new HashMap<>(); + private Map paginationInitValues = new HashMap<>(); + private long sessionTimeout; + private long callInterval; + private JsonArray encryptionField = new JsonArray(); + private boolean enableCleansing; + String dataField = StringUtils.EMPTY; + private JsonArray watermarkDefinition = new JsonArray(); + private long retryDelayInSec; + private long retryCount; + private Boolean isPartialPartition; + private JsonArray secondaryInputs = new JsonArray(); + private WorkUnitPartitionTypes workUnitPartitionType; + private Boolean isSecondaryAuthenticationEnabled = false; + private String sourceUri = StringUtils.EMPTY; + private SchemaReader schemaReader; + private String schemaCleansingPattern = "(\\s|\\$|@)"; + private String schemaCleansingReplacement = "_"; + private Boolean schemaCleansingNullable = false; + + public void initialize(State state) { + parsePaginationFields(state); + parsePaginationInitialValues(state); + setSessionKeyField(MultistageProperties.MSTAGE_SESSION_KEY_FIELD.getValidNonblankWithDefault(state)); + 
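+    // For illustration (hypothetical values, shaped after how the parsing code in this class
+    // reads them): MSTAGE_PAGINATION could carry {"fields": ["offset", "limit"], "initialvalues": [0, 5000]},
+    // with both arrays positional in the order PAGESTART, PAGESIZE, PAGENO, and
+    // MSTAGE_SESSION_KEY_FIELD could carry
+    // {"name": "status", "condition": {"regexp": "success"}, "failCondition": {"regexp": "failed"}},
+    // which is what isSessionStateEnabled(), getSessionStateCondition() and
+    // getSessionStateFailCondition() below look for.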
setTotalCountField(MultistageProperties.MSTAGE_TOTAL_COUNT_FIELD.getValidNonblankWithDefault(state));
+    setSourceParameters(MultistageProperties.MSTAGE_PARAMETERS.getValidNonblankWithDefault(state));
+    setSourceUri(MultistageProperties.MSTAGE_SOURCE_URI.getValidNonblankWithDefault(state));
+    setDefaultFieldTypes(parseDefaultFieldTypes(state));
+    setDerivedFields(parseDerivedFields(state));
+    setOutputSchema(parseOutputSchema(state));
+    setTargetSchema(MultistageProperties.MSTAGE_TARGET_SCHEMA.getValidNonblankWithDefault(state));
+    setEncryptionField(MultistageProperties.MSTAGE_ENCRYPTION_FIELDS.getValidNonblankWithDefault(state));
+    setDataField(MultistageProperties.MSTAGE_DATA_FIELD.getValidNonblankWithDefault(state));
+    setCallInterval(MultistageProperties.MSTAGE_CALL_INTERVAL.getProp(state));
+    setSessionTimeout(MultistageProperties.MSTAGE_WAIT_TIMEOUT_SECONDS.getMillis(state));
+
+    setEnableCleansing(MultistageProperties.MSTAGE_ENABLE_CLEANSING.getValidNonblankWithDefault(state));
+    JsonObject schemaCleansing = MultistageProperties.MSTAGE_SCHEMA_CLENSING.getValidNonblankWithDefault(state);
+    if (schemaCleansing.has("enabled")) {
+      setEnableCleansing(Boolean.parseBoolean(schemaCleansing.get("enabled").getAsString()));
+      if (enableCleansing && schemaCleansing.has("pattern")) {
+        setSchemaCleansingPattern(schemaCleansing.get("pattern").getAsString());
+      }
+      if (enableCleansing && schemaCleansing.has("replacement")) {
+        setSchemaCleansingReplacement(schemaCleansing.get("replacement").getAsString());
+      }
+      if (enableCleansing && schemaCleansing.has("nullable")) {
+        setSchemaCleansingNullable(Boolean.parseBoolean(schemaCleansing.get("nullable").getAsString()));
+      }
+    }
+
+    setIsPartialPartition(MultistageProperties.MSTAGE_WORK_UNIT_PARTIAL_PARTITION.getValidNonblankWithDefault(state));
+    setWorkUnitPartitionType(parsePartitionType(state));
+    setWatermarkDefinition(MultistageProperties.MSTAGE_WATERMARK.getValidNonblankWithDefault(state));
+    Map<String, Long> retry = parseSecondaryInputRetry(
+        MultistageProperties.MSTAGE_SECONDARY_INPUT.getValidNonblankWithDefault(state));
+    setRetryDelayInSec(retry.get(KEY_WORD_RETRY_DELAY_IN_SEC));
+    setRetryCount(retry.get(KEY_WORD_RETRY_COUNT));
+    setSecondaryInputs(MultistageProperties.MSTAGE_SECONDARY_INPUT.getValidNonblankWithDefault(state));
+    setIsSecondaryAuthenticationEnabled(checkSecondaryAuthenticationEnabled());
+
+    setSourceSchema(readSourceSchemaFromUrn(state,
+        MultistageProperties.MSTAGE_SOURCE_SCHEMA_URN.getValidNonblankWithDefault(state)));
+    setTargetSchema(readTargetSchemaFromUrn(state,
+        MultistageProperties.MSTAGE_TARGET_SCHEMA_URN.getValidNonblankWithDefault(state)));
+
+    // closing out schema reader if it was created because of reading
+    // output schema or target schema.
+    if (schemaReader != null) {
+      schemaReader.close();
+      schemaReader = null;
+    }
+  }
+
+
+  public boolean isPaginationEnabled() {
+    // if a pagination key or an initial value is defined, then we have pagination enabled.
+ // this flag will impact how session be handled, and each protocol can implement it + // accordingly + return paginationFields.size() > 0 || paginationInitValues.size() > 0; + } + + public boolean isSessionStateEnabled() { + return sessionKeyField != null + && sessionKeyField.entrySet().size() > 0 + && sessionKeyField.has("condition") + && sessionKeyField.get("condition").getAsJsonObject().has("regexp"); + } + + public String getSessionStateCondition() { + if (isSessionStateEnabled()) { + return sessionKeyField.get("condition").getAsJsonObject().get("regexp").getAsString(); + } + return StringUtils.EMPTY; + } + + /** + * failCondition is optional in the definition + * @return failCondition if it is defined + */ + public String getSessionStateFailCondition() { + String retValue = StringUtils.EMPTY; + if (isSessionStateEnabled()) { + try { + retValue = sessionKeyField.get("failCondition").getAsJsonObject().get("regexp").getAsString(); + } catch (Exception e) { + log.debug("failCondition is not defined: {}", sessionKeyField); + } + } + return retValue; + } + + public boolean hasSourceSchema() { + return sourceSchema.size() > 0; + } + + public boolean hasOutputSchema() { + return outputSchema.size() > 0; + } + + public boolean hasTargetSchema() { + return targetSchema.size() > 0; + } + + /** + * override the setter and update output schema when source schema is available + * @param sourceSchema source provided schema + */ + public JobKeys setSourceSchema(JsonArray sourceSchema) { + this.sourceSchema = sourceSchema; + if (!this.hasOutputSchema()) { + setOutputSchema(JsonUtils.deepCopy(sourceSchema).getAsJsonArray()); + } + log.debug("Source Schema: {}", sourceSchema.toString()); + log.debug("Output Schema: {}", outputSchema.toString()); + return this; + } + + /** + * Validate the configuration + * @param state configuration state + * @return true if validation was successful, otherwise false + */ + public boolean validate(State state) { + /** + * If pagination is enabled, we need one of following ways to stop pagination + * 1. through a total count field, i.e. ms.total.count.field = data. + * This doesn't validate the correctness of the field. The correctness of this + * field will be validated at extraction time in extractor classes + * 2. through a session cursor with a stop condition, + * i.e. ms.session.key.field = {"name": "status", "condition": {"regexp": "success"}}. + * This doesn't validate whether the stop condition can truly be met. + * If a condition cannot be met because of incorrect specification, eventually + * it will timeout and fail the task. + * 3. through a condition that will eventually lead to a empty response from the source + * This condition cannot be done through a static check, therefore, here only a warning is + * provided. + */ + if (isPaginationEnabled()) { + if (totalCountField == null && !isSessionStateEnabled()) { + log.warn("Pagination is enabled, but there is no total count field or session \n" + + "control to stop it. Pagination will stop only when a blank page is returned from source. \n" + + "Please check the configuration of essential parameters if such condition can happen."); + } + } + + /** + * Check if output schema is correct. + * When a string is present but cannot be parsed, log an error. + * It is OK if output schema is intentionally left blank. 
+ */ + if (!hasOutputSchema()) { + if (!state.getProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), StringUtils.EMPTY).isEmpty()) { + log.error("Output schema is specified but it is an invalid or empty JsonArray"); + return false; + } + } + + /** + * Check if partitioning property is correct + */ + if (getWorkUnitPartitionType() == null) { + String partTypeString = state.getProp(MultistageProperties.MSTAGE_WORK_UNIT_PARTITION.getConfig()); + if (!StringUtils.isBlank(partTypeString)) { + log.error("ms.work.unit.partition has a unaccepted value: {}", partTypeString); + return false; + } + } else if (getWorkUnitPartitionType() == WorkUnitPartitionTypes.COMPOSITE) { + /** + * for a broad range like this, it must generate at least 1 partition, otherwise + * the partitioning ranges must have incorrect date strings + */ + if (WorkUnitPartitionTypes.COMPOSITE.getRanges( + DateTime.parse("2001-01-01"), + DateTime.now(), true).size() < 1) { + log.error("ms.work.unit.partition has incorrect or non-ISO-formatted date time values"); + return false; + } + } + // TODO other checks + // TODO validate master key location + // TODO validate secondary input structure + // TODO validate watermark structure + // TODO validate parameters structure + // TODO validate authentication structure + + return true; + } + + public void logDebugAll() { + log.debug("These are values in MultistageSource"); + log.debug("Source Uri: {}", sourceUri); + log.debug("Total count field: {}", totalCountField); + log.debug("Pagination: fields {}, initial values {}", paginationFields.toString(), paginationInitValues.toString()); + log.debug("Session key field definition: {}", sessionKeyField.toString()); + log.debug("Call interval in milliseconds: {}", callInterval); + log.debug("Session timeout: {}", sessionTimeout); + log.debug("Derived fields definition: {}", derivedFields.toString()); + log.debug("Output schema definition: {}", outputSchema.toString()); + log.debug("Watermark definition: {}", watermarkDefinition.toString()); + log.debug("Encrypted fields: {}", encryptionField); + log.debug("Retry Delay: {}", retryDelayInSec); + log.debug("Retry Count: {}", retryCount); + } + + public void logUsage(State state) { + for (MultistageProperties p: ESSENTIAL_PARAMETERS) { + log.info("Property {} ({}) has value {} ", p.toString(), p.getClassName(), p.getValidNonblankWithDefault(state)); + } + } + + private void parsePaginationFields(State state) { + List paramTypes = Lists.newArrayList( + ParameterTypes.PAGESTART, + ParameterTypes.PAGESIZE, + ParameterTypes.PAGENO + ); + if (MultistageProperties.MSTAGE_PAGINATION.validateNonblank(state)) { + JsonObject p = MultistageProperties.MSTAGE_PAGINATION.getProp(state); + if (p.has("fields")) { + JsonArray fields = p.get("fields").getAsJsonArray(); + for (int i = 0; i < fields.size(); i++) { + if (StringUtils.isNoneBlank(fields.get(i).getAsString())) { + paginationFields.put(paramTypes.get(i), fields.get(i).getAsString()); + } + } + } + } + } + + private void parsePaginationInitialValues(State state) { + List paramTypes = Lists.newArrayList( + ParameterTypes.PAGESTART, + ParameterTypes.PAGESIZE, + ParameterTypes.PAGENO + ); + if (MultistageProperties.MSTAGE_PAGINATION.validateNonblank(state)) { + JsonObject p = MultistageProperties.MSTAGE_PAGINATION.getProp(state); + if (p.has("initialvalues")) { + JsonArray values = p.get("initialvalues").getAsJsonArray(); + for (int i = 0; i < values.size(); i++) { + paginationInitValues.put(paramTypes.get(i), values.get(i).getAsLong()); + } + } + } 
else { + setPaginationInitValues(new HashMap<>()); + } + } + + /** + * Default field types can be used in schema inferrence, this method + * collect default field types if they are specified in configuration. + * + * @return A map of fields and their default types + */ + private Map parseDefaultFieldTypes(State state) { + if (MultistageProperties.MSTAGE_DATA_DEFAULT_TYPE.validateNonblank(state)) { + return GSON.fromJson(MultistageProperties.MSTAGE_DATA_DEFAULT_TYPE.getProp(state).toString(), + new TypeToken>() { + }.getType()); + } + return new HashMap<>(); + } + + /** + * Sample derived field configuration: + * [{"name": "activityDate", "formula": {"type": "epoc", "source": "fromDateTime", "format": "yyyy-MM-dd'T'HH:mm:ss'Z'"}}] + * + * Currently, only "epoc" and "string" are supported as derived field type. + * For epoc type: + * - Data will be saved as milliseconds in long data type. + * - And the source data is supposed to be a date formatted as a string. + * + * TODO: support more types. + * + * @return derived fields and their definitions + */ + @VisibleForTesting + Map> parseDerivedFields(State state) { + if (!MultistageProperties.MSTAGE_DERIVED_FIELDS.validateNonblank(state)) { + return new HashMap<>(); + } + + Map> derivedFields = new HashMap<>(); + JsonArray jsonArray = MultistageProperties.MSTAGE_DERIVED_FIELDS.getProp(state); + for (JsonElement field: jsonArray) { + + // change the formula part, which is JsonObject, into map + derivedFields.put(field.getAsJsonObject().get("name").getAsString(), + GSON.fromJson( + field.getAsJsonObject().get("formula").getAsJsonObject().toString(), + new TypeToken>() { }.getType())); + } + + return derivedFields; + } + + /** + * Parse output schema defined in ms.output.schema parameter + * + * @param state the Gobblin configurations + * @return the output schema + */ + public JsonArray parseOutputSchema(State state) { + return JsonUtils.deepCopy(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getValidNonblankWithDefault(state)).getAsJsonArray(); + } + + + /** + * This helper function parse out the WorkUnitPartitionTypes from ms.work.unit.partition property + * @param state the State with all configurations + * @return the WorkUnitPartitionTypes + */ + WorkUnitPartitionTypes parsePartitionType(State state) { + WorkUnitPartitionTypes partitionType = WorkUnitPartitionTypes.fromString( + MultistageProperties.MSTAGE_WORK_UNIT_PARTITION.getValidNonblankWithDefault(state)); + + if (partitionType != WorkUnitPartitionTypes.COMPOSITE) { + return partitionType; + } + + // add sub ranges for composite partition type + WorkUnitPartitionTypes.COMPOSITE.resetSubRange(); + try { + JsonObject jsonObject = GSON.fromJson( + MultistageProperties.MSTAGE_WORK_UNIT_PARTITION.getValidNonblankWithDefault(state).toString(), + JsonObject.class); + + for (Map.Entry entry : jsonObject.entrySet()) { + String partitionTypeString = entry.getKey(); + DateTime start = DateTimeUtils.parse(jsonObject.get(entry.getKey()).getAsJsonArray().get(0).getAsString()); + String endDateTimeString = jsonObject.get(entry.getKey()).getAsJsonArray().get(1).getAsString(); + DateTime end; + if (endDateTimeString.matches("-")) { + end = DateTime.now(); + } else { + end = DateTimeUtils.parse(endDateTimeString); + } + partitionType.addSubRange(start, end, WorkUnitPartitionTypes.fromString(partitionTypeString)); + } + } catch (Exception e) { + log.error("Error parsing composite partition string: " + + MultistageProperties.MSTAGE_WORK_UNIT_PARTITION.getValidNonblankWithDefault(state).toString() + + "\n 
partitions may not be generated properly.", + e); + } + return partitionType; + } + + /** + * This method populates the retry parameters (delayInSec, retryCount) via the secondary input. + * These values are used to retry connection whenever the "authentication" type category is defined and the token hasn't + * been populated yet. If un-defined, they will retain the default values as specified by RETRY_DEFAULT_DELAY and + * RETRY_DEFAULT_COUNT. + * + * For e.g. + * ms.secondary.input : "[{"path": "/util/avro_retry", "fields": ["uuid"], + * "category": "authentication", "retry": {"delayInSec" : "1", "retryCount" : "2"}}]" + * @param jsonArray the raw secondary input + * @return the retry delay and count in a map structure + */ + private Map parseSecondaryInputRetry(JsonArray jsonArray) { + long retryDelay = RETRY_DELAY_IN_SEC_DEFAULT; + long retryCount = RETRY_COUNT_DEFAULT; + Map retry = new HashMap<>(); + for (JsonElement field: jsonArray) { + JsonObject retryFields = (JsonObject) field.getAsJsonObject().get(KEY_WORD_RETRY); + if (retryFields != null && !retryFields.isJsonNull()) { + retryDelay = retryFields.has(KEY_WORD_RETRY_DELAY_IN_SEC) + ? retryFields.get(KEY_WORD_RETRY_DELAY_IN_SEC).getAsLong() : retryDelay; + retryCount = retryFields.has(KEY_WORD_RETRY_COUNT) + ? retryFields.get(KEY_WORD_RETRY_COUNT).getAsLong() : retryCount; + } + } + retry.put(KEY_WORD_RETRY_DELAY_IN_SEC, retryDelay); + retry.put(KEY_WORD_RETRY_COUNT, retryCount); + return retry; + } + + /** + * Check if authentication is configured in secondary input + * @return true if secondary input contains an authentication definition + */ + protected boolean checkSecondaryAuthenticationEnabled() { + for (JsonElement entry: getSecondaryInputs()) { + if (entry.isJsonObject() + && entry.getAsJsonObject().has(KEY_WORD_CATEGORY) + && entry.getAsJsonObject().get(KEY_WORD_CATEGORY).getAsString() + .equalsIgnoreCase(KEY_WORD_AUTHENTICATION)) { + return true; + } + } + return false; + } + + public Map readSecondaryInputs(State state, final long retries) throws InterruptedException { + log.info("Trying to read secondary input with retry = {}", retries); + Map secondaryInputs = readContext(state); + + // Check if authentication is ready, and if not, whether retry is required + JsonArray authentications = secondaryInputs.get(KEY_WORD_AUTHENTICATION); + if ((authentications == null || authentications.size() == 0) && this.getIsSecondaryAuthenticationEnabled() + && retries > 0) { + log.info("Authentication tokens are expected from secondary input, but not ready"); + log.info("Will wait for {} seconds and then retry reading the secondary input", this.getRetryDelayInSec()); + TimeUnit.SECONDS.sleep(this.getRetryDelayInSec()); + return readSecondaryInputs(state, retries - 1); + } + log.info("Successfully read secondary input, no more retry"); + return secondaryInputs; + } + + private Map readContext(State state) { + return new HdfsReader(state, this.getSecondaryInputs()).readAll(); + } + + /** + * Call the reader factory and read schema of the URN + * @param urn the dataset URN + * @param state gobblin configuration + * @return schema in a JsonArray + */ + @VisibleForTesting + public JsonArray readSchemaFromUrn(State state, String urn) { + try { + // Schema Reader could be plugged in before the initialization on JobKeys + if (schemaReader == null) { + schemaReader = SchemaReaderFactory.create(state); + } + return schemaReader.read(state, urn).getAsJsonArray(); + } catch (Exception e) { + log.error("Error reading schema based on urn: {}", 
urn); + throw new RuntimeException(e); + } + } + + /** + * Filter out derived fields that will be added later on + * @param inSchema the schema array from reader + * @return the filtered schema + */ + public JsonArray removeDerivedFieldsFromSchema(JsonArray inSchema) { + Set derived = getDerivedFields().keySet(); + JsonArray output = new JsonArray(); + inSchema.forEach(column -> { + if (!derived.contains(column.getAsJsonObject().get(KEY_WORD_COLUMN_NAME).getAsString())) { + output.add(column); + } + }); + return output; + } + + /** + * Read source schema if output schema is not present + * + * @param state the Gobblin configurations + * @param urn the source schema URN + * @return the source schema from URN if output schema is not present, + * otherwise return the output schema directly + */ + public JsonArray readSourceSchemaFromUrn(State state, String urn) { + if (!hasOutputSchema() && StringUtils.isNotBlank(urn)) { + JsonArray schema = removeDerivedFieldsFromSchema(readSchemaFromUrn(state, urn)); + return JsonUtils.deepCopy(schema).getAsJsonArray(); + } + return getOutputSchema(); + } + + /** + * Target schema can come from 2 sources + * + * 1. actual schema defined in ms.target.schema parameter + * 2. a URN or source defined in ms.target.schema.urn + * + * @param state the Gobblin configurations + * @return the target schema + */ + public JsonArray readTargetSchemaFromUrn(State state, String urn) { + return !hasTargetSchema() && StringUtils.isNotBlank(urn) + ? readSchemaFromUrn(state, urn) + : getTargetSchema(); + } +} diff --git a/dil/src/main/java/com/linkedin/dil/keys/JsonExtractorKeys.java b/dil/src/main/java/com/linkedin/dil/keys/JsonExtractorKeys.java new file mode 100644 index 0000000..9f24359 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/keys/JsonExtractorKeys.java @@ -0,0 +1,56 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.keys; + +import com.google.common.collect.Lists; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import java.util.Iterator; +import java.util.List; +import lombok.AccessLevel; +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.configuration.MultistageProperties; +import org.apache.gobblin.source.workunit.WorkUnit; + + +/** + * These attributes are defined and maintained in JsonExtractor + * + * @author chrli + */ +@Slf4j +@Getter(AccessLevel.PUBLIC) +@Setter +public class JsonExtractorKeys extends ExtractorKeys { + final private static List ESSENTIAL_PARAMETERS = Lists.newArrayList( + MultistageProperties.MSTAGE_DATA_FIELD, + MultistageProperties.MSTAGE_TOTAL_COUNT_FIELD); + + private Iterator jsonElementIterator = null; + private long processedCount; + private long totalCount; + private long currentPageNumber = 0; + private JsonObject pushDowns = new JsonObject(); + + @Override + public void logDebugAll(WorkUnit workUnit) { + super.logDebugAll(workUnit); + log.debug("These are values of JsonExtractor regarding to Work Unit: {}", + workUnit == null ? 
"testing" : workUnit.getProp(MultistageProperties.DATASET_URN_KEY.toString())); + log.debug("Total rows expected or processed: {}", totalCount); + log.debug("Total rows processed: {}", processedCount); + } + + @Override + public void logUsage(State state) { + super.logUsage(state); + for (MultistageProperties p: ESSENTIAL_PARAMETERS) { + log.info("Property {} ({}) has value {} ", p.toString(), p.getClassName(), p.getValidNonblankWithDefault(state)); + } + } +} diff --git a/dil/src/main/java/com/linkedin/dil/keys/S3Keys.java b/dil/src/main/java/com/linkedin/dil/keys/S3Keys.java new file mode 100644 index 0000000..7608c1d --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/keys/S3Keys.java @@ -0,0 +1,38 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.keys; + +import lombok.AccessLevel; +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import software.amazon.awssdk.regions.Region; + + +@Getter (AccessLevel.PUBLIC) +@Setter (AccessLevel.PUBLIC) +@Slf4j +public class S3Keys extends JobKeys { + private String bucket = ""; + private String endpoint = ""; + private String prefix = ""; + private String filesPattern = ".*"; + private Region region = Region.AWS_GLOBAL; + private Integer maxKeys = 0; + private String accessKey; + private String secretId; + private Integer connectionTimeout; + String targetFilePattern; + + @Override + public void logDebugAll() { + super.logDebugAll(); + log.debug("These are values in S3SourceV2:"); + log.debug("S3 Bucket: {}", bucket); + log.debug("S3 endpoint: {}", endpoint); + log.debug("S3 prefix: {}", prefix); + log.debug("S3 files pattern: {}", filesPattern); + } +} diff --git a/dil/src/main/java/com/linkedin/dil/preprocessor/GpgDecryptProcessor.java b/dil/src/main/java/com/linkedin/dil/preprocessor/GpgDecryptProcessor.java new file mode 100644 index 0000000..81301eb --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/preprocessor/GpgDecryptProcessor.java @@ -0,0 +1,55 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.preprocessor; + +import com.google.gson.JsonObject; +import java.io.IOException; +import java.io.InputStream; +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.io.FilenameUtils; +import org.apache.gobblin.annotation.Alias; +import org.apache.gobblin.codec.StreamCodec; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.util.EncryptionUtils; + + +/** + * Preprocessor to handle InputStream that is encrypted with + * GPG compatible algorithm and needs decryption + * + * This is backwards compatible with PGP algorithms + */ +@Slf4j +@Alias("GpgProcessor") +public class GpgDecryptProcessor extends InputStreamProcessor { + @Getter + @Setter + private StreamCodec codec; + + /** + * @param params See {@link MultistageProperties} + */ + public GpgDecryptProcessor(JsonObject params) { + super(params); + this.codec = EncryptionUtils.getGpgCodec(parameters); + } + + @Override + public InputStream process(InputStream inputStream) throws IOException { + return this.codec.decodeInputStream(inputStream); + } + + /** + * TODO: Allow appending an optional file extension + * @param fileName + * @return transformed file name + */ + @Override + public String convertFileName(String fileName) { + return FilenameUtils.removeExtension(fileName); + } +} diff --git a/dil/src/main/java/com/linkedin/dil/preprocessor/GpgEncryptProcessor.java b/dil/src/main/java/com/linkedin/dil/preprocessor/GpgEncryptProcessor.java new file mode 100644 index 0000000..6df7173 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/preprocessor/GpgEncryptProcessor.java @@ -0,0 +1,46 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.preprocessor; + +import com.google.gson.JsonObject; +import java.io.IOException; +import java.io.OutputStream; +import lombok.Getter; +import lombok.Setter; +import org.apache.commons.io.FilenameUtils; +import org.apache.gobblin.codec.StreamCodec; +import com.linkedin.dil.util.EncryptionUtils; + + +/** + * Preprocessor to encrypted OutputStream using GPG codec + * + * This is backwards compatible with PGP algorithms + * + */ +public class GpgEncryptProcessor extends OutputStreamProcessor { + private static final String FILE_EXT = "gpg"; + @Getter + @Setter + private StreamCodec codec; + + public GpgEncryptProcessor(JsonObject params) { + super(params); + this.codec = EncryptionUtils.getGpgCodec(parameters); + } + + @Override + public OutputStream process(OutputStream origStream) throws IOException { + return codec.encodeOutputStream(origStream); + } + + @Override + public String convertFileName(String fileName) { + if (!FilenameUtils.getExtension(fileName).equals(FILE_EXT)) { + return fileName + ".gpg"; + } + return fileName; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/preprocessor/GunzipProcessor.java b/dil/src/main/java/com/linkedin/dil/preprocessor/GunzipProcessor.java new file mode 100644 index 0000000..57f3cfe --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/preprocessor/GunzipProcessor.java @@ -0,0 +1,35 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
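The two GPG preprocessors above are symmetric around the file extension: the encryptor appends ".gpg" unless the name already ends with it, and the decryptor strips the outermost extension. A small sketch, assuming EncryptionUtils.getGpgCodec() accepts the parameter object used here:

    JsonObject gpgParams = new JsonObject();                           // assumed sufficient for the codec
    GpgEncryptProcessor encryptor = new GpgEncryptProcessor(gpgParams);
    GpgDecryptProcessor decryptor = new GpgDecryptProcessor(gpgParams);
    String encryptedName = encryptor.convertFileName("members.csv");   // "members.csv.gpg"
    String decryptedName = decryptor.convertFileName(encryptedName);   // back to "members.csv"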
+ +package com.linkedin.dil.preprocessor; + +import com.google.gson.JsonObject; +import java.io.IOException; +import java.io.InputStream; +import java.util.zip.GZIPInputStream; +import org.apache.commons.io.FilenameUtils; + + +/** + * a preprocessor that transforms a Gzipped InputStream to unzipped format + */ +public class GunzipProcessor extends InputStreamProcessor { + + private static final String FILE_EXT = "gz"; + + public GunzipProcessor(JsonObject params) { + super(params); + } + + @Override + public InputStream process(InputStream input) throws IOException { + return new GZIPInputStream(input); + } + + @Override + public String convertFileName(String fileName) { + String extension = FilenameUtils.getExtension(fileName); + return FILE_EXT.equals(extension) ? FilenameUtils.removeExtension(fileName) : fileName; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/preprocessor/InputStreamProcessor.java b/dil/src/main/java/com/linkedin/dil/preprocessor/InputStreamProcessor.java new file mode 100644 index 0000000..a783186 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/preprocessor/InputStreamProcessor.java @@ -0,0 +1,24 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.preprocessor; + +import com.google.gson.JsonObject; +import java.io.IOException; +import java.io.InputStream; + + +/** + * a base class for dynamic InputStream preprocessor + */ +abstract public class InputStreamProcessor implements StreamProcessor { + protected JsonObject parameters; + + public InputStreamProcessor(JsonObject params) { + this.parameters = params; + } + abstract public InputStream process(InputStream input) throws IOException; + + abstract public String convertFileName(String fileName); +} diff --git a/dil/src/main/java/com/linkedin/dil/preprocessor/OutputStreamProcessor.java b/dil/src/main/java/com/linkedin/dil/preprocessor/OutputStreamProcessor.java new file mode 100644 index 0000000..c932925 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/preprocessor/OutputStreamProcessor.java @@ -0,0 +1,24 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.preprocessor; + +import com.google.gson.JsonObject; +import java.io.IOException; +import java.io.OutputStream; + + +/** + * a base class for dynamic OutputStream preprocessor + */ +abstract public class OutputStreamProcessor implements StreamProcessor { + protected JsonObject parameters; + + public OutputStreamProcessor(JsonObject params) { + this.parameters = params; + } + abstract public OutputStream process(OutputStream origStream) throws IOException; + + abstract public String convertFileName(String fileName); +} diff --git a/dil/src/main/java/com/linkedin/dil/preprocessor/StreamProcessor.java b/dil/src/main/java/com/linkedin/dil/preprocessor/StreamProcessor.java new file mode 100644 index 0000000..32b38fc --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/preprocessor/StreamProcessor.java @@ -0,0 +1,18 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
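The stream preprocessors above are meant to be stacked: each InputStreamProcessor wraps the stream produced by the previous one, and convertFileName() keeps the logical file name in step with each transformation. A minimal sketch, assuming a GPG-encrypted, gzipped file and a hypothetical gpgParams object carrying the key material:

    List<InputStreamProcessor> chain = Lists.newArrayList(
        new GpgDecryptProcessor(gpgParams),
        new GunzipProcessor(new JsonObject()));
    InputStream stream = rawInputStream;          // hypothetical stream from the connection
    String fileName = "data.json.gz.gpg";
    for (InputStreamProcessor p : chain) {
      stream = p.process(stream);                 // decrypt first, then gunzip
      fileName = p.convertFileName(fileName);     // "data.json.gz", then "data.json"
    }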
+ +package com.linkedin.dil.preprocessor; + +import java.io.Closeable; +import java.io.IOException; + + +/** + * A common interface for all preprocessors + * + * @param can be either InputStream or OutputStream + */ +public interface StreamProcessor { + T process(T origin) throws IOException; +} diff --git a/dil/src/main/java/com/linkedin/dil/source/HdfsSource.java b/dil/src/main/java/com/linkedin/dil/source/HdfsSource.java new file mode 100644 index 0000000..8319c33 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/source/HdfsSource.java @@ -0,0 +1,57 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.source; + +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.gobblin.configuration.State; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.connection.HdfsConnection; +import com.linkedin.dil.extractor.MultistageExtractor; +import com.linkedin.dil.keys.HdfsKeys; +import org.apache.gobblin.source.extractor.Extractor; + + +/** + * This class supports HDFS as just another protocol. The main function + * of it is to launch a proper extractor with a HdfsConnection + */ +@Slf4j +public class HdfsSource extends MultistageSource { + @Getter @Setter + private HdfsKeys hdfsKeys; + + public HdfsSource() { + hdfsKeys = new HdfsKeys(); + jobKeys = hdfsKeys; + } + + protected void initialize(State state) { + super.initialize(state); + hdfsKeys.logUsage(state); + hdfsKeys.logDebugAll(); + } + + /** + * Create extractor based on the input WorkUnitState, the extractor.class + * configuration, and a new HdfsConnection + * + * @param state WorkUnitState passed in from Gobblin framework + * @return the MultistageExtractor object + */ + + @Override + public Extractor getExtractor(WorkUnitState state) { + initialize(state); + MultistageExtractor extractor = + (MultistageExtractor) super.getExtractor(state); + extractor.setConnection(new HdfsConnection(state, hdfsKeys, extractor.getExtractorKeys())); + return extractor; + + } +} diff --git a/dil/src/main/java/com/linkedin/dil/source/HttpSource.java b/dil/src/main/java/com/linkedin/dil/source/HttpSource.java new file mode 100644 index 0000000..61f2428 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/source/HttpSource.java @@ -0,0 +1,186 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
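HdfsSource above shows the wiring pattern every concrete source in this change repeats: the constructor creates a protocol-specific keys object and points jobKeys at it, initialize(state) parses and logs the configuration, and getExtractor(state) asks the base class for the extractor before binding the protocol connection to it. A hypothetical subclass for a new protocol would follow the same template (FtpKeys and FtpConnection are made-up names):

    public class FtpSource extends MultistageSource<Schema, GenericRecord> {
      private final FtpKeys ftpKeys = new FtpKeys();

      public FtpSource() {
        jobKeys = ftpKeys;
      }

      @Override
      public Extractor<Schema, GenericRecord> getExtractor(WorkUnitState state) {
        initialize(state);
        MultistageExtractor<Schema, GenericRecord> extractor =
            (MultistageExtractor<Schema, GenericRecord>) super.getExtractor(state);
        extractor.setConnection(new FtpConnection(state, ftpKeys, extractor.getExtractorKeys()));
        return extractor;
      }
    }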
+ +package com.linkedin.dil.source; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableMap; +import com.google.gson.Gson; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.reflect.TypeToken; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.commons.codec.binary.Base64; +import org.apache.gobblin.configuration.State; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.connection.HttpConnection; +import com.linkedin.dil.extractor.MultistageExtractor; +import com.linkedin.dil.keys.HttpKeys; +import com.linkedin.dil.util.EncryptionUtils; +import org.apache.gobblin.source.extractor.Extractor; + + +/** + * + * HttpSource is a generic Gobblin source connector for HTTP based data sources including + * Rest API + * + * @author chrli + */ +@Slf4j +@SuppressWarnings("unchecked") +public class HttpSource extends MultistageSource { + private final static Gson GSON = new Gson(); + private final static String BASIC_TOKEN_PREFIX = "Basic"; + private final static String BEARER_TOKEN_PREFIX = "Bearer"; + final static String OAUTH_TOKEN_PREFIX = "OAuth"; + final static String TOKEN_PREFIX_SEPARATOR = " "; + @VisibleForTesting + + @Getter @Setter + private HttpKeys httpSourceKeys; + + public HttpSource() { + httpSourceKeys = new HttpKeys(); + jobKeys = httpSourceKeys; + } + + protected void initialize(State state) { + super.initialize(state); + httpSourceKeys.logUsage(state); + httpSourceKeys.setHttpRequestHeaders(getRequestHeader(state)); + httpSourceKeys.setHttpRequestMethod(MultistageProperties.MSTAGE_HTTP_REQUEST_METHOD.getProp(state)); + httpSourceKeys.setAuthentication(MultistageProperties.MSTAGE_AUTHENTICATION.getValidNonblankWithDefault(state)); + httpSourceKeys.setHttpRequestHeadersWithAuthentication(getHeadersWithAuthentication(state)); + httpSourceKeys.setHttpStatuses(getHttpStatuses(state)); + httpSourceKeys.setHttpStatusReasons(getHttpStatusReasons(state)); + httpSourceKeys.logDebugAll(); + } + + /** + * Create extractor based on the input WorkUnitState, the extractor.class + * configuration, and a new HttpConnection + * + * @param state WorkUnitState passed in from Gobblin framework + * @return the MultistageExtractor object + */ + @Override + public Extractor getExtractor(WorkUnitState state) { + initialize(state); + MultistageExtractor extractor = + (MultistageExtractor) super.getExtractor(state); + extractor.setConnection(new HttpConnection(state, this.httpSourceKeys, extractor.getExtractorKeys())); + return extractor; + } + + /** + * Support: + * Basic Http Authentication + * Bearer token with Authorization header only, not including access_token in URI or Entity Body + * + * see Bearer token reference: https://tools.ietf.org/html/rfc6750 + * + * @param state source state + * @return header tag with proper encryption of tokens + */ + @VisibleForTesting + Map getAuthenticationHeader(State state) { + if (httpSourceKeys.getAuthentication().entrySet().size() == 0) { + return new HashMap<>(); + } + + String authMethod = httpSourceKeys.getAuthentication().get("method").getAsString(); + if (!authMethod.toLowerCase().matches("basic|bearer|oauth|custom")) { + 
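+      // Expected shape of the authentication object (hypothetical values), as read by the
+      // rest of this method: {"method": "bearer", "header": "Authorization",
+      // "encryption": "base64", "token": "my-secret"}; when "token" is absent, the
+      // username:password pair is used instead. Any method outside basic|bearer|oauth|custom
+      // ends up in this warning branch.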
log.warn("Unsupported authentication type: " + authMethod); + return new HashMap<>(); + } + + String token = ""; + if (httpSourceKeys.getAuthentication().has("token")) { + token = EncryptionUtils.decryptGobblin(httpSourceKeys.getAuthentication().get("token").getAsString(), state); + } else { + String u = EncryptionUtils.decryptGobblin(MultistageProperties.SOURCE_CONN_USERNAME.getProp(state), state); + String p = EncryptionUtils.decryptGobblin(MultistageProperties.SOURCE_CONN_PASSWORD.getProp(state), state); + token = u + ":" + p; + } + + if (httpSourceKeys.getAuthentication().get("encryption").getAsString().equalsIgnoreCase("base64")) { + token = new String(Base64.encodeBase64((token).getBytes(StandardCharsets.UTF_8)), + StandardCharsets.UTF_8); + } + + String header = ""; + if (authMethod.equalsIgnoreCase(BASIC_TOKEN_PREFIX)) { + header = BASIC_TOKEN_PREFIX + TOKEN_PREFIX_SEPARATOR + token; + } else if (authMethod.equalsIgnoreCase(BEARER_TOKEN_PREFIX)) { + header = BEARER_TOKEN_PREFIX + TOKEN_PREFIX_SEPARATOR + token; + } else if (authMethod.equalsIgnoreCase(OAUTH_TOKEN_PREFIX)) { + header = OAUTH_TOKEN_PREFIX + TOKEN_PREFIX_SEPARATOR + token; + } else { + header = token; + } + return new ImmutableMap.Builder().put(httpSourceKeys.getAuthentication().get("header").getAsString(), header).build(); + } + + private Map getHeadersWithAuthentication(State state) { + Map headers = toStringStringMap(httpSourceKeys.getHttpRequestHeaders()); + headers.putAll(getAuthenticationHeader(state)); + return headers; + } + + private Map toStringStringMap(JsonObject json) { + return GSON.fromJson(json.toString(), + new TypeToken>() { }.getType()); + } + + private Map> getHttpStatuses(State state) { + Map> statuses = new HashMap<>(); + JsonObject jsonObject = MultistageProperties.MSTAGE_HTTP_STATUSES.getValidNonblankWithDefault(state); + for (Map.Entry entry: jsonObject.entrySet()) { + String key = entry.getKey(); + JsonElement value = jsonObject.get(key); + if (!value.isJsonNull() && value.isJsonArray()) { + statuses.put(key, GSON.fromJson(value.toString(), new TypeToken>() { }.getType())); + } + } + return statuses; + } + + private Map> getHttpStatusReasons(State state) { + Map> reasons = new HashMap<>(); + JsonObject jsonObject = MultistageProperties.MSTAGE_HTTP_STATUS_REASONS.getValidNonblankWithDefault(state); + for (Map.Entry entry: jsonObject.entrySet()) { + String key = entry.getKey(); + JsonElement value = jsonObject.get(key); + if (!value.isJsonNull() && value.isJsonArray()) { + reasons.put(key, GSON.fromJson(value.toString(), new TypeToken>() { }.getType())); + } + } + return reasons; + } + + /** + * read the ms.http.request.headers and decrypt values if encrypted + * @param state the source state + * @return the decrypted http request headers + */ + private JsonObject getRequestHeader(State state) { + JsonObject headers = MultistageProperties.MSTAGE_HTTP_REQUEST_HEADERS.getValidNonblankWithDefault(state); + JsonObject decrypted = new JsonObject(); + for (Map.Entry entry: headers.entrySet()) { + String key = entry.getKey(); + decrypted.addProperty(key, EncryptionUtils.decryptGobblin(headers.get(key).getAsString(), state)); + } + return decrypted; + } +} \ No newline at end of file diff --git a/dil/src/main/java/com/linkedin/dil/source/JdbcSource.java b/dil/src/main/java/com/linkedin/dil/source/JdbcSource.java new file mode 100644 index 0000000..7974256 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/source/JdbcSource.java @@ -0,0 +1,70 @@ +// Copyright 2021 LinkedIn Corporation. 
All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.source; + +import java.sql.Connection; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.gobblin.configuration.State; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.connection.JdbcConnection; +import com.linkedin.dil.extractor.MultistageExtractor; +import com.linkedin.dil.keys.JdbcKeys; +import com.linkedin.dil.util.CsvUtils; +import org.apache.gobblin.source.extractor.Extractor; + + +/*** + * JdbcSource handles JDBC protocol + * + */ + +@Slf4j +public class JdbcSource extends MultistageSource { + @Setter + private ConcurrentMap memberConnections = new ConcurrentHashMap<>(); + private JdbcKeys jdbcSourceKeys = null; + + public JdbcSource() { + jdbcSourceKeys = new JdbcKeys(); + jobKeys = jdbcSourceKeys; + } + + protected void initialize(State state) { + super.initialize(state); + jdbcSourceKeys.logUsage(state); + jdbcSourceKeys.setJdbcStatement(MultistageProperties.MSTAGE_JDBC_STATEMENT.getValidNonblankWithDefault(state)); + jdbcSourceKeys.setSeparator(CsvUtils.unescape(MultistageProperties.MSTAGE_CSV_SEPARATOR + .getValidNonblankWithDefault(state))); + jdbcSourceKeys.setQuoteCharacter(CsvUtils.unescape(MultistageProperties.MSTAGE_CSV_QUOTE_CHARACTER + .getValidNonblankWithDefault(state))); + jdbcSourceKeys.setEscapeCharacter(CsvUtils.unescape(MultistageProperties.MSTAGE_CSV_ESCAPE_CHARACTER + .getValidNonblankWithDefault(state))); + jdbcSourceKeys.setSchemaRefactorFunction(MultistageProperties.MSTAGE_JDBC_SCHEMA_REFACTOR + .getValidNonblankWithDefault(state)); + jdbcSourceKeys.logDebugAll(); + } + + /** + * Create extractor based on the input WorkUnitState, the extractor.class + * configuration, and a new JdbcConnection + * + * @param state WorkUnitState passed in from Gobblin framework + * @return the MultistageExtractor object + */ + @Override + public Extractor getExtractor(WorkUnitState state) { + initialize(state); + MultistageExtractor extractor = + (MultistageExtractor) super.getExtractor(state); + extractor.setConnection(new JdbcConnection(state, this.jdbcSourceKeys, extractor.getExtractorKeys())); + return extractor; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/source/MultistageSource.java b/dil/src/main/java/com/linkedin/dil/source/MultistageSource.java new file mode 100644 index 0000000..be367e0 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/source/MultistageSource.java @@ -0,0 +1,515 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
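JdbcSource above follows the same planning/extraction split as the other sources: getWorkunits() (inherited from MultistageSource, shown below) performs the planning, and getExtractor() binds a JdbcConnection to each work unit. A simplified sketch of how the framework exercises it, assuming the state objects carry a valid job configuration:

    JdbcSource source = new JdbcSource();
    List<WorkUnit> workUnits = source.getWorkunits(new SourceState());        // planning phase
    WorkUnitState workUnitState = new WorkUnitState(workUnits.get(0));        // simplified; real runs carry job state
    Extractor<Schema, GenericRecord> extractor = source.getExtractor(workUnitState);  // binds a JdbcConnection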
+ +package com.linkedin.dil.source; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; +import com.google.common.primitives.Longs; +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import java.lang.reflect.Constructor; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; +import lombok.AccessLevel; +import lombok.Getter; +import lombok.Setter; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.gobblin.configuration.ConfigurationKeys; +import org.apache.gobblin.configuration.SourceState; +import org.apache.gobblin.configuration.State; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.extractor.MultistageExtractor; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.util.EndecoUtils; +import com.linkedin.dil.util.HdfsReader; +import com.linkedin.dil.util.WatermarkDefinition; +import org.apache.gobblin.source.extractor.Extractor; +import org.apache.gobblin.source.extractor.WatermarkInterval; +import org.apache.gobblin.source.extractor.extract.AbstractSource; +import org.apache.gobblin.source.extractor.extract.LongWatermark; +import org.apache.gobblin.source.workunit.Extract; +import org.apache.gobblin.source.workunit.WorkUnit; +import org.joda.time.DateTime; +import org.testng.Assert; + +import static com.linkedin.dil.configuration.StaticConstants.*; + + +/** + * This is the base Source class of multi-stage connectors. + * + * MultistageSource, like other Gobblin Source classes, is responsible + * for planning. Specifically MultistageSource has following functions: + * + * 1. Generate work units when called by Gobblin Framework + * 2. Instantiate Extractors + * + * Gobblin first instantiate a MultistageSource from one of its sub-classes, + * then calls the getWorkUnits() method. The input to getWorkUnits() is SourceState. + * + * After getting a list of work units, Gobblin will instantiate again one + * MultistageSource from one of its sub-classes for each of the work unit, + * and then call the getExtractor() method on each instance. The input to + * getExtractor() is WorkUnitState. 
+ * + * @author chrli + * @param The schema class + * @param The data class + */ + +@Slf4j +@SuppressWarnings("unchecked") +public class MultistageSource extends AbstractSource { + final static private Gson GSON = new Gson(); + final static private String PROPERTY_SEPARATOR = "."; + final static private String DUMMY_DATETIME_WATERMARK_START = "2019-01-01"; + final static private String CURRENT_DATE_SYMBOL = "-"; + final static private String ACTIVATION_WATERMARK_NAME = "activation"; + // Avoid too many partition created from misconfiguration, Months * Days * Hours + final private static int MAX_DATETIME_PARTITION = 3 * 30 * 24; + + @Getter(AccessLevel.PUBLIC) + @Setter(AccessLevel.MODULE) + protected SourceState sourceState = null; + @Getter(AccessLevel.PUBLIC) + @Setter(AccessLevel.PUBLIC) + JobKeys jobKeys = new JobKeys(); + @Getter(AccessLevel.PUBLIC) + @Setter(AccessLevel.MODULE) + + final private ConcurrentHashMap, WorkUnitState> extractorState = + new ConcurrentHashMap<>(); + + protected void initialize(State state) { + jobKeys.initialize(state); + } + + /** + * getWorkUnits() is the first place to receive the Source State object, therefore + * initialization of parameters cannot be complete in constructor. + */ + @SneakyThrows + @Override + public List getWorkunits(SourceState state) { + sourceState = state; + initialize(state); + + if (!jobKeys.validate(state)) { + return new ArrayList<>(); + } + + // Parse watermark settings if defined + List definedWatermarks = Lists.newArrayList(); + for (JsonElement definitionJson : jobKeys.getWatermarkDefinition()) { + Assert.assertTrue(definitionJson.isJsonObject()); + definedWatermarks.add(new WatermarkDefinition( + definitionJson.getAsJsonObject(), jobKeys.getIsPartialPartition(), + jobKeys.getWorkUnitPartitionType())); + } + + Map secondaryInputs = readSecondaryInputs(sourceState, jobKeys.getRetryCount()); + JsonArray authentications = secondaryInputs.get(KEY_WORD_AUTHENTICATION); + JsonArray activations = secondaryInputs.computeIfAbsent(KEY_WORD_ACTIVATION, x -> new JsonArray()); + JsonArray payloads = secondaryInputs.computeIfAbsent(KEY_WORD_PAYLOAD, x -> new JsonArray()); + + if (activations.size() == 0 && payloads.size() != 0) { + JsonObject simpleActivation = new JsonObject(); + activations.add(simpleActivation); + } + + if (activations.size() > 0) { + definedWatermarks.add(new WatermarkDefinition(ACTIVATION_WATERMARK_NAME, activations)); + } + + // get previous high watermarks by each watermark partition or partition combinations + // if there are multiple partitioned watermarks, such as one partitioned datetime watermark + // and one partitioned activation (unit) watermark + Map previousHighWatermarks = getPreviousHighWatermarks(); + state.setProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY, checkFullExtractState(state, previousHighWatermarks)); + + // generated work units based on watermarks defined and previous high watermarks + List wuList = generateWorkUnits(definedWatermarks, previousHighWatermarks); + if (authentications != null && authentications.size() == 1) { + for (WorkUnit wu : wuList) { + wu.setProp(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.toString(), + getUpdatedWorkUnitActivation(wu, authentications.get(0).getAsJsonObject())); + + // unlike activation secondary inputs, payloads will be processed in each work unit + // and payloads will not be loaded until the Connection executes the command + wu.setProp(MultistageProperties.MSTAGE_PAYLOAD_PROPERTY.toString(), payloads); + } + } + return wuList; + } + + /** + * reads 
the multistage source to get the secondary input categories - authentication and activation + * In case the token is missing, it will retry accessing the tokens as per the retry parameters + * ("delayInSec", "retryCount") + */ + private Map readSecondaryInputs(State state, final long retries) + throws InterruptedException { + log.info("Trying to read secondary input with retry = {}", retries); + Map secondaryInputs = readContext(state); + + // Check if authentication is ready, and if not, whether retry is required + JsonArray authentications = secondaryInputs.get(KEY_WORD_AUTHENTICATION); + if ((authentications == null || authentications.size() == 0) + && jobKeys.getIsSecondaryAuthenticationEnabled() && retries > 0) { + log.info("Authentication tokens are expected from secondary input, but not ready"); + log.info("Will wait for {} seconds and then retry reading the secondary input", jobKeys.getRetryDelayInSec()); + TimeUnit.SECONDS.sleep(jobKeys.getRetryDelayInSec()); + return readSecondaryInputs(state, retries - 1); + } + log.info("Successfully read secondary input, no more retry"); + return secondaryInputs; + } + + /** + * Default multi-stage source behavior, each protocol shall override this with more concrete function + * @param state WorkUnitState passed in from Gobblin framework + * @return an MultistageExtractor instance + */ + @Override + public Extractor getExtractor(WorkUnitState state) { + try { + ClassLoader loader = this.getClass().getClassLoader(); + Class extractorClass = loader.loadClass(MultistageProperties.MSTAGE_EXTRACTOR_CLASS.getValidNonblankWithDefault(state)); + Constructor> constructor = (Constructor>) + extractorClass.getConstructor(WorkUnitState.class, JobKeys.class); + MultistageExtractor extractor = (MultistageExtractor) constructor.newInstance(state, this.jobKeys); + extractorState.put(extractor, state); + extractor.setConnection(null); + return extractor; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + /** + * provide a default implementation + * @param state Source State + */ + @Override + public void shutdown(SourceState state) { + log.info("MultistageSource Shutdown() called, instructing extractors to close connections"); + for (MultistageExtractor extractor: extractorState.keySet()) { + extractor.closeConnection(); + } + } + + List generateWorkUnits(List definitions, Map previousHighWatermarks) { + Assert.assertNotNull(sourceState); + WatermarkDefinition datetimeWatermark = null; + WatermarkDefinition unitWatermark = null; + + for (WatermarkDefinition wmd: definitions) { + if (wmd.getType() == WatermarkDefinition.WatermarkTypes.DATETIME) { + if (datetimeWatermark != null) { + throw new RuntimeException("1 and only datetime type watermark is allowed."); + } + datetimeWatermark = wmd; + } + if (wmd.getType() == WatermarkDefinition.WatermarkTypes.UNIT) { + if (unitWatermark != null) { + throw new RuntimeException("1 and only unit type watermark is allowed" + + ", including the unit watermark generated from secondary input."); + } + unitWatermark = wmd; + } + } + // Set default unit watermark + if (unitWatermark == null) { + JsonArray unitArray = new JsonArray(); + unitArray.add(new JsonObject()); + unitWatermark = new WatermarkDefinition("unit", unitArray); + } + // Set default datetime watermark + if (datetimeWatermark == null) { + datetimeWatermark = new WatermarkDefinition("datetime", DUMMY_DATETIME_WATERMARK_START, CURRENT_DATE_SYMBOL); + } + + List workUnits = new ArrayList<>(); + Extract extract = 
createExtractObject(checkFullExtractState(sourceState, previousHighWatermarks)); + List> datetimePartitions = getDatetimePartitions(datetimeWatermark.getRangeInDateTime()); + List unitPartitions = unitWatermark.getUnits(); + + JsonArray watermarkGroups = new JsonArray(); + String datetimeWatermarkName = datetimeWatermark.getLongName(); + String unitWatermarkName = unitWatermark.getLongName(); + watermarkGroups.add(datetimeWatermarkName); + watermarkGroups.add(unitWatermarkName); + + // only create work unit when the high range is greater than cutoff time + // cutoff time is moved backward by GRACE_PERIOD + // cutoff time is moved forward by ABSTINENT_PERIOD + Long cutoffTime = previousHighWatermarks.size() == 0 ? -1 : Collections.max(previousHighWatermarks.values()) + - MultistageProperties.MSTAGE_GRACE_PERIOD_DAYS.getMillis(sourceState); + log.debug("Overall cutoff time: {}", cutoffTime); + + for (ImmutablePair dtPartition : datetimePartitions) { + log.debug("dtPartition: {}", dtPartition); + for (String unitPartition: unitPartitions) { + + // adding the date time partition and unit partition combination to work units until + // it reaches ms.work.unit.parallelism.max. a combination is not added if its prior + // watermark doesn't require a rerun. + // a work unit signature is a date time partition and unit partition combination. + if (MultistageProperties.MSTAGE_WORK_UNIT_PARALLELISM_MAX.validateNonblank(sourceState) + && workUnits.size() >= (Integer) MultistageProperties.MSTAGE_WORK_UNIT_PARALLELISM_MAX.getProp(sourceState)) { + break; + } + + // each work unit has a watermark since we use dataset.urn to track state + // and the work unit can be uniquely identified by its signature + String wuSignature = getWorkUnitSignature( + datetimeWatermarkName, dtPartition.getLeft(), + unitWatermarkName, unitPartition); + log.debug("Checking work unit: {}", wuSignature); + + // if a work unit exists in state store, manage its watermark independently + long unitCutoffTime = -1L; + if (previousHighWatermarks.containsKey(wuSignature)) { + unitCutoffTime = previousHighWatermarks.get(wuSignature) + - MultistageProperties.MSTAGE_GRACE_PERIOD_DAYS.getMillis(sourceState) + + MultistageProperties.MSTAGE_ABSTINENT_PERIOD_DAYS.getMillis(sourceState); + } + log.debug(String.format("previousHighWatermarks.get(wuSignature): %s, unitCutoffTime: %s", + previousHighWatermarks.get(wuSignature), unitCutoffTime)); + + // for a dated work unit partition, we only need to redo it when its previous + // execution was not successful + // for recent work unit partitions, we might need to re-extract based on + // grace period logic, which is controlled by cut off time + if (unitCutoffTime == -1L + || dtPartition.getRight() >= Longs.max(unitCutoffTime, cutoffTime)) { + // prune the date range only if it is not partitioned + // note the nominal date range low boundary had been saved in signature + ImmutablePair dtPartitionModified = dtPartition; + if (datetimePartitions.size() == 1 && dtPartition.left < cutoffTime) { + dtPartitionModified = new ImmutablePair<>(cutoffTime, dtPartition.right); + } + log.debug("dtPartitionModified: {}", dtPartitionModified); + + log.info("Generating Work Unit: {}, watermark: {}", wuSignature, dtPartitionModified); + WorkUnit workUnit = WorkUnit.create(extract, + new WatermarkInterval( + new LongWatermark(dtPartitionModified.getLeft()), + new LongWatermark(dtPartitionModified.getRight()))); + + // save work unit signature for identification + // because each dataset URN key will have a state file 
on Hadoop, it cannot contain path separator + workUnit.setProp(MultistageProperties.MSTAGE_WATERMARK_GROUPS.toString(), + watermarkGroups.toString()); + workUnit.setProp(MultistageProperties.DATASET_URN_KEY.toString(), EndecoUtils.getHadoopFsEncoded(wuSignature)); + + // save the lower number of datetime watermark partition and the unit watermark partition + workUnit.setProp(datetimeWatermarkName, dtPartition.getLeft()); + workUnit.setProp(unitWatermarkName, unitPartition); + + workUnit.setProp(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.toString(), unitPartition); + workUnit.setProp(MultistageProperties.MSTAGE_WORKUNIT_STARTTIME_KEY.toString(), + DateTime.now().getMillis() + + workUnits.size() * MultistageProperties.MSTAGE_WORK_UNIT_PACING_SECONDS.getMillis(sourceState)); + + if (!MultistageProperties.MSTAGE_OUTPUT_SCHEMA.validateNonblank(sourceState) + && this.jobKeys.hasOutputSchema()) { + // populate the output schema read from URN reader to sub tasks + // so that the URN reader will not be called again + log.info("Populating output schema to work units:"); + log.info("Output schema: {}", this.jobKeys.getOutputSchema().toString()); + workUnit.setProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), + this.jobKeys.getOutputSchema().toString()); + + // populate the target schema read from URN reader to sub tasks + // so that the URN reader will not be called again + log.info("Populating target schema to work units:"); + log.info("Target schema: {}", jobKeys.getTargetSchema().toString()); + workUnit.setProp(MultistageProperties.MSTAGE_TARGET_SCHEMA.getConfig(), + jobKeys.getTargetSchema().toString()); + } + workUnits.add(workUnit); + } + } + } + return workUnits; + } + + /** + * breaks a date time range to smaller partitions per WORK_UNIT_PARTITION property setting + * if too many partitions created, truncate to the maximum partitions allowed + * @param datetimeRange a range of date time values + * @return a list of data time ranges in milliseconds + */ + private List> getDatetimePartitions(ImmutablePair datetimeRange) { + List> partitions = Lists.newArrayList(); + if (jobKeys.getWorkUnitPartitionType() != null) { + partitions = jobKeys.getWorkUnitPartitionType().getRanges(datetimeRange, + MultistageProperties.MSTAGE_WORK_UNIT_PARTIAL_PARTITION.getValidNonblankWithDefault(sourceState)); + } else { + partitions.add(new ImmutablePair<>(datetimeRange.getLeft().getMillis(), datetimeRange.getRight().getMillis())); + } + // Safety check if too many partitions created + if (partitions.size() > MAX_DATETIME_PARTITION) { + // Preserve the last N partitions + partitions = partitions.subList(partitions.size() - MAX_DATETIME_PARTITION, partitions.size()); + log.warn("Too many partitions, created {}, only processing the last {}", partitions.size(), MAX_DATETIME_PARTITION); + } + return partitions; + } + + /** + * Read preceding job output, and return as a JsonArray. + * + * The location of preceding job output and fields of selection are + * configured in parameter multistagesource.secondary.input, which should + * have a path element and fields element. The path element shall contain + * a list of input paths, and the fields element contains columns to be + * returned. + * + * Assume this cannot be a Json primitive, and return null if so. 
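+ *
+ * For illustration, one secondary input entry could look like the following;
+ * the path and field names are hypothetical:
+ * <pre>{@code
+ *   [{"path": "/jobs/hypothetical/previous-stage/output",
+ *     "fields": ["id", "token"],
+ *     "category": "authentication"}]
+ * }</pre>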
+ * + * @return a set of JsonArrays of data read from locations specified in SECONDARY_INPUT + * property organized by category, in a Map structure + */ + private Map readContext(State state) { + Map secondaryInputs = new HashMap<>(); + for (JsonElement entry: jobKeys.getSecondaryInputs()) { + if (!entry.getAsJsonObject().has(KEY_WORD_PATH)) { + continue; + } + + String category = entry.getAsJsonObject().has(KEY_WORD_CATEGORY) + ? entry.getAsJsonObject().get(KEY_WORD_CATEGORY).getAsString() + : KEY_WORD_ACTIVATION; + + JsonArray categoryData = secondaryInputs.computeIfAbsent(category, x -> new JsonArray()); + if (category.equalsIgnoreCase(KEY_WORD_ACTIVATION) || category.equalsIgnoreCase(KEY_WORD_AUTHENTICATION)) { + categoryData.addAll(new HdfsReader(state).readSecondary(entry.getAsJsonObject())); + } + + if (entry.getAsJsonObject().has(KEY_WORD_PATH) && category.equalsIgnoreCase(KEY_WORD_PAYLOAD)) { + categoryData.add(entry); + } + + } + return secondaryInputs; + } + + /** + * Get all previous highest high watermarks, by dataset URN. If a dataset URN + * had multiple work units, the highest high watermark is retrieved for that + * dataset URN. + * + * @return the previous highest high watermarks by dataset URN + */ + private Map getPreviousHighWatermarks() { + Map watermarks = new HashMap<>(); + Map> wuStates = sourceState.getPreviousWorkUnitStatesByDatasetUrns(); + for (Map.Entry> entry: wuStates.entrySet()) { + Long highWatermark = Collections.max(Lists.newArrayList(entry.getValue().iterator()).stream() + .map(s -> s.getActualHighWatermark(LongWatermark.class).getValue()) + .collect(Collectors.toList())); + + // Unit watermarks might contain encoded file separator, + // in such case, we will decode the watermark name so that it can be compared with + // work unit signatures + log.debug("Dataset Signature: {}, High Watermark: {}", EndecoUtils.getHadoopFsDecoded(entry.getKey()), highWatermark); + watermarks.put(EndecoUtils.getHadoopFsDecoded(entry.getKey()), highWatermark); + } + return ImmutableMap.copyOf(watermarks); + } + + Extract createExtractObject(final boolean isFull) { + Extract extract = createExtract( + Extract.TableType.valueOf(MultistageProperties.EXTRACT_TABLE_TYPE_KEY.getValidNonblankWithDefault(sourceState)), + MultistageProperties.EXTRACT_NAMESPACE_NAME_KEY.getProp(sourceState), + MultistageProperties.EXTRACT_TABLE_NAME_KEY.getProp(sourceState)); + extract.setProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY, isFull); + return extract; + } + + private String getWorkUnitSignature( + String datetimeWatermarkName, + Long datetimePartition, + String unitWatermarkName, + String unitPartition) { + List list = Lists.newArrayList(datetimeWatermarkName + PROPERTY_SEPARATOR + datetimePartition, + unitWatermarkName + PROPERTY_SEPARATOR + unitPartition); + return list.toString(); + } + + /** + * retrieve the authentication data from secondary input + * TODO there is a slight inefficiency here + * @param retries number of retries remaining + * @return the authentication JsonObject + */ + @SneakyThrows + protected JsonObject readSecondaryAuthentication(State state, final long retries) { + Map secondaryInputs = readSecondaryInputs(state, retries); + if (secondaryInputs.containsKey(KEY_WORD_ACTIVATION) + && secondaryInputs.get(KEY_WORD_AUTHENTICATION).isJsonArray() + && secondaryInputs.get(KEY_WORD_AUTHENTICATION).getAsJsonArray().size() > 0) { + return secondaryInputs.get(KEY_WORD_AUTHENTICATION).get(0).getAsJsonObject(); + } + return new JsonObject(); + } + + /** + * This updates the 
activation properties of the work unit if a new authentication token + * become available + * @param wu the work unit configuration + * @param authentication the authentication token from, usually, the secondary input + * @return the updated work unit configuration + */ + protected String getUpdatedWorkUnitActivation(WorkUnit wu, JsonObject authentication) { + log.debug("Activation property (origin): {}", wu.getProp(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.toString(), "")); + if (!wu.getProp(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.toString(), StringUtils.EMPTY).isEmpty()) { + JsonObject existing = GSON.fromJson(wu.getProp(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.toString()), JsonObject.class); + for (Map.Entry entry: authentication.entrySet()) { + existing.remove(entry.getKey()); + existing.addProperty(entry.getKey(), entry.getValue().getAsString()); + } + log.debug("Activation property (modified): {}", existing.toString()); + return existing.toString(); + } + log.debug("Activation property (new): {}", authentication.toString()); + return authentication.toString(); + } + + /** + * Check if a full extract is needed + * @param state source state + * @param previousHighWatermarks existing high watermarks + * @return true if all conditions met for a full extract, otherwise false + */ + private boolean checkFullExtractState(final State state, final Map previousHighWatermarks) { + if (MultistageProperties.EXTRACT_TABLE_TYPE_KEY.getValidNonblankWithDefault(state).toString() + .equalsIgnoreCase(KEY_WORD_SNAPSHOT_ONLY)) { + return true; + } + + if (MultistageProperties.MSTAGE_ENABLE_DYNAMIC_FULL_LOAD.getValidNonblankWithDefault(state)) { + if (previousHighWatermarks.isEmpty()) { + return true; + } + } + + return state.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY, false); + } +} diff --git a/dil/src/main/java/com/linkedin/dil/source/S3SourceV2.java b/dil/src/main/java/com/linkedin/dil/source/S3SourceV2.java new file mode 100644 index 0000000..65eee49 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/source/S3SourceV2.java @@ -0,0 +1,122 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.source; + +import com.google.common.base.Joiner; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import com.google.gson.JsonObject; +import java.util.HashSet; +import java.util.List; +import java.util.stream.Collectors; +import lombok.Getter; +import lombok.extern.slf4j.Slf4j; +import okhttp3.HttpUrl; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.gobblin.configuration.State; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.connection.S3Connection; +import com.linkedin.dil.extractor.MultistageExtractor; +import com.linkedin.dil.keys.S3Keys; +import com.linkedin.dil.util.EndecoUtils; +import org.apache.gobblin.source.extractor.Extractor; +import software.amazon.awssdk.regions.Region; + + +@Slf4j +public class S3SourceV2 extends MultistageSource { + private static final String KEY_REGION = "region"; + private static final String KEY_CONNECTION_TIMEOUT = "connection_timeout"; + private static final HashSet S3_REGIONS_SET = + Region.regions().stream().map(region -> region.toString()).collect(Collectors.toCollection(HashSet::new)); + @Getter + private S3Keys s3SourceV2Keys = new S3Keys(); + + public S3SourceV2() { + s3SourceV2Keys = new S3Keys(); + jobKeys = s3SourceV2Keys; + } + protected void initialize(State state) { + super.initialize(state); + s3SourceV2Keys.logUsage(state); + HttpUrl url = HttpUrl.parse(MultistageProperties.MSTAGE_SOURCE_URI.getValidNonblankWithDefault(state)); + if (url == null || url.host().isEmpty()) { + throw new RuntimeException("Incorrect configuration in " + + MultistageProperties.MSTAGE_SOURCE_URI.toString()); + } + + // set region, note that aws SDK won't raise an error here if region is invalid, + // later on, an exception will be raised when the actual request is issued + JsonObject parameters = MultistageProperties.MSTAGE_SOURCE_S3_PARAMETERS.getValidNonblankWithDefault(state); + if (parameters.has(KEY_REGION)) { + String region = parameters.get(KEY_REGION).getAsString(); + if (!S3_REGIONS_SET.contains(region)) { + throw new IllegalArgumentException(region + " is not a valid S3 region."); + } + s3SourceV2Keys.setRegion(Region.of(region)); + } else { + // Default to us-west-2 + s3SourceV2Keys.setRegion(Region.US_WEST_2); + } + + // set S3 connection timeout, non-positive integers are rejected + if (parameters.has(KEY_CONNECTION_TIMEOUT)) { + int connectionTimeout = parameters.get(KEY_CONNECTION_TIMEOUT).getAsInt(); + if (connectionTimeout <= 0) { + throw new IllegalArgumentException(connectionTimeout + " is not a valid timeout value."); + } + s3SourceV2Keys.setConnectionTimeout(connectionTimeout); + } + + // separate the endpoint, which should be a URL without bucket name, from the domain name + s3SourceV2Keys.setEndpoint("https://" + getEndpointFromHost(url.host())); + + // URL path might have variables, by default HttpUrl will encode '{' and '}' + // Here we decode those back to their plain form + s3SourceV2Keys.setPrefix(EndecoUtils.decode(url.encodedPath().substring(1))); + + // separate the bucket name from URI domain name + s3SourceV2Keys.setBucket(url.host().split("\\.")[0]); + + s3SourceV2Keys.setFilesPattern(MultistageProperties.MSTAGE_SOURCE_FILES_PATTERN.getProp(state)); + s3SourceV2Keys.setMaxKeys(MultistageProperties.MSTAGE_S3_LIST_MAX_KEYS.getValidNonblankWithDefault(state)); + 
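+
+    // Illustration with hypothetical values: if the source URI (MSTAGE_SOURCE_URI) were
+    // https://my-bucket.s3.amazonaws.com/myprefix/dir, the statements above would resolve
+    // the endpoint to https://s3.amazonaws.com, the bucket to "my-bucket", and the
+    // prefix to "myprefix/dir".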
s3SourceV2Keys.setAccessKey(MultistageProperties.SOURCE_CONN_USERNAME.getValidNonblankWithDefault(state)); + s3SourceV2Keys.setSecretId(MultistageProperties.SOURCE_CONN_PASSWORD.getValidNonblankWithDefault(state)); + s3SourceV2Keys.setTargetFilePattern( + MultistageProperties.MSTAGE_EXTRACTOR_TARGET_FILE_NAME.getValidNonblankWithDefault(state)); + s3SourceV2Keys.logDebugAll(); + } + + /** + * Create extractor based on the input WorkUnitState, the extractor.class + * configuration, and a new S3Connection + * + * @param state WorkUnitState passed in from Gobblin framework + * @return the MultistageExtractor object + */ + + @Override + public Extractor getExtractor(WorkUnitState state) { + initialize(state); + MultistageExtractor extractor = + (MultistageExtractor) super.getExtractor(state); + extractor.setConnection(new S3Connection(state, this.s3SourceV2Keys, extractor.getExtractorKeys())); + return extractor; + } + + /** + * split the host name, and remove the bucket name from the beginning, and return the rest + * @param host hostname with bucket name in the beginning + * @return the endpoint name without the bucket name + */ + private String getEndpointFromHost(String host) { + List segments = Lists.newArrayList(host.split("\\.")); + Preconditions.checkArgument(segments.size() > 1, "Host name format is incorrect"); + segments.remove(0); + return Joiner.on(".").join(segments); + } +} diff --git a/dil/src/main/java/com/linkedin/dil/util/AvroSchemaUtils.java b/dil/src/main/java/com/linkedin/dil/util/AvroSchemaUtils.java new file mode 100644 index 0000000..a236d94 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/AvroSchemaUtils.java @@ -0,0 +1,76 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.util; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.gobblin.configuration.WorkUnitState; +import org.apache.gobblin.converter.avro.JsonElementConversionFactory; +import org.apache.gobblin.converter.avro.UnsupportedDateTypeException; +import org.apache.gobblin.converter.json.JsonSchema; + + +public interface AvroSchemaUtils { + /** + * Utility method to convert JsonArray schema to avro schema + * @param schema of JsonArray type + * @return avro schema + * @throws UnsupportedDateTypeException + */ + static Schema fromJsonSchema(JsonArray schema, WorkUnitState state) throws UnsupportedDateTypeException { + JsonSchema jsonSchema = new JsonSchema(schema); + jsonSchema.setColumnName(state.getExtract().getTable()); + JsonElementConversionFactory.RecordConverter recordConverter = + new JsonElementConversionFactory.RecordConverter(jsonSchema, state, state.getExtract().getNamespace()); + return recordConverter.schema(); + } + + /** + * Utility method to extract field names from an avro schema + * @param schema avro schema + * @return List of field names + */ + static List getSchemaFieldNames(Schema schema) { + return schema.getFields().stream().map(Schema.Field::name).collect( + Collectors.toList()); + } + + /** + * Make a deep copy of a schema field + * @param field schema field + * @return copy of schema field + */ + static Schema.Field deepCopySchemaField(Schema.Field field) { + Schema.Field f = new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultVal(), field.order()); + field.getProps().forEach(f::addProp); + return f; + } + + /** + * Utility method to create record + * @param state work unit state to get namespace info + * @return a record with EOF + */ + static GenericRecord createEOF(WorkUnitState state) { + JsonArray eofSchema = new Gson() + .fromJson("[{\"columnName\":\"EOF\",\"isNullable\":\"false\",\"dataType\":{\"type\":\"string\"}}]", + JsonArray.class); + Schema schema = null; + try { + schema = fromJsonSchema(eofSchema, state); + } catch (UnsupportedDateTypeException e) { + // impossible, since the schema is fixed here and string type is supported + } + assert (schema != null); + GenericRecord eofRecord = new GenericData.Record(schema); + eofRecord.put("EOF", "EOF"); + return eofRecord; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/util/CsvUtils.java b/dil/src/main/java/com/linkedin/dil/util/CsvUtils.java new file mode 100644 index 0000000..cc1299e --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/CsvUtils.java @@ -0,0 +1,28 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import org.apache.commons.lang3.StringEscapeUtils; +import org.apache.commons.lang3.StringUtils; + + +public interface CsvUtils { + /** + * Method to convert string to supplement unicode values if exists. 
+ * Ex: input u005c will be converted to \\u005c + * @param value input string + * @return unicode value + */ + static String unescape(String value) { + if (value != null) { + value = value.toLowerCase(); + if (value.matches("^u[A-Fa-f0-9]{4}")) { + return StringEscapeUtils.unescapeJava("\\" + value); + } + return value; + } + return StringUtils.EMPTY; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/util/Database.java b/dil/src/main/java/com/linkedin/dil/util/Database.java new file mode 100644 index 0000000..3bdfe11 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/Database.java @@ -0,0 +1,40 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import com.google.common.base.Preconditions; +import java.net.URI; +import lombok.Getter; +import lombok.extern.slf4j.Slf4j; + + +/** + * a list of databases + */ +@Slf4j +public enum Database { + MYSQL("MySql", "com.mysql.cj.jdbc.Driver"), + SQLSERVER("SqlServer", "com.microsoft.sqlserver.jdbc.SQLServerDriver"), + ORACLE("Oracle", "oracle.jdbc.driver.OracleDriver"), + HSQLDB("HSqlDb", "org.hsqldb.jdbcDriver"); + + final static String PROTOCOL_PREFIX = "jdbc:"; + + @Getter private String name; + @Getter private String dbType; + @Getter private String defaultDriver; + + Database(String name, String driver) { + this.name = name; + this.dbType = name.toLowerCase(); + this.defaultDriver = driver; + } + + static public Database fromUrl(String jdbcUrl) { + Preconditions.checkArgument(jdbcUrl.matches("jdbc:(mysql|sqlserver|oracle|hsqldb):.*"), "jdbcUrl"); + String uri = jdbcUrl.substring(PROTOCOL_PREFIX.length()); + return Database.valueOf(URI.create(uri).getScheme().toUpperCase()); + } +} diff --git a/dil/src/main/java/com/linkedin/dil/util/DateTimeUtils.java b/dil/src/main/java/com/linkedin/dil/util/DateTimeUtils.java new file mode 100644 index 0000000..f93e188 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/DateTimeUtils.java @@ -0,0 +1,115 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import com.google.common.collect.ImmutableMap; +import java.util.Map; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; + + +/** + * a general datetime parsing utility + * + * Note: Joda supports only up to milliseconds, if data has microseconds, it will be truncated + * + * Note: Joda doesn't like "America/Los_Angeles", but rather it accepts PST or -08:00, therefore + * long form timezone names are not supported. 
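+ *
+ * A few illustrative calls (example values only):
+ * <pre>{@code
+ *   DateTimeUtils.parse("2020-06-01");                        // date only, default zone America/Los_Angeles
+ *   DateTimeUtils.parse("2020-06-01 08:30:00.123", "UTC");    // parsed with the supplied zone
+ *   DateTimeUtils.parse("2020-06-01T08:30:00.123-07:00");     // zone taken from the string, parameter ignored
+ * }</pre>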
+ */ + +public interface DateTimeUtils { + String DEFAULT_TIMEZONE = "America/Los_Angeles"; + DateTimeFormatter DATE_FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd"); + Map FORMATS = new ImmutableMap.Builder() + .put("\\d{4}-\\d{2}-\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd")) + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss")) + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{1}", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.S")) + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SS")) + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS")) + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{4}", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSSS")) + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{5}", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSSSS")) + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{6}", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSSSSS")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{1}", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.S")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SS")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{3}", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSS")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{4}", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSSS")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{5}", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSSSS")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{6}", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSSSSS")) + .build(); + Map FORMATS_WITH_ZONE = new ImmutableMap.Builder() + // date time string with timezone specified as +/- hh:mm + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}[-+]+\\d{2}:?\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ssZ")) + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{1}[-+]+\\d{2}:?\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SZ")) + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{2}[-+]+\\d{2}:?\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSZ")) + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}[-+]+\\d{2}:?\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSSZ")) + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{4}[-+]+\\d{2}:?\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSSSZ")) + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{5}[-+]+\\d{2}:?\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSSSSZ")) + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{6}[-+]+\\d{2}:?\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSSSSSZ")) + + // date time string with timezone specified with time zone ids, like PST + // date time string with timezone specified with long form time zone ids, like America/Los_Angeles, is not working + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}[a-zA-Z\\/\\_]+", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ssz")) + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}.\\d{1}[a-zA-Z\\/\\_]+", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.Sz")) + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}.\\d{2}[a-zA-Z\\/\\_]+", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSz")) + .put("\\d{4}-\\d{2}-\\d{2} 
\\d{2}:\\d{2}:\\d{2}.\\d{3}[a-zA-Z\\/\\_]+", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSSz")) + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}.\\d{4}[a-zA-Z\\/\\_]+", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSSSz")) + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}.\\d{5}[a-zA-Z\\/\\_]+", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSSSSz")) + .put("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}.\\d{6}[a-zA-Z\\/\\_]+", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSSSSSz")) + + // date time string with timezone specified as +/- hh:mm + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}[-+]+\\d{2}:?\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ssZ")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{1}[-+]+\\d{2}:?\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SZ")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{2}[-+]+\\d{2}:?\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSZ")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{3}[-+]+\\d{2}:?\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSSZ")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{4}[-+]+\\d{2}:?\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSSSZ")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{5}[-+]+\\d{2}:?\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSSSSZ")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{6}[-+]+\\d{2}:?\\d{2}", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSSSSSZ")) + + // date time string with timezone specified with short form time zone ids, like PST + // date time string with timezone specified with long form time zone ids, like America/Los_Angeles, is not working + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}[a-zA-Z\\/\\_]+", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ssz")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{1}[a-zA-Z\\/\\_]+", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.Sz")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{2}[a-zA-Z\\/\\_]+", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSz")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{3}[a-zA-Z\\/\\_]+", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSSz")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{4}[a-zA-Z\\/\\_]+", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSSSz")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{5}[a-zA-Z\\/\\_]+", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSSSSz")) + .put("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{6}[a-zA-Z\\/\\_]+", DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSSSSSz")) + .build(); + + static DateTime parse(String dtString) { + return parse(dtString, DEFAULT_TIMEZONE); + } + + /** + * Parse the date time string against a predefined list of formats. If none of them match, + * the input string is truncated to first 10 characters in hope of matching to basic ISO date + * format of yyyy-MM-dd + * @param dtString the date time value string + * @param timezone the timezone of the string + * @return the parsed Date Time object + */ + static DateTime parse(String dtString, String timezone) { + DateTimeZone timeZone = DateTimeZone.forID(timezone.isEmpty() ? 
DEFAULT_TIMEZONE : timezone); + try { + for (String format : FORMATS.keySet()) { + if (dtString.matches(format)) { + return FORMATS.get(format).withZone(timeZone).parseDateTime(dtString); + } + } + // ignore timezone parameter if the date time string itself has time zone information + for (String format : FORMATS_WITH_ZONE.keySet()) { + if (dtString.matches(format)) { + return FORMATS_WITH_ZONE.get(format).parseDateTime(dtString); + } + } + } catch (Exception e) { + return DATE_FORMATTER.withZone(timeZone).parseDateTime(dtString.substring(0, 10)); + } + return DATE_FORMATTER.withZone(timeZone).parseDateTime(dtString.substring(0, 10)); + } +} diff --git a/dil/src/main/java/com/linkedin/dil/util/EncryptionUtils.java b/dil/src/main/java/com/linkedin/dil/util/EncryptionUtils.java new file mode 100644 index 0000000..be9e010 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/EncryptionUtils.java @@ -0,0 +1,102 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import com.google.gson.JsonObject; +import org.apache.gobblin.codec.StreamCodec; +import org.apache.gobblin.configuration.State; +import org.apache.gobblin.crypto.EncryptionConfigParser; +import org.apache.gobblin.crypto.GPGCodec; +import org.apache.gobblin.password.PasswordManager; +import org.apache.hadoop.fs.Path; + + +/** + * String encryption and decryption utilities + */ +public interface EncryptionUtils { + String PATTERN = "^ENC\\(.*\\)$"; + /** + * Decrypt the encrypted string using Gobblin utility + * @param input the encrypted string + * @param state Gobblin state object contains the master key location + * @return decrypted string if the input string is enclosed inside ENC() + */ + static String decryptGobblin(String input, State state) { + if (input.matches(PATTERN)) { + return PasswordManager.getInstance(state).readPassword(input); + } + return input; + } + + /** + * Encrypt the decrypted string using Gobblin utility + * @param input the deccrypted string + * @param state Gobblin state object contains the master key location + * @return encrypted string which is enclosed within ENC() - as Gobblin utility doesn't do that explicitly + */ + static String encryptGobblin(String input, State state) { + String encryptedString = PasswordManager.getInstance(state).encryptPassword(input); + if (encryptedString.matches(PATTERN)) { + return encryptedString; + } + return "ENC(" + encryptedString + ")"; + } + + /** + * Create a Gpg Codec per given parameters + * + * @param parameters the GPG decryption or encryption parameters + * @return A StreamCodec object, in this case, returns a @GPGCodec object + */ + static StreamCodec getGpgCodec(JsonObject parameters) { + + if (parameters == null) { + throw new IllegalArgumentException("Expect parameters to not be empty."); + } + if (!parameters.has(EncryptionConfigParser.ENCRYPTION_KEYSTORE_PASSWORD_KEY) + && !parameters.has(EncryptionConfigParser.ENCRYPTION_KEYSTORE_PATH_KEY)) { + throw new IllegalArgumentException("Expect either password or key file in the parameters."); + } + + // keystore_password, optional if keystore_path is present + // default to empty string as this is what GpgCodec expects + String password = ""; + if (parameters.has(EncryptionConfigParser.ENCRYPTION_KEYSTORE_PASSWORD_KEY)) { + password = parameters.get(EncryptionConfigParser.ENCRYPTION_KEYSTORE_PASSWORD_KEY).getAsString(); + } + + // keystore_path, 
optional, needed for secret keyring based decryption + String keystorePathStr = null; + if (parameters.has(EncryptionConfigParser.ENCRYPTION_KEYSTORE_PATH_KEY)) { + keystorePathStr = parameters.get(EncryptionConfigParser.ENCRYPTION_KEYSTORE_PATH_KEY).getAsString(); + } + // If key file not present, then password must be provided. Otherwise, password is optional + if ((password == null || password.isEmpty()) && (keystorePathStr == null || keystorePathStr.isEmpty())) { + throw new IllegalArgumentException("Both key and password cannot be empty."); + } + + // key_name, optional, needed for encryption + String keyName = null; + if (parameters.has(EncryptionConfigParser.ENCRYPTION_KEY_NAME)) { + keyName = parameters.get(EncryptionConfigParser.ENCRYPTION_KEY_NAME).getAsString(); + } + + // cipher, null to default to CAST5 (128 bit key, as per RFC 2144) + String cipherName = null; + if (parameters.has(EncryptionConfigParser.ENCRYPTION_CIPHER_KEY)) { + cipherName = parameters.get(EncryptionConfigParser.ENCRYPTION_CIPHER_KEY).getAsString(); + } + + // if not using a keystore then use password based encryption + if (keystorePathStr == null) { + return new GPGCodec(password, cipherName); + } + // if a key name is not present then use a key id of 0. A GPGCodec may be configured without a key name + // when used only for decryption where the key name is retrieved from the encrypted file + return new GPGCodec(new Path(keystorePathStr), password, + keyName == null ? 0 : Long.parseUnsignedLong(keyName, 16), cipherName); + } +} diff --git a/dil/src/main/java/com/linkedin/dil/util/EndecoUtils.java b/dil/src/main/java/com/linkedin/dil/util/EndecoUtils.java new file mode 100644 index 0000000..39fcab0 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/EndecoUtils.java @@ -0,0 +1,97 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
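+
+// Illustrative note for EncryptionUtils.getGpgCodec(...) above: a hedged example of the
+// parameter JsonObject it accepts. The keystore path and key id are hypothetical, and the
+// cipher is omitted so the codec falls back to its CAST5 default:
+//   {"keystore_path": "/hypothetical/keys/secring.gpg",
+//    "keystore_password": "myPassPhrase",
+//    "key_name": "1A2B3C4D5E6F7890"}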
+ +package com.linkedin.dil.util; + +import java.net.URLDecoder; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import lombok.extern.slf4j.Slf4j; +import org.apache.hadoop.fs.Path; + + +/** + * a collection of encoding and decoding functions + */ +@Slf4j +public class EndecoUtils { + private EndecoUtils() { + // hide constructor + } + + /** + * Decode an encoded URL string, complete or partial + * @param encoded encoded URL string + * @return decoded URL string + */ + static public String decode(String encoded) { + return decode(encoded, StandardCharsets.UTF_8.toString()); + } + + static public String decode(String encoded, String enc) { + try { + return URLDecoder.decode(encoded, enc); + } catch (Exception e) { + log.error("URL decoding error: " + e); + return encoded; + } + } + + /** + * Encode a URL string, complete or partial + * @param plainUrl unencoded URL string + * @return encoded URL string + */ + static public String getEncodedUtf8(String plainUrl) { + return getEncodedUtf8(plainUrl, StandardCharsets.UTF_8.toString()); + } + + static public String getEncodedUtf8(String plainUrl, String enc) { + try { + return URLEncoder.encode(plainUrl, enc); + } catch (Exception e) { + log.error("URL encoding error: " + e); + return plainUrl; + } + } + + /** + * Encode a Hadoop file name to encode path separator so that the file name has no '/' + * @param fileName unencoded file name string + * @return encoded path string + */ + static public String getHadoopFsEncoded(String fileName) { + return getHadoopFsEncoded(fileName, StandardCharsets.UTF_8.toString()); + } + + static public String getHadoopFsEncoded(String fileName, String enc) { + try { + String encodedSeparator = URLEncoder.encode(Path.SEPARATOR, enc); + // we don't encode the whole string intentionally so that the state file name is more readable + return fileName.replace(Path.SEPARATOR, encodedSeparator); + } catch (Exception e) { + log.error("Hadoop FS encoding error: " + e); + return fileName; + } + } + + /** + * Decode an encoded Hadoop file name to restore path separator + * @param encodedFileName encoded file name string + * @return encoded path string + */ + static public String getHadoopFsDecoded(String encodedFileName) { + return getHadoopFsDecoded(encodedFileName, StandardCharsets.UTF_8.toString()); + } + + static public String getHadoopFsDecoded(String encodedFileName, String enc) { + try { + String encodedSeparator = URLEncoder.encode(Path.SEPARATOR, enc); + return encodedFileName.replace(encodedSeparator, Path.SEPARATOR); + } catch (Exception e) { + log.error("Hadoop FS decoding error: " + e); + return encodedFileName; + } + } +} diff --git a/dil/src/main/java/com/linkedin/dil/util/HdfsReader.java b/dil/src/main/java/com/linkedin/dil/util/HdfsReader.java new file mode 100644 index 0000000..2a3dc54 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/HdfsReader.java @@ -0,0 +1,250 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
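+
+// Illustrative note for EndecoUtils above (example value only): getHadoopFsEncoded("a/b/c")
+// returns "a%2Fb%2Fc" because every path separator is replaced with its URL-encoded form,
+// and getHadoopFsDecoded("a%2Fb%2Fc") restores "a/b/c". This keeps dataset URNs derived
+// from work unit signatures free of the Hadoop path separator.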
+ +package com.linkedin.dil.util; + +import com.google.common.annotations.VisibleForTesting; +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonNull; +import com.google.gson.JsonObject; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileReader; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.mapred.FsInput; +import org.apache.gobblin.configuration.ConfigurationKeys; +import org.apache.gobblin.configuration.State; +import org.apache.gobblin.source.extractor.filebased.TimestampAwareFileBasedHelper; +import org.apache.gobblin.source.extractor.hadoop.AvroFsHelper; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static com.linkedin.dil.configuration.StaticConstants.*; + + +/** + * This class is used to load data from HDFS based on location and fields of selection, + * and it returns the results as JsonArray. + * + * The reader will read sub-directories within the given location recursively, and pick up + * all files in AVRO format by default. + * + * All files in the location should have consistent format and contain the fields of selection. + * + * @author vbhrill chrli + */ + +public class HdfsReader { + private final static Logger LOG = LoggerFactory.getLogger(HdfsReader.class); + private final Gson gson = new Gson(); + private final State state; + + private JsonArray transientInputPayload = new JsonArray(); + + public HdfsReader(State state) { + this.state = state; + } + + public HdfsReader(State state, JsonArray secondaryInputs) { + this.transientInputPayload = secondaryInputs; + this.state = state; + } + + @VisibleForTesting + public List getFieldsAsList(JsonElement field) { + List fieldsList = new ArrayList<>(); + if (field.getAsJsonObject().has("fields")) { + Iterator iterator = field.getAsJsonObject() + .get("fields").getAsJsonArray().iterator(); + while (iterator.hasNext()) { + fieldsList.add(iterator.next().getAsString()); + } + } + return fieldsList; + } + + /** + * Reads secondary input paths one by one and return the JsonArrays by category + * @return a Map structure for records by category + */ + public Map readAll() { + if (transientInputPayload == null || transientInputPayload.size() == 0) { + return new HashMap<>(); + } + Map secondaryInput = new HashMap<>(); + for (JsonElement input: transientInputPayload) { + JsonArray transientData = new JsonArray(); + JsonElement path = input.getAsJsonObject().get("path"); + List fieldList = getFieldsAsList(input); + String category = input.getAsJsonObject().has(KEY_WORD_CATEGORY) + ? input.getAsJsonObject().get(KEY_WORD_CATEGORY).getAsString() + : KEY_WORD_ACTIVATION; + if (path != null) { + transientData.addAll(readRecordsFromPath(path.getAsString(), fieldList, getFilters(input))); + if (secondaryInput.containsKey(category)) { + transientData.addAll(secondaryInput.get(category)); + } + secondaryInput.put(category, transientData); + } + } + return secondaryInput; + } + + /** + * Reads a secondary input based on the entry specification. A secondary input + * may include one or more files in the path. 
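+ *
+ * An illustrative entry (the path and values are hypothetical):
+ * <pre>{@code
+ *   {"path": "/hypothetical/units",
+ *    "fields": ["id", "name"],
+ *    "filters": {"status": "(OK|SUCCESS)"},
+ *    "category": "activation"}
+ * }</pre>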
+ * + * @param secondaryEntry one entry in the ms.secondary.input parameter + * @return the data read + */ + public JsonArray readSecondary(JsonObject secondaryEntry) { + if (!secondaryEntry.has(("path"))) { + return new JsonArray(); + } + + JsonArray secondaryData = new JsonArray(); + JsonElement path = secondaryEntry.get("path"); + List fieldList = getFieldsAsList(secondaryEntry); + secondaryData.addAll(readRecordsFromPath(path.getAsString(), fieldList, getFilters(secondaryEntry))); + return secondaryData; + } + + + public JsonArray toJsonArray(String transientDataInputPayload) { + try { + return new Gson().fromJson(transientDataInputPayload, JsonArray.class); + } catch (Exception e) { + LOG.error("Error while processing transient input payload."); + throw new RuntimeException("Error while processing transient input payload. Cannot convert into JsonArray.", e); + } + } + + private DataFileReader createDataReader(String path) { + try { + GenericDatumReader genericDatumReader = new GenericDatumReader<>(); + FsInput fsInput = new FsInput(new Path(path), new Configuration()); + return new DataFileReader(fsInput, genericDatumReader); + } catch (Exception e) { + throw new RuntimeException("Error initializing transient data reader", e); + } + } + + /** + * process 1 secondary input path + * @param inputLocation the secondary input path + * @param fields the list of fields to be output + * @param filters the list of filters to ber applied + * @return a filter list of records + */ + private JsonArray readRecordsFromPath( + String inputLocation, + List fields, + Map filters) { + JsonArray transientDataArray = new JsonArray(); + String sourceFileBasedFsUri = state.getProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI); + TimestampAwareFileBasedHelper fsHelper = new AvroFsHelper(state); + try { + state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, inputLocation); + fsHelper.connect(); + List filesToRead = fsHelper.ls(inputLocation); + for (String singleFile: filesToRead) { + DataFileReader reader = createDataReader(singleFile); + transientDataArray.addAll(readFileAsJsonArray(reader, fields, filters)); + } + return transientDataArray; + } catch (Exception e) { + throw new RuntimeException("Error while reading records from location " + inputLocation, e); + } finally { + if (sourceFileBasedFsUri != null) { + state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, sourceFileBasedFsUri); + } + } + } + + /** + * This method read 1 avro file and store the records in a JsonArray + * + * @param preparedReader The avro file reader + * @param fields The list of fields to output + * @param filters The filters to apply to each records + * @return the filtered projection of the avro file in a JsonArray + */ + private JsonArray readFileAsJsonArray( + DataFileReader preparedReader, + List fields, + Map filters) { + JsonArray transientDataArray = new JsonArray(); + while (preparedReader.hasNext()) { + GenericRecord record = (GenericRecord) preparedReader.next(); + Schema schema = record.getSchema(); + boolean recordAccepted = true; + for (Schema.Field field: schema.getFields()) { + String name = field.name(); + String pattern = filters.getOrDefault(name, ".*"); + if (record.get(name) != null && !record.get(name).toString().matches(pattern) + || filters.containsKey(name) && record.get(name) == null) { + recordAccepted = false; + } + } + if (recordAccepted) { + transientDataArray.add(selectFieldsFromGenericRecord(record, fields)); + } + } + return transientDataArray; + } + + @VisibleForTesting + private JsonObject 
selectFieldsFromGenericRecord(GenericRecord record, List fields) { + JsonObject jsonObject = new JsonObject(); + for (String field: fields) { + Object valueObject = record.get(field); + Schema.Type fieldType = record.getSchema().getField(field).schema().getType(); + if (valueObject == null || fieldType == Schema.Type.NULL) { + jsonObject.add(field, JsonNull.INSTANCE); + } else if (fieldType == Schema.Type.STRING) { + jsonObject.addProperty(field, EncryptionUtils.decryptGobblin(valueObject.toString(), state)); + } else if (fieldType == Schema.Type.ARRAY) { + jsonObject.add(field, gson.fromJson(valueObject.toString(), JsonArray.class)); + } else if (fieldType == Schema.Type.RECORD) { + jsonObject.add(field, gson.fromJson(valueObject.toString(), JsonObject.class)); + } else if (fieldType == Schema.Type.INT || fieldType == Schema.Type.LONG) { + jsonObject.addProperty(field, Long.valueOf(valueObject.toString())); + } else if (fieldType == Schema.Type.DOUBLE || fieldType == Schema.Type.FLOAT) { + jsonObject.addProperty(field, Double.valueOf(valueObject.toString())); + } else if (fieldType == Schema.Type.BOOLEAN) { + jsonObject.addProperty(field, Boolean.valueOf(valueObject.toString())); + } else { + jsonObject.addProperty(field, valueObject.toString()); + } + } + return jsonObject; + } + + /** + * retrieve the filters from the secondary input definition + * + * @param field a single secondary input source + * @return the filters defined as a map of FieldName: RegEx + */ + @VisibleForTesting + private Map getFilters(JsonElement field) { + Map filtersMap = new HashMap<>(); + if (field.getAsJsonObject().has("filters")) { + JsonObject filterDefinition = field.getAsJsonObject().get("filters").getAsJsonObject(); + for (Map.Entry entry : filterDefinition.entrySet()) { + filtersMap.put(entry.getKey(), entry.getValue().getAsString()); + } + } + return filtersMap; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/util/HttpRequestMethod.java b/dil/src/main/java/com/linkedin/dil/util/HttpRequestMethod.java new file mode 100644 index 0000000..1f80876 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/HttpRequestMethod.java @@ -0,0 +1,257 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import java.io.UnsupportedEncodingException; +import java.util.HashMap; +import java.util.Map; +import lombok.extern.slf4j.Slf4j; +import okhttp3.HttpUrl; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.http.client.methods.HttpDelete; +import org.apache.http.client.methods.HttpEntityEnclosingRequestBase; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpPost; +import org.apache.http.client.methods.HttpPut; +import org.apache.http.client.methods.HttpUriRequest; +import org.apache.http.entity.StringEntity; + + +/** + * Enum object to facilitate the handling of different types of HTTP requests. + * + * The difference between GET and POST/PUT is that URI parameters are coded differently. + * + * However, in all request types, an URI path might be dynamically constructed. For + * example, https://domain/api/v1.5/surveys/{{id}} is a dynamic URI. The end point might + * support GET or POST. 
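+ *
+ * For example, {@code HttpRequestMethod.GET.getHttpRequest("https://domain/api/v1.5/surveys/{{id}}", parameters, headers)}
+ * substitutes {{id}} from the parameter set and appends any remaining parameters to the query string.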
+ * + * So if a GET request has 2 parameters, id=1 and format=avro, then the URI will be transformed to + * https://domain/api/v1.5/surveys/1?format=avro. + * + * However if a POST request has 2 parameters, id=1 and name=xxx, then the URI will be transformed to + * https://domain/api/v1.5/surveys/1, and the name=xxx will be in the POST request's entity content. + * + * Note: + * + * - URI path variables or placeholders are defined using {{placeholder-name}} + * - Placeholders or URI variables have to be alpha numeric + * + * @author chrli + */ + +@Slf4j +public enum HttpRequestMethod { + GET("GET") { + @Override + protected HttpUriRequest getHttpRequestContentJson(String uriTemplate, + JsonObject parameters, JsonObject payloads) + throws UnsupportedEncodingException { + Pair replaced = replaceVariables(uriTemplate, parameters); + //ignore payloads + return new HttpGet(appendParameters(replaced.getKey(), replaced.getValue())); + } + + @Override + protected HttpUriRequest getHttpRequestContentUrlEncoded(String uriTemplate, JsonObject parameters) + throws UnsupportedEncodingException { + return getHttpRequestContentJson(uriTemplate, parameters, new JsonObject()); + } + }, + + POST("POST") { + @Override + protected HttpUriRequest getHttpRequestContentJson(String uriTemplate, + JsonObject parameters, JsonObject payloads) + throws UnsupportedEncodingException { + Pair replaced = replaceVariables(uriTemplate, parameters); + for (Map.Entry entry: payloads.entrySet()) { + replaced.getValue().add(entry.getKey(), entry.getValue()); + } + return setEntity(new HttpPost(replaced.getKey()), replaced.getValue().toString()); + } + + @Override + protected HttpUriRequest getHttpRequestContentUrlEncoded(String uriTemplate, JsonObject parameters) + throws UnsupportedEncodingException { + Pair replaced = replaceVariables(uriTemplate, parameters); + String urlEncoded = jsonToUrlEncoded(replaced.getValue()); + return setEntity(new HttpPost(replaced.getKey()), urlEncoded); + } + }, + + PUT("PUT") { + @Override + protected HttpUriRequest getHttpRequestContentJson(String uriTemplate, + JsonObject parameters, JsonObject payloads) + throws UnsupportedEncodingException { + Pair replaced = replaceVariables(uriTemplate, parameters); + for (Map.Entry entry: payloads.entrySet()) { + replaced.getValue().add(entry.getKey(), entry.getValue()); + } + return setEntity(new HttpPut(replaced.getKey()), replaced.getValue().toString()); + } + + @Override + protected HttpUriRequest getHttpRequestContentUrlEncoded(String uriTemplate, JsonObject parameters) + throws UnsupportedEncodingException { + Pair replaced = replaceVariables(uriTemplate, parameters); + return setEntity(new HttpPut(replaced.getKey()), jsonToUrlEncoded(replaced.getValue())); + } + }, + + DELETE("DELETE") { + @Override + protected HttpUriRequest getHttpRequestContentJson(String uriTemplate, + JsonObject parameters, JsonObject payloads) + throws UnsupportedEncodingException { + Pair replaced = replaceVariables(uriTemplate, parameters); + for (Map.Entry entry: payloads.entrySet()) { + replaced.getValue().add(entry.getKey(), entry.getValue()); + } + return new HttpDelete(replaced.getKey()); + } + + @Override + protected HttpUriRequest getHttpRequestContentUrlEncoded(String uriTemplate, JsonObject parameters) + throws UnsupportedEncodingException { + Pair replaced = replaceVariables(uriTemplate, parameters); + return new HttpDelete(replaced.getKey()); + } + }; + + private final String name; + + HttpRequestMethod(String name) { + this.name = name; + } + + @Override + 
public String toString() { + return name; + } + + /** + * This is the public method to generate HttpUriRequest for each type of Http Method + * @param uriTemplate input URI, which might contain place holders + * @param parameters parameters to be add to URI or to request Entity + * @param headers Http header tags + * @return HttpUriRequest ready for connection + */ + public HttpUriRequest getHttpRequest(final String uriTemplate, final JsonObject parameters, final Map headers) + throws UnsupportedEncodingException { + return getHttpRequest(uriTemplate, parameters, headers, new JsonObject()); + } + + /** + * This is the public method to generate HttpUriRequest for each type of Http Method + * @param uriTemplate input URI, which might contain place holders + * @param parameters parameters to be add to URI or to request Entity + * @param headers Http header tags + * @param payloads additional payloads to be included in the body of the Http request + * @return HttpUriRequest ready for connection + */ + public HttpUriRequest getHttpRequest(final String uriTemplate, + final JsonObject parameters, + final Map headers, + final JsonObject payloads) + throws UnsupportedEncodingException { + HttpUriRequest request; + + // substitute variables in headers + Map headersCopy = new HashMap<>(); + JsonObject parametersCopy = JsonUtils.deepCopy(parameters).getAsJsonObject(); + for (Map.Entry entry: headers.entrySet()) { + Pair replaced = VariableUtils.replaceWithTracking(entry.getValue(), parameters); + if (!replaced.getLeft().equals(entry.getValue())) { + parametersCopy = JsonUtils.deepCopy(replaced.getRight()).getAsJsonObject(); + headersCopy.put(entry.getKey(), replaced.getLeft()); + log.info("Substituted header string: {} = {}", entry.getKey(), replaced.getLeft()); + } else { + headersCopy.put(entry.getKey(), entry.getValue()); + } + } + + log.info("Final parameters for HttpRequest: {}", parametersCopy.toString()); + if (headersCopy.containsKey("Content-Type") + && headersCopy.get("Content-Type").equals("application/x-www-form-urlencoded")) { + request = getHttpRequestContentUrlEncoded(uriTemplate, parametersCopy); + } else { + request = getHttpRequestContentJson(uriTemplate, parametersCopy, payloads); + } + + for (Map.Entry entry: headersCopy.entrySet()) { + request.addHeader(entry.getKey(), entry.getValue()); + } + return request; + } + + /** + * This method shall be overwritten by each enum element. + * @param uriTemplate input URI, which might contain place holders + * @param parameters parameters to be add to URI or to request Entity + * @param payloads additional payloads to be included in the body of the Http request + * @return HttpUriRequest object where content is set per application/json + */ + protected abstract HttpUriRequest getHttpRequestContentJson(String uriTemplate, + JsonObject parameters, JsonObject payloads) + throws UnsupportedEncodingException; + + /** + * This method shall be overwritten by each enum element. 
+ * @param uriTemplate input URI, which might contain placeholders + * @param parameters parameters to be added to URI or to request Entity + * @return HttpUriRequest object where content is set per application/x-www-form-urlencoded + */ + protected abstract HttpUriRequest getHttpRequestContentUrlEncoded(String uriTemplate, JsonObject parameters) + throws UnsupportedEncodingException; + + protected Pair replaceVariables(String uriTemplate, JsonObject parameters) + throws UnsupportedEncodingException { + return VariableUtils.replaceWithTracking(uriTemplate, parameters, true); + } + + protected String appendParameters(String uri, JsonObject parameters) { + HttpUrl url = HttpUrl.parse(uri); + if (url != null) { + HttpUrl.Builder builder = url.newBuilder(); + for (Map.Entry entry : parameters.entrySet()) { + String key = entry.getKey(); + builder.addQueryParameter(key, parameters.get(key).getAsString()); + } + url = builder.build(); + } + return url != null ? url.toString() : uri; + } + + /** + * Convert Json formatted parameter set to Url Encoded format as requested by + * Content-Type: application/x-www-form-urlencoded + * Json Example: + * {"param1": "value1", "param2": "value2"} + * + * URL Encoded Example: + * param1=value1&param2=value2 + * + * @param parameters Json structured parameters + * @return URL encoded parameters + */ + protected String jsonToUrlEncoded(JsonObject parameters) { + HttpUrl.Builder builder = new HttpUrl.Builder().scheme("https").host("www.dummy.com"); + for (Map.Entry entry : parameters.entrySet()) { + String key = entry.getKey(); + builder.addQueryParameter(key, parameters.get(key).getAsString()); + } + return builder.build().encodedQuery(); + } + + protected HttpUriRequest setEntity(HttpEntityEnclosingRequestBase requestBase, String stringEntity) + throws UnsupportedEncodingException { + requestBase.setEntity(new StringEntity(stringEntity)); + return requestBase; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/util/InputStreamUtils.java b/dil/src/main/java/com/linkedin/dil/util/InputStreamUtils.java new file mode 100644 index 0000000..264f313 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/InputStreamUtils.java @@ -0,0 +1,47 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.List; +import org.apache.commons.collections.CollectionUtils; +import org.apache.commons.io.IOUtils; + + +public interface InputStreamUtils { + /** + * Convert a list of strings to an InputStream + * @param stringList a list of strings + * @return an InputStream made of the list + */ + static InputStream convertListToInputStream(List stringList) { + return CollectionUtils.isEmpty(stringList) ?
null + : new ByteArrayInputStream(String.join("\n", stringList).getBytes(StandardCharsets.UTF_8)); + } + + /** + * Extract the text from input stream using UTF-8 encoding + * @param input the InputStream, which most likely is from an HttpResponse + * @return the String extracted from InputStream, if the InputStream cannot be converted to a String + * then an exception is thrown + */ + static String extractText(InputStream input) throws IOException { + return extractText(input, StandardCharsets.UTF_8.name()); + } + + /** + * Extract the text from input stream using given character set encoding + * @param input the InputStream, which most likely is from an HttpResponse + * @param charSetName the character set name + * @return the String extracted from InputStream, if the InputStream cannot be converted to a String + * then an exception is thrown + */ + static String extractText(InputStream input, String charSetName) throws IOException { + return IOUtils.toString(input, charSetName); + } +} diff --git a/dil/src/main/java/com/linkedin/dil/util/JdbcUtils.java b/dil/src/main/java/com/linkedin/dil/util/JdbcUtils.java new file mode 100644 index 0000000..e2128de --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/JdbcUtils.java @@ -0,0 +1,149 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import com.google.common.collect.ImmutableMap; +import java.sql.Blob; +import java.sql.Clob; +import java.sql.ResultSet; +import java.sql.ResultSetMetaData; +import java.sql.SQLException; +import java.sql.Types; +import java.util.Map; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.lang.StringUtils; + + +/** + * utility functions ported from JdbcExtractor + */ + +public interface JdbcUtils { + Map SQL_2_AVRO_TYPE_MAPPING = new ImmutableMap.Builder() + .put(Types.BOOLEAN, JsonElementTypes.BOOLEAN) + + .put(Types.DATE, JsonElementTypes.TIMESTAMP) + .put(Types.TIMESTAMP, JsonElementTypes.TIMESTAMP) + .put(Types.TIMESTAMP_WITH_TIMEZONE, JsonElementTypes.TIMESTAMP) + + .put(Types.TIME, JsonElementTypes.TIME) + .put(Types.TIME_WITH_TIMEZONE, JsonElementTypes.TIME) + + .put(Types.TINYINT, JsonElementTypes.INT) + .put(Types.SMALLINT, JsonElementTypes.INT) + .put(Types.INTEGER, JsonElementTypes.INT) + .put(Types.BIGINT, JsonElementTypes.LONG) + + .put(Types.DECIMAL, JsonElementTypes.DOUBLE) + .put(Types.DOUBLE, JsonElementTypes.DOUBLE) + .put(Types.FLOAT, JsonElementTypes.DOUBLE) + .put(Types.REAL, JsonElementTypes.DOUBLE) + .put(Types.NUMERIC, JsonElementTypes.DOUBLE) + + .put(Types.STRUCT, JsonElementTypes.RECORD) + .put(Types.ARRAY, JsonElementTypes.ARRAY) + + .build(); + + static String parseColumnAsString(final ResultSet resultset, final ResultSetMetaData resultsetMetadata, int i) + throws SQLException { + + if (isBlob(resultsetMetadata.getColumnType(i))) { + return readBlobAsString(resultset.getBlob(i)); + } + if (isClob(resultsetMetadata.getColumnType(i))) { + return readClobAsString(resultset.getClob(i)); + } + if ((resultsetMetadata.getColumnType(i) == Types.BIT + || resultsetMetadata.getColumnType(i) == Types.BOOLEAN) + && convertBitToBoolean()) { + String columnValue = Boolean.toString(resultset.getBoolean(i)); + // https://docs.oracle.com/javase/7/docs/api/java/sql/ResultSet.html#wasNull() + return resultset.wasNull() ? 
null : columnValue; + } + return resultset.getString(i); + } + + static boolean isBlob(int columnType) { + return columnType == Types.LONGVARBINARY || columnType == Types.BINARY; + } + + static boolean isClob(int columnType) { + return columnType == Types.CLOB; + } + + /** + * For Blob data, need to get the bytes and use base64 encoding to encode the byte[] + * When reading from the String, need to use base64 decoder + * String tmp = ... ( get the String value ) + * byte[] foo = Base64.decodeBase64(tmp); + */ + static String readBlobAsString(Blob logBlob) throws SQLException { + if (logBlob == null) { + return StringUtils.EMPTY; + } + + byte[] ba = logBlob.getBytes(1L, (int) (logBlob.length())); + + if (ba == null) { + return StringUtils.EMPTY; + } + return Base64.encodeBase64String(ba); + } + + /** + * For Clob data, we need to use the substring function to extract the string + */ + static String readClobAsString(Clob logClob) throws SQLException { + if (logClob == null) { + return StringUtils.EMPTY; + } + long length = logClob.length(); + return logClob.getSubString(1, (int) length); + } + + /** + * HACK: there is a bug in the MysqlExtractor where tinyint columns are always treated as ints. + * There are MySQL jdbc driver setting (tinyInt1isBit=true and transformedBitIsBoolean=false) that + * can cause tinyint(1) columns to be treated as BIT/BOOLEAN columns. The default behavior is to + * treat tinyint(1) as BIT. + * + * Currently, MysqlExtractor.getDataTypeMap() uses the information_schema to check types. + * That does not do the above conversion. {@link #parseColumnAsString(ResultSet, ResultSetMetaData, int)} + * which does the above type mapping. + * + * On the other hand, SqlServerExtractor treats BIT columns as Booleans. So we can be in a bind + * where sometimes BIT has to be converted to an int (for backwards compatibility in MySQL) and + * sometimes to a Boolean (for SqlServer). + * + * This function adds configurable behavior depending on the Extractor type. + **/ + static boolean convertBitToBoolean() { + return true; + } + + /** + * get a not nullable JsonElementType from a java.sql.Types + * @param columnSqlType java.sql.Types + * @return converted none nullable JsonElementType + */ + static JsonElementTypes parseColumnType(final int columnSqlType) { + return parseColumnType(columnSqlType, false); + } + + /** + * get a JsonElementType from a java.sql.Types and a nullability flag + * @param columnSqlType java.sql.Types + * @param nullable nullability flag + * @return converted JsonElementType + */ + static JsonElementTypes parseColumnType(final int columnSqlType, final boolean nullable) { + if (nullable) { + return SQL_2_AVRO_TYPE_MAPPING.getOrDefault(columnSqlType, JsonElementTypes.STRING).reverseNullability(); + } else { + return SQL_2_AVRO_TYPE_MAPPING.getOrDefault(columnSqlType, JsonElementTypes.STRING); + } + } +} diff --git a/dil/src/main/java/com/linkedin/dil/util/JsonElementTypes.java b/dil/src/main/java/com/linkedin/dil/util/JsonElementTypes.java new file mode 100644 index 0000000..319b22b --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/JsonElementTypes.java @@ -0,0 +1,280 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.util; + +import com.google.common.primitives.Doubles; +import com.google.common.primitives.Floats; +import com.google.common.primitives.Ints; +import com.google.common.primitives.Longs; +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonPrimitive; +import org.apache.commons.lang.StringUtils; + + +/** + * This class is designed to bridge the gaps between Json and Avro in handling nullability and in key words. + * + * Nullable non-null elements are defined with NULLABLE prefixes. + * + * Different ways of defining a nullable string are: + * {"name": {"type": [null, "string"]}} + * {"columnName": "name", "isNullable", "true", "dataType": {"type": "string"}} + * + * Differences in naming the element types: + * - "object" is "record" in avro + * - "integer" is "int" in avro + * - "int64" is "long" in avro + * - "number" is a generic numeric type, and bet matched to "double" in avro + * + * we only need nullable version of elements for avro types + */ +public enum JsonElementTypes { + + ARRAY("array", "array", false), + BOOLEAN("boolean"), + DATE("date"), + DOUBLE("double"), + ENUM("enum"), + FLOAT("float", "double"), + INT("int"), + INTEGER("integer", "int"), + INT64("int64", "long"), + LONG("long"), + NUMBER("number", "double"), + OBJECT("object", "record", false), + PRIMITIVE("primitive"), + RECORD("record", "record", false), + STRING("string"), + TIME("time"), + TIMESTAMP("timestamp"), + UNION("union", "union", false), + UNKNOWN("unknown"), + NULL(true, "null", "null"), + NULLABLEARRAY(true, "array|null", "array", false), + NULLABLEBOOLEAN(true, "boolean|null", "boolean"), + NULLABLEDOUBLE(true, "double|null", "double"), + NULLABLEINT(true, "int|null", "int"), + NULLABLELONG(true, "long|null", "long"), + NULLABLEOBJECT(true, "object|null", "record", false), + NULLABLERECORD(true, "record|null", "record", false), + NULLABLESTRING(true, "string|null", "string"), + NULLABLETIME(true, "time|null", "time"), + NULLABLETIMESTAMP(true, "timestamp|null", "timestamp"); + + private final String name; + private final String altName; + private final boolean isPrimitive; + private final boolean isNullable; + + public static boolean isPrimitive(JsonElementTypes type) { + return type.isPrimitive(); + } + + /** + * initialize the enum item with a default name + * @param name the title of the enum item + */ + JsonElementTypes(String name) { + this.name = name; + this.altName = name; + this.isPrimitive = true; + this.isNullable = false; + } + + /** + * initialize the enum item with a default name and an alternative name + * @param name the title of the enum item + * @param altName the alternative Avro/Json Intermediate name + */ + JsonElementTypes(String name, String altName) { + this.name = name; + this.altName = altName; + this.isPrimitive = true; + this.isNullable = false; + } + + /** + * initialize the enum item with a default name, an alternative name, and a primitive type + * @param name the title of the enum item + * @param altName the alternative Avro/Json Intermediate name + * @param isPrimitive the primitive type indicator + */ + JsonElementTypes(String name, String altName, boolean isPrimitive) { + this.name = name; + this.altName = altName; + this.isPrimitive = isPrimitive; + this.isNullable = false; + } + + /** + * initialize the enum item with a nullable flag, a default name, and an alternative name + * @param name the title of the enum item + * @param altName the alternative Avro/Json 
Intermediate name + */ + JsonElementTypes(boolean isNullable, String name, String altName) { + this.name = name; + this.altName = altName; + this.isPrimitive = true; + this.isNullable = isNullable; + } + + /** + * initialize the enum item with a nullable flag, a default name, an alternative name, and a primitive type + * @param name the title of the enum item + * @param altName the alternative Avro/Json Intermediate name + * @param isPrimitive the primitive type indicator + */ + JsonElementTypes(boolean isNullable, String name, String altName, boolean isPrimitive) { + this.name = name; + this.altName = altName; + this.isPrimitive = isPrimitive; + this.isNullable = isNullable; + } + + @Override + public String toString() { + return this.name; + } + + public String getAltName() { + return this.altName; + } + + /** + * check if the enum is nullable + * + * @return true if the enum is in one of those non-nullable types + */ + public boolean isNullable() { + return isNullable; + } + + /** + * check if the enum is a primitive + * + * @return true if the enum is not an object or an array (nullable or non-nullable) + */ + public boolean isPrimitive() { + return isPrimitive; + } + + public boolean isArray() { + return this == ARRAY || this == NULLABLEARRAY; + } + + public boolean isObject() { + return this == OBJECT || this == NULLABLEOBJECT; + } + + public boolean isNull() { + return this == NULL; + } + + /** + * This function makes an element nullable by changing its nullability + * + * Note here, none avro types are also converted to avro types in the process + * except for OBJECT, its nullable counterpart remains NULLABLEOBJECT. + * + * @return nullability reversed element type + */ + public JsonElementTypes reverseNullability() { + if (isNullable) { + if (this == JsonElementTypes.NULLABLEOBJECT) { + return JsonElementTypes.OBJECT; + } else { + return JsonElementTypes.valueOf(StringUtils.upperCase(altName)); + } + } else { + if (this == JsonElementTypes.OBJECT) { + return JsonElementTypes.NULLABLEOBJECT; + } else { + return JsonElementTypes.valueOf(StringUtils.upperCase("NULLABLE" + altName)); + } + } + } + + /** + * Infers a ElementType from an array of values so that nullable columns are correctly inferred + * + * Currently, this doesn't support UNION of more than 1 solid types, like UNION of INT and STRING + * + * @param data a JsonArray of values + * @return inferred element type with nullability + */ + public static JsonElementTypes getTypeFromMultiple(JsonArray data) { + + boolean nullable = false; + JsonElementTypes itemType = NULL; + for (JsonElement arrayItem: data) { + if (arrayItem.isJsonNull()) { + nullable = true; + } else if (arrayItem.isJsonObject()) { + itemType = nullable ? NULLABLEOBJECT : OBJECT; + } else if (arrayItem.isJsonArray()) { + if (arrayItem.getAsJsonArray().size() == 0) { + nullable = true; + } + itemType = nullable ? NULLABLEARRAY : ARRAY; + } else if (arrayItem.toString().matches("^\".*\"$")) { + itemType = nullable ? NULLABLESTRING : STRING; + } else if (Ints.tryParse(arrayItem.getAsString()) != null) { + itemType = nullable ? NULLABLEINT : INT; + } else if (Longs.tryParse(arrayItem.getAsString()) != null) { + itemType = nullable ? NULLABLELONG : LONG; + } else if (Doubles.tryParse(arrayItem.getAsString()) != null) { + itemType = nullable ? NULLABLEDOUBLE : DOUBLE; + } else if (arrayItem.getAsString().toLowerCase().matches("(true|false)")) { + itemType = nullable ? 
NULLABLEBOOLEAN : BOOLEAN; + } else if (Floats.tryParse(arrayItem.getAsString()) != null) { + itemType = nullable ? NULLABLEDOUBLE : DOUBLE; + } else { + itemType = UNKNOWN; + } + } + + if (nullable && !itemType.isNullable()) { + return itemType.reverseNullability(); + } + + return itemType; + } + + /** + * Convert strings like ["array", "null"] to a JsonElementType object + * @param input a Json schema type value like "string", "integer", or ["string", "null"] + * @return a JsonElementTypes enum object + */ + public static JsonElementTypes forType(String input) { + // Strip off any extra double quote marks + String name = input.replaceAll("^\"|\"$", ""); + boolean isNullable = false; + if (name.matches("^\\[.*]$")) { + JsonArray jsonArray = new Gson().fromJson(name, JsonArray.class); + + // we will treat the JsonElement as UNION if + // 1. it has more than two types + // 2. any type is array or object + if (jsonArray.size() > 2) { + return UNION; + } + + for (JsonElement ele: jsonArray) { + if (ele.isJsonArray() || ele.isJsonObject()) { + return UNION; + } + } + + isNullable = jsonArray.contains(new JsonPrimitive("null")); + if (isNullable) { + jsonArray.remove(new JsonPrimitive("null")); + name = "nullable" + jsonArray.get(0).getAsString(); + } + } + return Enum.valueOf(JsonElementTypes.class, name.toUpperCase()); + } +} diff --git a/dil/src/main/java/com/linkedin/dil/util/JsonIntermediateSchema.java b/dil/src/main/java/com/linkedin/dil/util/JsonIntermediateSchema.java new file mode 100644 index 0000000..f937927 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/JsonIntermediateSchema.java @@ -0,0 +1,244 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.util; + +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import lombok.Getter; +import lombok.NonNull; +import lombok.Setter; + +import static com.linkedin.dil.configuration.StaticConstants.*; + + +/** + * Recursively defined a Json Intermediate schema + * + * JsonIntermediateSchema := Map + * + * JisColumn := (columnName, nullability, JisDataType) + * + * JisDataType := RecordType | ArrayType | EnumType | UnionType + * + * RecordType := (JsonElementType, JsonIntermediateSchema) + * + * ArrayType := (JsonElementType, JisDataType) + * + * EnumType := (JsonElementType, symbolsArray) + * + * UnionType := (JsonElementType, List) + * + */ + + +public class JsonIntermediateSchema { + public static final String ROOT_RECORD_COLUMN_NAME = "root"; + public static final String CHILD_RECORD_COLUMN_NAME = "child"; + + // LinkedHashMap maintains insertion order, so the key set will be in the same order as the output schema + @Getter + Map columns = new LinkedHashMap<>(); + + @Getter @Setter + String schemaName; + + // a JIS schema contains JIS columns + public class JisColumn { + @NonNull @Getter @Setter + String columnName; + + @Getter @Setter + Boolean isNullable; + + @Getter @Setter + JisDataType dataType; + + // define a simple column + JisColumn(String name, Boolean isNullable, String type) { + this.setColumnName(name); + this.setIsNullable(isNullable); + this.setDataType(new JisDataType(type)); + } + + // define a complex column + JisColumn(JsonObject columnDefinition) { + try { + if (columnDefinition.has(KEY_WORD_COLUMN_NAME)) { + this.setColumnName(columnDefinition.get(KEY_WORD_COLUMN_NAME).getAsString()); + } else if (columnDefinition.has(KEY_WORD_NAME)) { + this.setColumnName(columnDefinition.get(KEY_WORD_COLUMN_NAME).getAsString()); + } else { + this.setColumnName(KEY_WORD_UNKNOWN); + } + + // set default as NULLABLE if column definition did not specify + if (columnDefinition.has(KEY_WORD_DATA_IS_NULLABLE)) { + this.setIsNullable(Boolean.valueOf(columnDefinition.get(KEY_WORD_DATA_IS_NULLABLE).getAsString())); + } else { + this.setIsNullable(Boolean.TRUE); + } + + this.setDataType(new JisDataType(columnDefinition.get(KEY_WORD_DATA_TYPE).getAsJsonObject())); + } catch (Exception e) { + throw new RuntimeException("Incorrect column definition in JSON: " + columnDefinition.toString(), e); + } + } + + /** + * Convert the column object to Json Object + * @return a Json Object presentation of the column + */ + public JsonObject toJson() { + JsonObject column = new JsonObject(); + column.addProperty(KEY_WORD_COLUMN_NAME, this.getColumnName()); + column.addProperty(KEY_WORD_DATA_IS_NULLABLE, this.isNullable ? 
"true" : "false"); + column.add(KEY_WORD_DATA_TYPE, this.getDataType().toJson()); + return column; + } + } + + // a JIS Column has a JIS Data Type + public class JisDataType { + @NonNull @Getter @Setter + JsonElementTypes type; + + // data type name is optional + @Getter @Setter + String name; + + // values have the array of field definitions when the type is record + @Getter @Setter + JsonIntermediateSchema childRecord; + + // items have the item definition + @Getter @Setter + JisDataType itemType; + + // unions have item types + @Getter @Setter + List itemTypes = Lists.newArrayList(); + + @Getter @Setter + JsonArray symbols; + + // this defines primitive data type + JisDataType(String type) { + this.setType(JsonElementTypes.forType(type)); + } + + JisDataType(JsonObject dataTypeDefinition) { + this.setType(JsonElementTypes.forType(dataTypeDefinition.get(KEY_WORD_TYPE).getAsString())); + if (dataTypeDefinition.has(KEY_WORD_NAME)) { + this.setName(dataTypeDefinition.get(KEY_WORD_NAME).getAsString()); + } + switch (type) { + case RECORD: + // a record field is will have child schema + this.setChildRecord(new JsonIntermediateSchema(CHILD_RECORD_COLUMN_NAME, + dataTypeDefinition.get(KEY_WORD_VALUES).getAsJsonArray())); + break; + case ARRAY: + // an array field will have a item type definition, which can be primitive or complex + JsonElement itemDefinition = dataTypeDefinition.get(KEY_WORD_ITEMS); + + if (itemDefinition.isJsonPrimitive()) { + this.setItemType(new JisDataType(itemDefinition.getAsString())); + } else { + // if not primitive, the item type is complex, and it has to be defined in a JsonObject + this.setItemType(new JisDataType(itemDefinition.getAsJsonObject().get(KEY_WORD_DATA_TYPE).getAsJsonObject())); + } + break; + case ENUM: + // an Enum has a list of symbols + this.setSymbols(dataTypeDefinition.get(KEY_WORD_SYMBOLS).getAsJsonArray()); + break; + case UNION: + // a Union has 2 or more different types + // TODO + break; + default: + break; + } + } + + /** Convert the data type object to Json Object + * @return a Json Object presentation of the data type + */ + public JsonObject toJson() { + JsonObject dataType = new JsonObject(); + dataType.addProperty(KEY_WORD_TYPE, this.getType().toString()); + dataType.addProperty(KEY_WORD_NAME, this.getName()); + switch (type) { + case RECORD: + dataType.add(KEY_WORD_VALUES, childRecord.toJson()); + break; + case ARRAY: + JsonObject itemsObject = new JsonObject(); + itemsObject.addProperty(KEY_WORD_NAME, this.getName()); + itemsObject.add(KEY_WORD_DATA_TYPE, itemType.toJson()); + dataType.add(KEY_WORD_ITEMS, itemsObject); + break; + default: + break; + } + return dataType; + } + + public boolean isPrimitive() { + return JsonElementTypes.isPrimitive(type); + } + } + + /** + * A Json Intermediate schema starts with a root column + * + * @param recordSchema the intermediate schema definition + */ + public JsonIntermediateSchema(JsonArray recordSchema) { + this.setSchemaName(ROOT_RECORD_COLUMN_NAME); + addColumns(recordSchema); + } + + /** + * A Json Intermediate schema record can be a nested field + * + * @param fieldName the field name of the nested record + * @param recordSchema the intermediate schema definition + */ + public JsonIntermediateSchema(String fieldName, JsonArray recordSchema) { + this.setSchemaName(fieldName); + addColumns(recordSchema); + } + + /** + * add columns of a record + * @param recordSchema the schema of the record + */ + private void addColumns(JsonArray recordSchema) { + for (JsonElement column: recordSchema) { + 
Preconditions.checkArgument(column != null && column.isJsonObject()); + JisColumn col = new JisColumn(column.getAsJsonObject()); + columns.put(col.getColumnName(), col); + } + } + + /** + * Convert the schema object to Json Array + * @return a Json Array presentation of the schema + */ + public JsonArray toJson() { + JsonArray schema = new JsonArray(); + for (Map.Entry entry: columns.entrySet()) { + schema.add(entry.getValue().toJson()); + } + return schema; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/util/JsonParameter.java b/dil/src/main/java/com/linkedin/dil/util/JsonParameter.java new file mode 100644 index 0000000..4cb3dd4 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/JsonParameter.java @@ -0,0 +1,305 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import com.google.common.base.Strings; +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import java.util.HashMap; +import java.util.Map; +import lombok.extern.slf4j.Slf4j; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.configuration.MultistageProperties; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; + + +/** + * + * A sample parameter string with variables (fromDateTime, toDateTime + * cursor) in it. The variables are to be replaced with actual values. + * + * @author chrli + */ +@Slf4j +public class JsonParameter { + + private JsonObject paramJson; + private State state; + final private static String DEFAULT_TIMEZONE = "America/Los_Angeles"; + + /** + * See sample-parameter.json regarding to how to compose parameters + * + * @param inputJsonArrayString a JsonArray formatted as String + * @param values a list of input values for variable replacement + * @param state source state + */ + public JsonParameter(String inputJsonArrayString, JsonObject values, State state) { + this.state = state; + paramJson = new JsonObject(); + JsonArray jsonArray = new Gson().fromJson(inputJsonArrayString, JsonArray.class); + + if (jsonArray != null && !jsonArray.isJsonNull()) { + for (JsonElement element : jsonArray) { + JsonObject param = parseParameter(element.getAsJsonObject(), values); + if (param != null) { + for (Map.Entry entry : param.entrySet()) { + paramJson.add(entry.getKey(), entry.getValue()); + } + } + } + } + } + + /** + * @return reformatted parameters as a Json Object + */ + public JsonObject getParametersAsJson() { + return paramJson; + } + + /** + * @param inputString a parameter string in Json format with the needed data elements + * @param values the replacement values + * @param state source state + * @return reformatted parameters as a Json String + */ + public static JsonObject getParametersAsJson(String inputString, JsonObject values, State state) { + return new JsonParameter(inputString, values, state).getParametersAsJson(); + } + + /** + * @return reformatted parameters as a Json String + */ + public String getParametersAsJsonString() { + return paramJson.toString(); + } + + /** + * @param inputString a parameter string in Json format with the needed data elements + * @param values the replacement values + * @param state source state + * @return reformatted parameters as a Json String + */ + public static String 
getParametersAsJsonString(String inputString, JsonObject values, State state) { + return new JsonParameter(inputString, values, state).getParametersAsJsonString(); + } + + /** + * + * @param inputString a parameter string in Json format with the needed data elements + * @param values the replacement values + * @param state source state + * @return reformatted parameters in a Map object + */ + public static Map getParametersAsMap(String inputString, JsonObject values, State state) { + return new JsonParameter(inputString, values, state).getParametersAsMap(); + } + + /** + * @return reformatted parameters in a Map object + */ + public Map getParametersAsMap() { + Map params = new HashMap<>(); + for (Map.Entry entry: paramJson.entrySet()) { + params.put(entry.getKey(), entry.getValue().getAsString()); + } + return params; + } + + /** + * core function parsing the parameter string and replace with input values + * @param paramObject a JsonObject of parameter definitions + * @param values replacement values + * @return a JsonObject of parameters where watermarks are replaced with actual given values + */ + private JsonObject parseParameter(JsonObject paramObject, JsonObject values) { + ParameterTypes type = ParameterTypes.valueOf( + paramObject.has("type") + ? paramObject.get("type").getAsString().toUpperCase() : "LIST"); + + JsonObject parsedObject = new JsonObject(); + String name = paramObject.get("name").getAsString(); + switch (type) { + // an OBJECT would assume the elements could have substitution variables + case OBJECT: + if (paramObject.get("value").isJsonObject() + && !paramObject.get("value").getAsJsonObject() + .get("type").getAsString().equalsIgnoreCase(ParameterTypes.OBJECT.toString())) { + parsedObject.add(name, parseParameter(paramObject.get("value").getAsJsonObject(), values)); + } else if (paramObject.get("value").isJsonArray()) { + JsonObject members = new JsonObject(); + for (JsonElement member: paramObject.get("value").getAsJsonArray()) { + JsonObject converted = parseParameter(member.getAsJsonObject(), values); + if (converted != null) { + for (Map.Entry ele : converted.entrySet()) { + members.add(ele.getKey(), ele.getValue()); + } + } + } + parsedObject.add(name, members); + } + break; + case LIST: + // allow encryption on LIST type parameters + parsedObject.addProperty(name, EncryptionUtils.decryptGobblin( + parseListParameter(paramObject.get("value"), state), + state)); + break; + + case JSONARRAY: + parsedObject.add(name, paramObject.get("value").getAsJsonArray()); + break; + + // a JSONOBJECT, compared to OBJECT, would not allow substitution variables in its elements. + // this type would simplify configuration in such case as the syntax is more straightforward + case JSONOBJECT: + parsedObject.add(name, paramObject.get("value").getAsJsonObject()); + break; + + case WATERMARK: + + // String watermarkName = paramObject.get("watermark").getAsString(); + String watermarkValue = paramObject.get("value").getAsString(); + String format = paramObject.get("format").getAsString(); + String timeZoneId = paramObject.has("timezone") + ? 
paramObject.get("timezone").getAsString() : DEFAULT_TIMEZONE; + DateTimeZone timeZone = DateTimeZone.forID(timeZoneId); + if (values != null && values.get("watermark") != null) { + + // support only long watermark for now + // support only one watermark called "watermark" for now + // TODO: support multiple watermarks, each represented by a watermark name + + Long watermarkLow = values.get("watermark").getAsJsonObject().get("low").getAsLong(); + Long watermarkHigh = values.get("watermark").getAsJsonObject().get("high").getAsLong(); + log.debug("found watermark pair: {}, {} in replacement values.", watermarkLow, watermarkHigh); + + // ignore default watermarks + if (watermarkLow < 0) { + return null; + } + + if (format.equals("datetime")) { + String pattern = paramObject.get("pattern").getAsString(); + DateTimeFormatter datetimeFormatter = DateTimeFormat.forPattern(pattern).withZone(timeZone); + if (watermarkValue.equalsIgnoreCase("low")) { + parsedObject.addProperty(name, new DateTime().withMillis(watermarkLow).toString(datetimeFormatter)); + } else { + parsedObject.addProperty(name, new DateTime().withMillis(watermarkHigh).toString(datetimeFormatter)); + } + } else if (format.equals("epoc-second")) { + if (watermarkValue.equalsIgnoreCase("low")) { + parsedObject.addProperty(name, (watermarkLow / 1000)); + } else { + parsedObject.addProperty(name, watermarkHigh / 1000); + } + } else { + // By default, return the watermark in epoch millisecond format + if (watermarkValue.equalsIgnoreCase("low")) { + parsedObject.addProperty(name, watermarkLow); + } else { + parsedObject.addProperty(name, watermarkHigh); + } + } + } else { + return null; + } + break; + + case SESSION: + if (valueCheck(values, ParameterTypes.SESSION.toString(), true, false)) { + parsedObject.add(name, values.get(ParameterTypes.SESSION.toString())); + } + break; + + case PAGESTART: + if (valueCheck(values, ParameterTypes.PAGESTART.toString(), true, false)) { + parsedObject.add(name, values.get(ParameterTypes.PAGESTART.toString())); + } + break; + + case PAGESIZE: + if (valueCheck(values, ParameterTypes.PAGESIZE.toString(), true, false)) { + parsedObject.add(name, values.get(ParameterTypes.PAGESIZE.toString())); + } + break; + + case PAGENO: + if (valueCheck(values, ParameterTypes.PAGENO.toString(), true, false)) { + parsedObject.add(name, values.get(ParameterTypes.PAGENO.toString())); + } + break; + + default: + break; + } + + return parsedObject; + } + + /** + * check whether a component exists in a JsonObject and whether it meets requirements + * @param values a JsonObject of parameters + * @param element the element name + * @param bRequirePrimitive whether it is required to be primitive + * @param bAllowBlank whether blank / null is allowed + * @return true if all checks went through otherwise false + */ + private boolean valueCheck(JsonObject values, String element, boolean bRequirePrimitive, boolean bAllowBlank) { + if (!values.has(element)) { + return false; + } + + if (bRequirePrimitive && !values.get(element).isJsonPrimitive()) { + return false; + } + + if (!bAllowBlank && Strings.isNullOrEmpty(values.get(element).getAsString())) { + return false; + } + + return true; + } + + /** + * Support choices on LIST type parameters + * + * If the value given is an array, the first value will be used in a FULL load, + * and the second value will be used in Incremental load + * + * @param listValue the definition of LIST parameter, which can be a String or JsonArray + * @param state the State object + * @return the string value 
if it is primitive, or the chosen string value based on extract mode + */ + private String parseListParameter(JsonElement listValue, State state) { + String listValueString = ""; + + if (listValue == null || listValue.isJsonNull()) { + return listValueString; + } + + if (listValue.isJsonPrimitive()) { + listValueString = listValue.getAsString(); + } else if (listValue.isJsonArray() && listValue.getAsJsonArray().size() > 0) { + if (MultistageProperties.EXTRACT_IS_FULL.getValidNonblankWithDefault(state)) { + listValueString = listValue.getAsJsonArray().get(0).getAsString(); + } else { + listValueString = listValue.getAsJsonArray().size() > 1 + ? listValue.getAsJsonArray().get(1).getAsString() + : listValue.getAsJsonArray().get(0).getAsString(); + } + } else { + listValueString = ""; + log.warn("Unable to parse LIST parameter {}, will use a BLANK string", listValue.toString()); + } + return listValueString; + } +} \ No newline at end of file diff --git a/dil/src/main/java/com/linkedin/dil/util/JsonUtils.java b/dil/src/main/java/com/linkedin/dil/util/JsonUtils.java new file mode 100644 index 0000000..de12e57 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/JsonUtils.java @@ -0,0 +1,232 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import com.google.common.collect.Lists; +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonNull; +import com.google.gson.JsonObject; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import org.testng.Assert; + + +public interface JsonUtils { + Gson GSON = new Gson(); + + /** + * This deepCopy is a workaround. When it is possible to upgrade Gson to 2.8.1+, + * we shall change code to use Gson deepCopy. + * + * This function is intended to use use small Json, like schema objects. It is not + * suitable to deep copy large Json objects. 
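+ *
+ * A minimal sketch of the intended use (hypothetical schema object, for illustration only):
+ * <pre>
+ *   JsonObject schema = new Gson().fromJson("{\"type\": \"string\"}", JsonObject.class);
+ *   JsonObject copy = JsonUtils.deepCopy(schema).getAsJsonObject();
+ * </pre>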
+ * + * @param source the source Json object, can be JsonArray, JsonObject, or JsonPrimitive + * @return the deeply copied Json object + */ + static JsonElement deepCopy(JsonElement source) { + return GSON.fromJson(source.toString(), source.getClass()); + } + + /** + * Check if JsonObject A contains everything in b + * @param superObject the super set JsonObject + * @param subObject the sub set JsonObject + * @return if superObject doesn't have an element in b, or the value of an element in superObject differs with + * the same element in b, return false, else return true + */ + static boolean contains(JsonObject superObject, JsonObject subObject) { + for (Map.Entry entry: subObject.entrySet()) { + if (!superObject.has(entry.getKey()) + || !superObject.get(entry.getKey()).toString().equalsIgnoreCase(entry.getValue().toString())) { + return false; + } + } + return true; + } + + /** + * Check if JsonObject A contains everything in b + * @param superString the super set JsonObject + * @param subObject the sub set JsonObject + * @return if a doesn't have an element in b, or the value of an element in a differs with + * the same element in b, return false, else return true + */ + static boolean contains(String superString, JsonObject subObject) { + JsonObject a = GSON.fromJson(superString, JsonObject.class); + return contains(a, subObject); + } + + /** + * Replace parts of Original JsonObject with substitutes + * @param origObject the original JsonObject + * @param newComponent the substitution values + * @return the replaced JsonObject + */ + static JsonObject replace(JsonObject origObject, JsonObject newComponent) { + JsonObject replacedObject = new JsonObject(); + for (Map.Entry entry: origObject.entrySet()) { + if (newComponent.has(entry.getKey())) { + replacedObject.add(entry.getKey(), newComponent.get(entry.getKey())); + } else { + replacedObject.add(entry.getKey(), entry.getValue()); + } + } + return replacedObject; + } + + /** + * This function makes up the inefficiency in GSON by creating an + * JsonObject and add a pair of properties to it, and return the newly + * created JsonObject + * + * @param key the property key + * @param value the property value + * @return the newly created JsonObject + */ + static JsonObject createAndAddProperty(String key, String value) { + JsonObject newObject = new JsonObject(); + newObject.addProperty(key, value); + return newObject; + } + + /** + * This function makes up the inefficiency in GSON by creating an + * JsonObject and add a new element to it, and return the newly + * created JsonObject + * + * @param name the name of the new element + * @param element the new element + * @return the newly created JsonObject + */ + static JsonObject createAndAddElement(String name, JsonElement element) { + JsonObject newObject = new JsonObject(); + newObject.add(name, element); + return newObject; + } + + /** + * From an array of KV pairs, retrieve the record value + * @param key key name + * @param kvPairs the array or KV pairs + * @return the value if the key exists + */ + static JsonElement get(final String key, final JsonArray kvPairs) { + for (JsonElement element: kvPairs) { + if (element.isJsonObject() && element.getAsJsonObject().has(key)) { + return element.getAsJsonObject().get(key); + } + } + return JsonNull.INSTANCE; + } + + /** + * From an array of JsonObjects, retrieve a value by searching by key-value pair, and + * once the JsonObject is found, it returns the element located by the JsonPath, + * specified by returnKey. 
This utility is mostly used for schema manipulations. + * + * @param searchKey key name to search in order to identify the JsonObject + * @param value value to match in order to identify the JsonObject + * @param returnKey the Json path to identify the return value + * @param objArray the array of JsonObjects + * @return the identified element of a JsonObject within the array + */ + static JsonElement get(final String searchKey, final String value, final String returnKey, final JsonArray objArray) { + for (JsonElement element: objArray) { + if (element.isJsonObject() + && element.getAsJsonObject().has(searchKey) + && element.getAsJsonObject().get(searchKey).getAsString().equalsIgnoreCase(value)) { + return get(returnKey, element.getAsJsonObject()); + } + } + return JsonNull.INSTANCE; + } + + /** + * Get a JsonElement from a JsonObject based on the given JsonPath + * + * @param row the record contains the data element + * @param jsonPath the JsonPath (string) how to get the data element + * @return the data element at the JsonPath position, or JsonNull if error + */ + static JsonElement get(JsonObject row, String jsonPath) { + return get(jsonPath, row); + } + + /** + * Get a JsonElement from a JsonObject based on the given JsonPath + * + * @param nested the record contains the data element + * @param jsonPath the JsonPath (string) how to get the data element + * @return the data element at the JsonPath position, or JsonNull if error + */ + static JsonElement get(String jsonPath, JsonObject nested) { + Assert.assertNotNull(jsonPath); + List path = Lists.newArrayList(jsonPath.split("\\.")); + + if (path.size() == 0 || nested == null || nested.isJsonNull()) { + return JsonNull.INSTANCE; + } + return get(path.iterator(), nested); + } + + /** + * Get a JsonElement from an arbitrary JsonElement based on the given JsonPath + * + * @param nested the JsonElement to search + * @param jsonPath the JsonPath (Iterator of String) how to get the data element + * @return the data element at the JsonPath position, or JsonNull if error + */ + static JsonElement get(Iterator jsonPath, JsonElement nested) { + if (nested.isJsonObject()) { + return get(jsonPath, nested.getAsJsonObject()); + } else if (nested.isJsonArray()) { + return get(jsonPath, nested.getAsJsonArray()); + } + return jsonPath.hasNext() ? JsonNull.INSTANCE : nested; + } + + /** + * Get a JsonElement from a JsonObject based on the given JsonPath + * + * @param nested the JsonObject to search + * @param jsonPath the JsonPath (Iterator of String) how to get the data element + * @return the data element at the JsonPath position, or JsonNull if error + */ + static JsonElement get(Iterator jsonPath, JsonObject nested) { + if (!jsonPath.hasNext()) { + return nested; + } + String name = jsonPath.next(); + return nested != null && !nested.isJsonNull() && nested.has(name) + ? get(jsonPath, nested.get(name)) + : JsonNull.INSTANCE; + } + + /** + * Get a JsonElement from a JsonArray based on the given JsonPath + * + * @param nested the JsonArray to search + * @param jsonPath the JsonPath (Iterator of String) how to get the data element + * @return the data element at the JsonPath position, or JsonNull if error + */ + static JsonElement get(Iterator jsonPath, JsonArray nested) { + if (!jsonPath.hasNext()) { + return nested; + } + String indexStr = jsonPath.next(); + try { + int index = Integer.parseInt(indexStr); + return nested != null && !nested.isJsonNull() && index >= 0 && index < nested.size() + ? 
get(jsonPath, nested.get(index)) + : JsonNull.INSTANCE; + } catch (Exception e) { + return JsonNull.INSTANCE; + } + } +} diff --git a/dil/src/main/java/com/linkedin/dil/util/ParameterTypes.java b/dil/src/main/java/com/linkedin/dil/util/ParameterTypes.java new file mode 100644 index 0000000..c85f5b7 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/ParameterTypes.java @@ -0,0 +1,35 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +/** + * + * ParameterTypes defines a list of acceptable types of parameters in + * Gobblin configuration file for jobs using multi-stage connectors. + * + */ + +public enum ParameterTypes { + LIST("list"), + OBJECT("object"), + WATERMARK("watermark"), + SESSION("session"), + PAGESTART("pagestart"), + PAGESIZE("pagesize"), + PAGENO("pageno"), + JSONARRAY("jsonarray"), + JSONOBJECT("jsonobject"); + + private final String name; + + ParameterTypes(String name) { + this.name = name; + } + + @Override + public String toString() { + return name; + } +} \ No newline at end of file diff --git a/dil/src/main/java/com/linkedin/dil/util/SchemaBuilder.java b/dil/src/main/java/com/linkedin/dil/util/SchemaBuilder.java new file mode 100644 index 0000000..9d33038 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/SchemaBuilder.java @@ -0,0 +1,930 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableMap; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonNull; +import com.google.gson.JsonObject; +import com.google.gson.JsonPrimitive; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import org.apache.commons.lang.StringUtils; +import org.apache.log4j.Logger; + +import static com.linkedin.dil.configuration.StaticConstants.*; + + +/** + * This utility class aims to simplify the structure manipulation of JsonSchema. 
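+ *
+ * A typical flow, as an illustrative sketch only: infer a builder from sample data, then
+ * emit an Avro-flavored schema for the converters:
+ * <pre>
+ *   SchemaBuilder builder = SchemaBuilder.fromJsonData("{\"id\": 1, \"name\": \"abc\"}");
+ *   JsonElement altSchema = builder.buildAltSchema();
+ * </pre>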
+ * + * At the same time, we are deprecating JsonSchema class + */ +public class SchemaBuilder { + final private static JsonElement JSON_NULL_STRING = new JsonPrimitive("null"); + + final public static int UNKNOWN = 0; + final public static int RECORD = 1; + final public static int ARRAY = 2; + final public static int PRIMITIVE = 3; + + final private static ImmutableMap COMMON = ImmutableMap.of( + KEY_WORD_RECORD, RECORD, + KEY_WORD_OBJECT, RECORD, + KEY_WORD_ARRAY, ARRAY, + KEY_WORD_PRIMITIVE, PRIMITIVE, + KEY_WORD_UNKNOWN, UNKNOWN); + + private int type = UNKNOWN; + private String name; + private String primitiveType = null; + private boolean isNullable = true; + List elements = new ArrayList<>(); + + /** + * create a root builder with a list of children + * @param type the type of builder element + * @param isNullable the nullability of builder element + * @param elements the child elements + */ + public SchemaBuilder(final int type, final boolean isNullable, List elements) { + this("root", type, isNullable, elements); + } + + /** + * create a builder with a list of children + * @param name the element name + * @param type the type of builder element + * @param isNullable the nullability of builder element + * @param elements the child elements + */ + public SchemaBuilder(final String name, final int type, final boolean isNullable, List elements) { + this.name = name; + this.type = type; + this.isNullable = isNullable; + this.elements.addAll(elements); + } + + /** + * create a builder with a single child + * @param name the element name + * @param type the type of builder element + * @param isNullable the nullability of builder element + * @param element the child element + */ + public SchemaBuilder(final String name, final int type, final boolean isNullable, SchemaBuilder element) { + this.name = name; + this.type = type; + this.isNullable = isNullable; + this.elements.add(element); + } + + /** + * This is main method to parse an Avro schema, and make it into a builder. + * The builder can then be used to produce schema strings in other syntax + * + * @param json the JsonObject object representing an Avro schema + * @return a SchemaBuilder + */ + public static SchemaBuilder fromAvroSchema(JsonElement json) { + assert json.isJsonObject(); + return new SchemaBuilder(RECORD, json.getAsJsonObject().get(KEY_WORD_FIELDS)); + } + + /** + * This is the main method to parse a Json sample data and infer the schema from + * the sample. The method uses the {@link Generator} as a helper in doing so. + * + * The inferred SchemaBuilder can then be used to produce strings in other syntax. + * + * @param data the Json data sample + * @return the inferred SchemaBuilder + */ + public static SchemaBuilder fromJsonData(JsonElement data) { + return new Generator(data).getSchemaBuilder(); + } + + /** + * This is the main method to parse a Json sample data and infer the schema from + * the sample. The method uses the {@link Generator} as a helper in doing so. + * + * The inferred SchemaBuilder can then be used to produce strings in other syntax. + * + * @param data the Json data sample + * @return the inferred SchemaBuilder + */ + public static SchemaBuilder fromJsonData(String data) { + return fromJsonData(GSON.fromJson(data, JsonElement.class)); + } + + + /** + * This is the main method to convert a schema definition in Json Schema syntax, + * and product a SchemaBuilder object. The SchemaBuilder object can then be used + * to product a string in Avro schema syntax. 
+ * + * The input JsonSchema can be in 3 forms: + * 1. an anonymous object with a list columns, there is no top level type element + * 2. an anonymous array with a list of columns, there is a top level array type element + * 3. an anonymous object with a list of columns, there is top level object type element + * + * @param jsonSchema the schema of a Json dataset in the Json Schema syntax + * @return a SchemaBuilder + */ + public static SchemaBuilder fromJsonSchema(JsonObject jsonSchema) { + int rootType = getType(jsonSchema); + return new SchemaBuilder(rootType == UNKNOWN ? RECORD : rootType, true, fromJsonComplexSchema(jsonSchema)); + } + + /** + * Override version to accept Json Schema string + * + * @param jsonSchema the schema of a Json dataset in the Json Schema syntax + * @return a SchemaBuilder + */ + public static SchemaBuilder fromJsonSchema(String jsonSchema) { + return fromJsonSchema(GSON.fromJson(jsonSchema, JsonObject.class)); + } + + /** + * Parse a Json Schema and convert the elements to a SchemaBuilder list + * @param complexSchema a schema in Json Schema syntax + * @return the converted list of elements + */ + private static List fromJsonComplexSchema(JsonObject complexSchema) { + switch (getType(complexSchema)) { + case ARRAY: + List elements = fromJsonComplexSchema(complexSchema.get(KEY_WORD_ITEMS).getAsJsonObject()); + if (getType(complexSchema.get(KEY_WORD_ITEMS).getAsJsonObject()) == PRIMITIVE) { + SchemaBuilder element = new SchemaBuilder(KEY_WORD_ARRAY_ITEM, PRIMITIVE, true, new ArrayList<>()); + element.setNullable(checkNullable(complexSchema.get(KEY_WORD_ITEMS).getAsJsonObject())); + element.setPrimitiveType(getNestedType(complexSchema.get(KEY_WORD_ITEMS).getAsJsonObject())); + elements.add(element); + } else if (getType(complexSchema.get(KEY_WORD_ITEMS).getAsJsonObject()) == RECORD) { + List singletonList = new ArrayList<>(); + singletonList.add(new SchemaBuilder(KEY_WORD_ARRAY_ITEM, RECORD, true, elements)); + return singletonList; + } + return elements; + case RECORD: + return fromJsonComplexSchema(complexSchema.get(KEY_WORD_PROPERTIES).getAsJsonObject()); + case PRIMITIVE: + return new ArrayList<>(); + default: + return fromJsonObjectSchema(complexSchema.getAsJsonObject()); + } + } + + /** + * Process a untyped Json schema object as a set of columns. If a Json + * schema object has no "type" element, the object is treated as a table + * or sub-table, and each entry of the JsonObject is treated as a column. 
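+ *
+ * A hypothetical input, for illustration only:
+ * <pre>
+ *   {"id": {"type": "integer"}, "name": {"type": ["string", "null"]}}
+ * </pre>
+ * yields two primitive columns, with "name" marked nullable.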
+ * + * @param jsonObject a untyped JsonObject of columns + * @return a list of columns each defined as a SchemaBuilder + */ + private static List fromJsonObjectSchema(JsonObject jsonObject) { + List columns = new ArrayList<>(); + jsonObject.entrySet().iterator().forEachRemaining(x -> { + boolean isNullable = checkNullable(x.getValue().getAsJsonObject()); + int type = getType(x.getValue().getAsJsonObject()); + String primitiveType = getNestedType(x.getValue().getAsJsonObject()); + SchemaBuilder column = new SchemaBuilder(x.getKey(), type, true, + fromJsonComplexSchema(x.getValue().getAsJsonObject())); + column.setNullable(isNullable); + if (type == PRIMITIVE) { + column.setPrimitiveType(primitiveType); + } + columns.add(column); + }); + return columns; + } + + /** + * Hidden constructor that is called by public static function only + * @param type the type of the schema element + * @param json the Json presentation of the schema element + */ + private SchemaBuilder(final int type, JsonElement json) { + this("root", type, json); + } + + /** + * Hidden constructor that is called by internal parsing functions only + * @param name the name of the schema element + * @param type the type of the schema element + * @param json the Json presentation of the schema element + */ + private SchemaBuilder(final String name, final int type, final JsonElement json) { + parseAvroSchema(name, type, json); + } + + /** + * Hidden initialization function + * @param name the name of the schema element + * @param type the type of the schema element + * @param json the Json presentation of the schema element + */ + private void parseAvroSchema(final String name, final int type, final JsonElement json) { + this.type = type == UNKNOWN ? getType(json) : type; + this.isNullable = !name.equals("root") && checkNullable(json.getAsJsonObject()); + this.name = name; + switch (this.type) { + case RECORD: + if (name.equals("root")) { + for (JsonElement field: json.getAsJsonArray()) { + elements.add(new SchemaBuilder(field.getAsJsonObject().get(KEY_WORD_NAME).getAsString(), UNKNOWN, field)); + } + } else { + for (JsonElement field: getFields(json.getAsJsonObject())) { + elements.add(new SchemaBuilder(field.getAsJsonObject().get(KEY_WORD_NAME).getAsString(), UNKNOWN, field)); + } + } + break; + case ARRAY: + elements.add(new SchemaBuilder("arrayItem", UNKNOWN, getItems(json.getAsJsonObject()))); + break; + default: //PRIMITIVE + this.primitiveType = getNestedType(json.getAsJsonObject()); + break; + } + } + + /** + * Get the schema element type + * @param json the Json presentation of the schema element + * @return the schema element type + */ + private static int getType(JsonElement json) { + if (json.isJsonPrimitive()) { + return getType(json.getAsString()); + } + return getType(getNestedType(json.getAsJsonObject())); + } + + /** + * Get the schema element type string from a straight or a unionized schema element + * @param json the Json presentation of the schema element + * @return the schema element type string + */ + private static String getNestedType(JsonObject json) { + JsonElement type = json.get(KEY_WORD_TYPE); + + if (type == null) { + return KEY_WORD_UNKNOWN; + } + + if (type.isJsonPrimitive()) { + return type.getAsString(); + } + + if (type.isJsonObject()) { + if (type.getAsJsonObject().has(KEY_WORD_SOURCE_TYPE)) { + return type.getAsJsonObject().get(KEY_WORD_SOURCE_TYPE).getAsString(); + } + return type.getAsJsonObject().get(KEY_WORD_TYPE).getAsString(); + } + + if (type.isJsonArray()) { + Set items = new 
HashSet<>(); + type.getAsJsonArray().iterator().forEachRemaining(items::add); + items.remove(JSON_NULL_STRING); + JsonElement trueType = items.iterator().next(); + if (trueType.isJsonPrimitive()) { + return trueType.getAsString(); + } else if (trueType.isJsonObject()) { + if (trueType.getAsJsonObject().has(KEY_WORD_SOURCE_TYPE)) { + return trueType.getAsJsonObject().get(KEY_WORD_SOURCE_TYPE).getAsString(); + } else if (trueType.getAsJsonObject().has(KEY_WORD_TYPE)) { + return trueType.getAsJsonObject().get(KEY_WORD_TYPE).getAsString(); + } + } + } + return KEY_WORD_UNKNOWN; + } + + /** + * Map a string type to internal presentation of integer type + * @param type the schema element type string + * @return the schema element type integer + */ + private static int getType(String type) { + return COMMON.getOrDefault(type, PRIMITIVE); + } + + /** + * Check if an schema element is nullable, this is for AVRO schema parsing only + * @param json the Json presentation of the schema element + * @return nullability + */ + private static boolean checkNullable(JsonObject json) { + JsonElement type = json.get(KEY_WORD_TYPE); + if (type.isJsonPrimitive()) { + return type.equals(JSON_NULL_STRING); + } + + if (type.isJsonObject()) { + return type.getAsJsonObject().get(KEY_WORD_TYPE).equals(JSON_NULL_STRING); + } + + if (type.isJsonArray()) { + return type.getAsJsonArray().contains(JSON_NULL_STRING); + } + + return true; + } + + /** + * Parse out the "fields" from an Avro schema + * + * If the schema is a nullable record, the type would be a union, and fields + * data element would be hidden in an JsonArray. In such cases, we will get the + * fields from the first non-null type. + * + * TODO: unions of more than 1 non-null types are not supported right now + * + * @param record the "record" schema element + * @return the fields of the record + */ + private JsonArray getFields(JsonObject record) { + if (record.has(KEY_WORD_FIELDS)) { + return record.get(KEY_WORD_FIELDS).getAsJsonArray(); + } + + if (record.get(KEY_WORD_TYPE).isJsonObject()) { + return record.get(KEY_WORD_TYPE).getAsJsonObject().get(KEY_WORD_FIELDS).getAsJsonArray(); + } + + if (record.get(KEY_WORD_TYPE).isJsonArray()) { + Set union = new HashSet<>(); + record.get(KEY_WORD_TYPE).getAsJsonArray().iterator().forEachRemaining(union::add); + union.remove(JSON_NULL_STRING); + if (union.iterator().hasNext()) { + JsonElement next = union.iterator().next(); + if (next.isJsonObject() && next.getAsJsonObject().has(KEY_WORD_FIELDS)) { + return next.getAsJsonObject().get(KEY_WORD_FIELDS).getAsJsonArray(); + } + } + } + + return new JsonArray(); + } + + /** + * Parse out the array items in an Avro schema, current an array item can be a record, + * a primitive, a null, or a union of any two of them. However, the union shall not be + * more than 2 types. 
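+ *
+ * A hypothetical "array" field definition, for illustration only:
+ * <pre>
+ *   {"name": "tags", "type": {"type": "array", "items": {"type": "string"}}}
+ * </pre>
+ * here the inner "items" object is what gets returned.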
+ * + * @param array the "array" schema element + * @return the array item + */ + private JsonObject getItems(JsonObject array) { + if (array.get(KEY_WORD_TYPE).isJsonObject()) { + return array.get(KEY_WORD_TYPE).getAsJsonObject().get(KEY_WORD_ITEMS).getAsJsonObject(); + } + + Set union = new HashSet<>(); + array.get(KEY_WORD_TYPE).getAsJsonArray().iterator().forEachRemaining(union::add); + union.remove(JSON_NULL_STRING); + return union.iterator().next().getAsJsonObject().get(KEY_WORD_ITEMS).getAsJsonObject(); + } + + /** + * Build into a Json schema definition that can be converted to a string + * @return a Json schema + */ + public JsonObject buildJsonSchema() { + return buildJsonSchema(false); + } + + /** + * Build into a Json schema definition that can be converted to a string + * @param includeRootType whether to include the root type element + * @return a Json schema + */ + public JsonObject buildJsonSchema(boolean includeRootType) { + if (this.type == RECORD || type == ARRAY) { + JsonObject fields = new JsonObject(); + + if (type == ARRAY && elements.size() == 1 && elements.get(0).getType() == PRIMITIVE) { + fields = elements.get(0).buildJsonSchema(includeRootType); + } else { + for (SchemaBuilder field : elements) { + if (field.getName().equals(KEY_WORD_ARRAY_ITEM) && elements.size() == 1) { + fields = field.buildJsonSchema(includeRootType); + } else { + fields.add(field.getName(), field.buildJsonSchema(includeRootType)); + } + } + } + + if (name.equals(KEY_WORD_ROOT) && !includeRootType) { + return fields; + } + + if (type == ARRAY) { + JsonObject array = JsonUtils.createAndAddProperty(KEY_WORD_TYPE, KEY_WORD_ARRAY); + array.add(KEY_WORD_ITEMS, fields); + return array; + } + + if (name.equals(KEY_WORD_ARRAY_ITEM)) { + JsonObject arrayItem = JsonUtils.createAndAddProperty(KEY_WORD_TYPE, KEY_WORD_OBJECT); + arrayItem.add(KEY_WORD_PROPERTIES, fields); + return arrayItem; + } + + JsonObject subTable = JsonUtils.createAndAddProperty(KEY_WORD_TYPE, KEY_WORD_OBJECT); + subTable.add(KEY_WORD_PROPERTIES, fields); + return subTable; + } else { + String nullableType = isNullable && !primitiveType.equalsIgnoreCase(KEY_WORD_NULL) + ? KEY_WORD_NULLABLE + primitiveType : primitiveType; + return addPrimitiveSchemaType(JsonElementTypes.valueOf(nullableType.toUpperCase())); + } + } + + /** + * Build into a Avro flavored, but not true Avro, schema that can be fed into + * Json2Avro converter + * + * @return Json presentation of the Avro flavored schema + */ + public JsonElement buildAltSchema() { + return buildAltSchema(new HashMap<>(), false, null, null, false); + } + + /** + * Build into a Avro flavored, but not true Avro, schema that can be fed into + * Json2Avro converter. Along the way, schema names are cleansed if special characters + * are present. + * + * This method is called recursively, therefore the return element can be a JsonArray, + * for root schema, and JsonObject for child schema elements, primitive columns, + * arrays, and sub-tables. 
+ * + * @param defaultTypes the default data types are used to explicitly assign types that cannot be inferred + * correctly + * @param enableCleansing if cleansing is enabled, field names will be checked and special characters will + * be replaced with a replacement character + * @param pattern the search pattern of schema cleansing + * @param replacement the replacement string for schema cleansing + * @param nullable whether to force output all columns as nullable + * @return the Avro flavored schema definition + */ + public JsonElement buildAltSchema(Map defaultTypes, + boolean enableCleansing, + String pattern, + String replacement, + boolean nullable) { + JsonObject nestedType = new JsonObject(); + if (this.type == RECORD + || (this.type == ARRAY && this.elements.size() > 1)) { + JsonArray fields = new JsonArray(); + for (SchemaBuilder field : elements) { + fields.add(field.buildAltSchema(defaultTypes, enableCleansing, pattern, replacement, nullable)); + } + if (name.equals("root") || type == ARRAY) { + return fields; + } + nestedType.addProperty(KEY_WORD_TYPE, KEY_WORD_RECORD); + nestedType.addProperty(KEY_WORD_NAME, this.name); + nestedType.add(KEY_WORD_VALUES, fields); + } else if (this.type == ARRAY) { + nestedType.addProperty(KEY_WORD_TYPE, KEY_WORD_ARRAY); + nestedType.addProperty(KEY_WORD_NAME, this.name); + if (this.elements.get(0).getType() == PRIMITIVE) { + nestedType.addProperty(KEY_WORD_ITEMS, this.elements.get(0).getPrimitiveType()); + } else { + nestedType.add(KEY_WORD_ITEMS, + this.elements.get(0).buildAltSchema(defaultTypes, enableCleansing, pattern, replacement, nullable)); + } + } else { + nestedType.addProperty(KEY_WORD_TYPE, defaultTypes.getOrDefault(this.getName(), this.primitiveType)); + } + + JsonObject column = new JsonObject(); + column.addProperty(KEY_WORD_COLUMN_NAME, enableCleansing ? name.replaceAll(pattern, replacement) : name); + // no explicit nullable setting for sub-tables + if (this.type != RECORD) { + column.addProperty(KEY_WORD_DATA_IS_NULLABLE, nullable || this.isNullable); + } + column.add(KEY_WORD_DATA_TYPE, nestedType); + return column; + } + + public boolean isNullable() { + return this.isNullable; + } + + public SchemaBuilder setNullable(boolean nullable) { + this.isNullable = nullable; + return this; + } + + public String getName() { + return this.name; + } + + public SchemaBuilder setName(String name) { + this.name = name; + return this; + } + + /** + * Retrieve the primitive type string, and convert null to string + * + * @return the primitive type string + */ + public String getPrimitiveType() { + return isNullable && primitiveType.equalsIgnoreCase(KEY_WORD_NULL) + ? 
KEY_WORD_STRING : this.primitiveType; + } + + /** + * Set the primitive type and return an object for chained calls + * + * @param primitiveType the primitive type string + * @return the builder object itself + */ + public SchemaBuilder setPrimitiveType(String primitiveType) { + if (primitiveType.equalsIgnoreCase(KEY_WORD_NULL)) { + this.isNullable = true; + this.primitiveType = KEY_WORD_STRING; + } else { + this.primitiveType = primitiveType; + } + return this; + } + + public int getType() { + return type; + } + + /** + * Add a type description to the schema element + * + * Primitive schema item types can have values like following: + * {"type", "null"} + * {"type", "string"} + * {"type", ["string", "null"]} + * + * @param itemType the type of this JsonSchema item + * @return the modified JsonSchema item with the specified type + */ + public JsonObject addPrimitiveSchemaType(JsonElementTypes itemType) { + if (itemType.isNull()) { + return JsonUtils.createAndAddProperty(KEY_WORD_TYPE, "null"); + } + + if (itemType.isNullable()) { + JsonArray typeArray = new JsonArray(); + typeArray.add(itemType.reverseNullability().toString()); + typeArray.add("null"); + return JsonUtils.createAndAddElement(KEY_WORD_TYPE, typeArray); + } + + return JsonUtils.createAndAddProperty(KEY_WORD_TYPE, itemType.toString()); + } + + /** + * This utility class helps parse Json data and infer schema. + * + * Json data have a very loose schema definition, data elements can have incomplete structure from record + * to record. In order properly infer a complete schema, a batch of records is necessary. + * + * TODO: to be able to parse a stream of Json records because event a batch of records sometimes + * are insufficient. + * + */ + public static class Generator { + final private static Logger LOGGER = Logger.getLogger(Generator.class); + private JsonElement data; + private boolean pivoted = false; + + public Generator(JsonElement data) { + this.data = data; + } + + public Generator(JsonElement data, boolean pivoted) { + this.data = data; + this.pivoted = pivoted; + } + + /** + * Given a Generator initialized with a Json object, a JsonObject or JsonArray, + * the function infers the schema using the best guess and stores it in a + * SchemaBuilder, which can then be used to produce the schema strings in + * various formats. + * + * @return the SchemaBuilder that store the schema structure + */ + public SchemaBuilder getSchemaBuilder() { + List elements = new ArrayList<>(); + if (data.isJsonObject()) { + data.getAsJsonObject().entrySet().iterator().forEachRemaining(x -> elements.add( + new Generator(x.getValue(), true).getSchemaBuilder().setName(x.getKey()))); + return new SchemaBuilder(SchemaBuilder.RECORD, true, elements); + } + + if (data.isJsonPrimitive() || data.isJsonNull()) { + return new SchemaBuilder(KEY_WORD_UNKNOWN, SchemaBuilder.PRIMITIVE, data.isJsonNull(), new ArrayList<>()) + .setPrimitiveType(inferPrimitiveType(data)); + } + + if (!pivoted) { + data = pivotJsonArray(data.getAsJsonArray()); + } + + if (data.getAsJsonArray().size() > 0 && data.getAsJsonArray().get(0).isJsonArray()) { + data.getAsJsonArray().iterator().forEachRemaining(x -> elements.add(inferColumnSchemaFromSample(x.getAsJsonArray()))); + } else { + elements.add(new SchemaBuilder(KEY_WORD_UNKNOWN, + SchemaBuilder.PRIMITIVE, data.getAsJsonArray().size() == 0, new ArrayList<>()) + .setPrimitiveType(data.getAsJsonArray().size() == 0 ? 
KEY_WORD_NULL + : inferPrimitiveType(data.getAsJsonArray().get(0)))); + } + + if (!pivoted) { + return new SchemaBuilder(SchemaBuilder.RECORD, true, elements); + } + + return new SchemaBuilder(KEY_WORD_UNKNOWN, SchemaBuilder.ARRAY, elements.get(0).isNullable(), elements); + } + + /** + * This function infers schema from a sample that is structured as a JsonArray and stores + * the schema in SchemaBuilder, which can then be appended to higher level schema as + * child element. + * + * The sample data input should be data from the same column. + * + * @param data sample data as JsonArray + * @return the inferred schema + */ + private SchemaBuilder inferColumnSchemaFromSample(JsonArray data) { + // if it is a blank array + if (data.size() == 0) { + return new SchemaBuilder(KEY_WORD_UNKNOWN, SchemaBuilder.PRIMITIVE, true, new ArrayList<>()) + .setPrimitiveType(KEY_WORD_NULL); + } + + JsonElementTypes itemType = JsonElementTypes.getTypeFromMultiple(data); + + // if it is a sub table array, or an array of arrays + if (itemType.isObject()) { + return inferSchemaFromKVPairs(data); + } + + if (itemType.isArray()) { + return inferSchemaFromNestedArray(data); + } + + // if it is an array of primitives + return new SchemaBuilder(KEY_WORD_UNKNOWN, SchemaBuilder.PRIMITIVE, + itemType.isNullable(), new ArrayList<>()).setPrimitiveType(itemType.getAltName()); + } + + /** + * This function takes an array of name value pairs and infer their schema + * + * @param data A Json array of objects + * @return inferred Schema Builder + */ + private SchemaBuilder inferSchemaFromKVPairs(JsonArray data) { + // ignore potentially null values at the beginning + int i = 0; + while (i < data.size() && (isEmpty(data.get(i)) + || isEmpty(data.get(i).getAsJsonObject().entrySet().iterator().next().getValue()))) { + ++i; + } + + // for placeholder type of fields, all values will be null, i will be larger than size + // in this case, we just reset i to 0 + Map.Entry dataEntry = data.get(i >= data.size() ? 
0 : i) + .getAsJsonObject().entrySet().iterator().next(); + String memberKey = dataEntry.getKey(); + JsonArray objectData = getValueArray(data); + + // The value has a nested sub-table + if (isSubTable(dataEntry.getValue())) { + return new Generator(objectData).getSchemaBuilder().setName(memberKey); + } + + JsonElementTypes subType = JsonElementTypes.getTypeFromMultiple(objectData); + + // The value has an array + if (subType.isArray()) { + return new SchemaBuilder(memberKey, SchemaBuilder.ARRAY, subType.isNullable(), + inferSchemaFromNestedArray(objectData)); + } + + // The values primitive + return new SchemaBuilder(memberKey, SchemaBuilder.PRIMITIVE, subType.isNullable(), + new ArrayList<>()).setPrimitiveType(subType.getAltName()); + } + + /** + * This function takes an array of rows and infer the row schema column by column + * + * @param data an array of rows with 1 or more columns + * @return the inferred Schema Builder + */ + private SchemaBuilder inferSchemaFromNestedArray(JsonArray data) { + // strip off one layer of array because data is like + // [[{something}],[{something}]] + JsonArray arrayData = new JsonArray(); + for (JsonElement element: data) { + if (element.isJsonNull() || element.getAsJsonArray().size() == 0) { + arrayData.add(JsonNull.INSTANCE); + } else { + arrayData.addAll(element.getAsJsonArray()); + } + } + JsonElementTypes subType = JsonElementTypes.getTypeFromMultiple(arrayData); + + if (subType.isObject()) { + return new SchemaBuilder(KEY_WORD_UNKNOWN, SchemaBuilder.RECORD, subType.isNullable(), + inferColumnSchemaFromSample(arrayData)); + } + + return new SchemaBuilder(KEY_WORD_UNKNOWN, SchemaBuilder.PRIMITIVE, subType.isNullable(), + new ArrayList<>()).setPrimitiveType(subType.getAltName()); + } + + /** + * Infer schema from a primitive + * @param data the primitive data value + * @return a string of the primitive type + */ + private String inferPrimitiveType(JsonElement data) { + assert data.isJsonPrimitive() || data.isJsonNull(); + return data.isJsonNull() ? KEY_WORD_NULL + : data.toString().matches("^\".*\"$") ? KEY_WORD_STRING + : data.getAsString().toLowerCase().matches("(true|false)") ? KEY_WORD_BOOLEAN + : inferNumeric(data.getAsString()); + } + + /** + * Infer whether the numeric value is an integer. We are not differentiating other + * numeric types since Json treat them the same way + * @param value the numeric value + * @return integer or number + */ + private String inferNumeric(String value) { + try { + Integer.parseInt(value); + } catch (Exception e) { + return KEY_WORD_NUMBER; + } + return KEY_WORD_INTEGER; + } + + /** + * This function takes only the value part of the key value pair array + * + * @param kvArray an array of KV pairs + * @return an array contains only the value part + */ + private JsonArray getValueArray(JsonArray kvArray) { + assert kvArray.size() > 0; + + int i = 0; + while (kvArray.get(i).isJsonNull()) { + ++i; + } + String key = kvArray.get(i).getAsJsonObject().entrySet().iterator().next().getKey(); + JsonArray valueArray = new JsonArray(); + for (JsonElement element: kvArray) { + if (element.isJsonNull()) { + valueArray.add(JsonNull.INSTANCE); + } else { + valueArray.add(element.getAsJsonObject().get(key)); + } + } + return valueArray; + } + + /** + * Pivot JsonArray so that all values of the same column can be parsed altogether. + * This is important for nullability analysis. By taking only 1 record from an JsonArray + * to derive schema for the whole dataset, we would be seeing part of the types of a nullable + * column. 
+ * + * The input can be: + * 1. array of JsonObjects + * 2. array of Primitives + * 3. array of Arrays + * + * The input cannot be: + * 4. array of mixed types + * + * TODO: to handle union types, this requires further work + * + * @param data a JsonArray of records + * @return an JsonArray of JsonArrays + */ + JsonArray pivotJsonArray(JsonArray data) { + int i = 0; + JsonArray pivotedArray = new JsonArray(); + Map columnIndex = new HashMap<>(); + + while (i < data.size() + && (data.get(i).isJsonNull() || isEmpty(data.get(i)))) { + ++i; + } + + // in case data has no records, or data has only blank records, then no action and + // return a blank pivoted array. + if (i >= data.size()) { + return pivotedArray; + } + + JsonElementTypes elementType = getJsonElementType(data.get(i)); + + if (elementType == JsonElementTypes.PRIMITIVE) { + return data; + } + + for (JsonElement row: data) { + if (!row.isJsonObject()) { + LOGGER.error("Array of Arrays is not supported"); + return new JsonArray(); + } + for (Map.Entry entry : row.getAsJsonObject().entrySet()) { + if (!columnIndex.containsKey(entry.getKey())) { + pivotedArray.add(new JsonArray()); + columnIndex.put(entry.getKey(), columnIndex.size()); + } + } + } + + for (JsonElement element: data) { + if (element.isJsonNull() || isEmpty(element)) { + for (i = 0; i < columnIndex.size(); ++i) { + pivotedArray.get(i).getAsJsonArray().add(JsonNull.INSTANCE); + } + } else { + // each element might have columns in different order, + // and all elements don't have the same columns + Preconditions.checkState(elementType == JsonElementTypes.OBJECT); + for (Map.Entry entry : element.getAsJsonObject().entrySet()) { + JsonObject temp = new JsonObject(); + temp.add(entry.getKey(), entry.getValue()); + if (columnIndex.get(entry.getKey()) != null && pivotedArray.size() > columnIndex.get(entry.getKey())) { + pivotedArray.get(columnIndex.get(entry.getKey())).getAsJsonArray().add(temp); + } else { + pivotedArray.add(new JsonArray()); + columnIndex.put(entry.getKey(), columnIndex.size()); + } + } + } + } + return pivotedArray; + } + + boolean isSubTable(JsonElement data) { + return data.isJsonObject() && data.getAsJsonObject().entrySet().size() > 0; + } + + /** + * Classifies an Json element to 4 high level data types, but doesn't identify further + * detailed types of primitives + * + * @param jsonElement a Json element + * @return ARRAY, OBJECT, NULL, or PRIMITIVE + */ + JsonElementTypes getJsonElementType(JsonElement jsonElement) { + if (jsonElement.isJsonPrimitive()) { + return JsonElementTypes.PRIMITIVE; + } else if (jsonElement.isJsonNull()) { + return JsonElementTypes.NULL; + } else if (jsonElement.isJsonObject()) { + return JsonElementTypes.OBJECT; + } else { + return JsonElementTypes.ARRAY; + } + } + + /** + * in real world Json strings, empty element can be presented in different forms + * + * @param data input data to test + * @return if data represent an empty object + * + */ + boolean isEmpty(JsonElement data) { + return data == null || data.isJsonNull() || (data.isJsonObject() && data.toString().equals("{}")) || ( + data.isJsonArray() && data.toString().equals("[]")) || (data.isJsonPrimitive() && StringUtils.isEmpty( + data.getAsString())); + } + } +} diff --git a/dil/src/main/java/com/linkedin/dil/util/SchemaUtils.java b/dil/src/main/java/com/linkedin/dil/util/SchemaUtils.java new file mode 100644 index 0000000..08b5ef4 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/SchemaUtils.java @@ -0,0 +1,49 @@ +// Copyright 2021 LinkedIn Corporation. 
All rights reserved.
+// Licensed under the BSD-2 Clause license.
+// See LICENSE in the project root for license information.
+
+package com.linkedin.dil.util;
+
+import java.util.List;
+import lombok.extern.slf4j.Slf4j;
+
+
+@Slf4j
+public class SchemaUtils {
+
+  // Following best practices for utility classes to have a private constructor
+  private SchemaUtils() {
+    throw new IllegalStateException("Utility class");
+  }
+
+  /**
+   * A schema is valid when all of its columns are present in the source and in the same order.
+   * Column name matching is case insensitive.
+   * @param schemaColumns column names defined in the output schema
+   * @param sourceColumns column names at the source
+   * @return true if all columns match, false otherwise
+   *
+   * Example 1: schemaColumns: [A, c], sourceColumns: [a, B, C] ==> true
+   * Example 2: schemaColumns: [A, e], sourceColumns: [a, B, C] ==> false
+   *
+   */
+  public static boolean isValidOutputSchema(List<String> schemaColumns, List<String> sourceColumns) {
+    int i = 0;
+    int j = 0;
+    while (i < sourceColumns.size() && j < schemaColumns.size()) {
+      if (sourceColumns.get(i).equalsIgnoreCase(schemaColumns.get(j))) {
+        j++;
+      }
+      i++;
+    }
+    boolean isValidSchema = j == schemaColumns.size();
+    if (!isValidSchema) {
+      log.error(
+          "Schema columns and source columns do not match: " + "undefined columns in schema or column order mismatch");
+      log.debug("Schema columns: {}", schemaColumns);
+      log.debug("Source columns: {}", sourceColumns);
+    }
+    return isValidSchema;
+  }
+}
diff --git a/dil/src/main/java/com/linkedin/dil/util/VariableUtils.java b/dil/src/main/java/com/linkedin/dil/util/VariableUtils.java
new file mode 100644
index 0000000..d079818
--- /dev/null
+++ b/dil/src/main/java/com/linkedin/dil/util/VariableUtils.java
@@ -0,0 +1,123 @@
+// Copyright 2021 LinkedIn Corporation. All rights reserved.
+// Licensed under the BSD-2 Clause license.
+// See LICENSE in the project root for license information.
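+// Illustrative usage sketch, not part of the original patch: the VariableUtils interface defined
+// below replaces {{placeholder}} tokens in a template and tracks which parameters were not used.
+// Assuming a hypothetical template and parameter set, a call such as
+//
+//   JsonObject parameters = new Gson().fromJson("{\"host\":\"example.com\",\"extra\":\"x\"}", JsonObject.class);
+//   Pair<String, JsonObject> result =
+//       VariableUtils.replaceWithTracking("{{host}}/api?d={{date}}", parameters, false);
+//
+// would yield "example.com/api?d={{date}}" as the replaced string, with {"extra":"x"} returned as the
+// unused (remaining) parameters, because only placeholders present in the template are consumed.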
+ +package com.linkedin.dil.util; + +import com.google.common.collect.Lists; +import com.google.gson.Gson; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; + + +/** + * Utility functions for string manipulation + */ +public interface VariableUtils { + Gson GSON = new Gson(); + String OPENING_RE = "\\{\\{"; + String OPENING = "{{"; + String CLOSING = "}}"; + Pattern PATTERN = Pattern.compile(OPENING_RE + "([a-zA-Z0-9_\\-.$]+)" + CLOSING); + + /** + * Replace placeholders or variables in a JsonObject + * + * @param templateJsonObject the template JsonObject with placeholders + * @param parameters the replacement values + * @return the replaced JsonObject + * @throws UnsupportedEncodingException + */ + static JsonObject replace(JsonObject templateJsonObject, JsonObject parameters) throws UnsupportedEncodingException { + String replacedString = replaceWithTracking(templateJsonObject.toString(), parameters, false).getKey(); + return GSON.fromJson(replacedString, JsonObject.class); + } + + /** + * Replace placeholders or variables in a JsonObject + * + * @param templateJsonObject the template JsonObject with placeholders + * @param parameters the replacement values + * @param encode whether to encode the value string, note this function will not encode + * the template string in any case, the encoding only applies to the replacement values + * @return the replaced JsonObject + * @throws UnsupportedEncodingException + */ + static JsonObject replace(JsonObject templateJsonObject, JsonObject parameters, Boolean encode) + throws UnsupportedEncodingException { + String replacedString = replaceWithTracking(templateJsonObject.toString(), parameters, encode).getKey(); + return GSON.fromJson(replacedString, JsonObject.class); + } + + /** + * + * @param templateString a template string with placeholders or variables + * @param parameters the replacement values coded in a JsonObject format + * @return a pair made of replaced string and whatever parameters that were not used + * @throws UnsupportedEncodingException + */ + static Pair replaceWithTracking(String templateString, JsonObject parameters) + throws UnsupportedEncodingException { + return replaceWithTracking(templateString, parameters, false); + } + + /** + * + * @param templateString a template string with placeholders or variables + * @param parameters the replacement values coded in a JsonObject format + * @param encode whether to encode the value string, note this function will not encode + * the template string in any case, the encoding only applies to the replacement values + * @return a pair made of replaced string and whatever parameters that were not used + * @throws UnsupportedEncodingException + */ + static Pair replaceWithTracking(String templateString, JsonObject parameters, Boolean encode) + throws UnsupportedEncodingException { + String replacedString = templateString; + JsonObject remainingParameters = new JsonObject(); + + List variables = getVariables(templateString); + + for (Map.Entry entry : parameters.entrySet()) { + if (variables.contains(entry.getKey())) { + replacedString = replacedString.replace(OPENING + entry.getKey() + CLOSING, + encode ? 
URLEncoder.encode(entry.getValue().getAsString(), "UTF-8") : entry.getValue().getAsString()); + } else { + remainingParameters.add(entry.getKey(), entry.getValue()); + } + } + return new ImmutablePair<>(replacedString, remainingParameters); + } + + /** + * retrieve a list of placeholders or variables from the template, placeholders or variables are + * identified by alpha numeric strings surrounded by {{}} + * + * @param templateString the template with placeholders or variables + * @return a list of placeholders or variables + */ + static List getVariables(String templateString) { + List paramList = Lists.newArrayList(); + Matcher matcher = PATTERN.matcher(templateString); + while (matcher.find()) { + paramList.add(matcher.group(1)); + } + return paramList; + } + + /** + * Validates if a string has variables + * @param templateString + * @return true if {{}} is found else false + */ + static boolean hasVariable(String templateString) { + return PATTERN.matcher(templateString).find(); + } +} diff --git a/dil/src/main/java/com/linkedin/dil/util/WatermarkDefinition.java b/dil/src/main/java/com/linkedin/dil/util/WatermarkDefinition.java new file mode 100644 index 0000000..b57cad2 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/WatermarkDefinition.java @@ -0,0 +1,236 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Lists; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import java.util.List; +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.hadoop.util.StringUtils; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.joda.time.Period; +import org.testng.Assert; + +import static com.linkedin.dil.configuration.StaticConstants.*; + + +/** + * This class encapsulates Watermark definitions, and provide function to manage + * features generate milli-seconds or date time ranges + * + * @author chrli + */ +@Slf4j @Getter @Setter +public class WatermarkDefinition { + final private static String DEFAULT_TIMEZONE = "America/Los_Angeles"; + + public enum WatermarkTypes { + DATETIME("datetime"), + UNIT("unit"); + + private final String name; + + WatermarkTypes(String name) { + this.name = name; + } + + @Override + public String toString() { + return name; + } + } + + private String name; + private WatermarkTypes type; + private Pair range; + private Boolean isPartialPartition = false; + private WorkUnitPartitionTypes workUnitPartitionType = null; + + // units is the internal storage of work units string, it should be + // a JsonArray formatted as String + private String units; + private String timezone = ""; + + /** + * A constructor that creates a Unit watermark definition + *
+   * A Unit watermark has a list of String values
+   * @param name the name of the watermark
+   * @param units the units in a JsonArray
+   */
+  public WatermarkDefinition(String name, JsonArray units) {
+    this.setName(name);
+    this.setType(WatermarkTypes.UNIT);
+    this.setUnits(units.toString());
+  }
+
+  /**
+   * A constructor that creates a Unit watermark definition from a units string
+   *
+   *
+   * A Unit watermark has a list of name : value pairs coded as a JsonArray of JsonObjects
+   * @param name the name of the watermark
+   * @param commaSeparatedUnits the comma separated units, either a JsonArray or a simple String list
+   */
+  public WatermarkDefinition(String name, String commaSeparatedUnits) {
+    setUnits(name, commaSeparatedUnits);
+  }
+
+  /**
+   * If the string is a JsonArray, it will be stored as is.
+   * Otherwise, the string is broken down into a list of values, and the values
+   * are then combined with the unit watermark name as name : value pairs.
+   * @param name the name of the watermark
+   * @param commaSeparatedUnits the comma separated units, either a JsonArray or a simple String list
+   * @return the watermark definition object
+   */
+  public WatermarkDefinition setUnits(String name, String commaSeparatedUnits) {
+    boolean isJsonArrayUnits = true;
+    this.setName(name);
+    this.setType(WatermarkTypes.UNIT);
+    try {
+      GSON.fromJson(commaSeparatedUnits, JsonArray.class);
+    } catch (Exception e) {
+      log.info("Assuming simple Unit Watermark definition as the unit watermark cannot be converted to JsonArray");
+      log.info("Original unit watermark definition: {} : {}", name, commaSeparatedUnits);
+      isJsonArrayUnits = false;
+    }
+
+    if (isJsonArrayUnits) {
+      this.setUnits(commaSeparatedUnits);
+    } else {
+      JsonArray unitArray = new JsonArray();
+      List<String> units = Lists.newArrayList(commaSeparatedUnits.split(StringUtils.COMMA_STR));
+      for (String unit: units) {
+        JsonObject jsonObject = new JsonObject();
+        jsonObject.addProperty(name, unit);
+        unitArray.add(jsonObject);
+      }
+      this.setUnits(unitArray.toString());
+    }
+    return this;
+  }
+
+  /**
+   * A constructor that creates a Datetime watermark definition
+   *
+ * A Datetime watermark has a date range + * @param name the name of the watermark + * @param startDate the start date string in yyyy-MM-dd format + * @param endDate the end date string in yyyy-MM-dd format or - for current date + */ + public WatermarkDefinition(String name, String startDate, String endDate) { + this(name, startDate, endDate, false); + } + + public WatermarkDefinition(String name, String startDate, String endDate, boolean isPartialPartition) { + this.setName(name); + this.setType(WatermarkTypes.DATETIME); + this.setRange(new ImmutablePair<>(startDate, endDate)); + this.isPartialPartition = isPartialPartition; + } + + public WatermarkDefinition(JsonObject definition, boolean isPartialPartition) { + this(definition, isPartialPartition, null); + } + + public WatermarkDefinition(JsonObject definition, boolean isPartialPartition, + WorkUnitPartitionTypes workUnitPartitionType) { + Assert.assertTrue(definition.has(KEY_WORD_NAME)); + Assert.assertTrue(definition.has(KEY_WORD_TYPE)); + Assert.assertNotNull(WatermarkTypes.valueOf(definition.get(KEY_WORD_TYPE).getAsString().toUpperCase())); + this.setName(definition.get(KEY_WORD_NAME).getAsString()); + this.setIsPartialPartition(isPartialPartition); + if (definition.get(KEY_WORD_TYPE).getAsString().equalsIgnoreCase(WatermarkTypes.DATETIME.name)) { + this.setType(WatermarkTypes.DATETIME); + this.setRange(new ImmutablePair<>( + definition.get(KEY_WORD_RANGE).getAsJsonObject().get(KEY_WORD_RANGE_FROM).getAsString(), + definition.get(KEY_WORD_RANGE).getAsJsonObject().get(KEY_WORD_RANGE_TO).getAsString())); + this.setWorkUnitPartitionType(workUnitPartitionType); + } else if (definition.get(KEY_WORD_TYPE).getAsString().equalsIgnoreCase(WatermarkTypes.UNIT.name)) { + this.setType(WatermarkTypes.UNIT); + this.setUnits(definition.get(KEY_WORD_NAME).getAsString(), definition.get(KEY_WORD_UNITS).getAsString()); + } + } + + @VisibleForTesting + DateTime getDateTime(String input) { + DateTimeZone timeZone = DateTimeZone.forID(timezone.isEmpty() ? DEFAULT_TIMEZONE : timezone); + /** + * Weekly/Monthly partitioned jobs/sources expect the fromDate to be less than toDate. + * Keeping the precision at day level for Weekly and Monthly partitioned watermarks. + * + * If partial partition is set to true, we don't floor the watermark for a given + * partition type. + * For daily partition type, 2019-01-01T12:31:00 will be rounded to 2019-01-01T00:00:00, + * if partial partition is false. + */ + if (input.equals("-")) { + if (WorkUnitPartitionTypes.isMultiDayPartitioned(workUnitPartitionType)) { + return DateTime.now().withZone(timeZone).dayOfMonth().roundFloorCopy(); + } + if (this.getIsPartialPartition()) { + return DateTime.now().withZone(timeZone); + } + return DateTime.now().withZone(timeZone).dayOfMonth().roundFloorCopy(); + } else if (input.matches("P\\d+D(T\\d+H){0,1}")) { + /* + The standard ISO format - PyYmMwWdDThHmMsS + Only supporting DAY and HOUR. DAY component is mandatory. + e.g.P1D, P2DT5H, P0DT7H + */ + Period period = Period.parse(input); + DateTime dt = DateTime.now().withZone(timeZone).minus(period); + if (WorkUnitPartitionTypes.isMultiDayPartitioned(workUnitPartitionType)) { + return dt.dayOfMonth().roundFloorCopy(); + } + if (this.getIsPartialPartition()) { + return dt; + } + return dt.dayOfMonth().roundFloorCopy(); + } + + return DateTimeUtils.parse(input, timezone.isEmpty() ? 
DEFAULT_TIMEZONE : timezone); + } + + private Long getMillis(String input) { + return getDateTime(input).getMillis(); + } + + public ImmutablePair getRangeInDateTime() { + return new ImmutablePair<>(getDateTime(range.getKey()), getDateTime(range.getValue())); + } + + public ImmutablePair getRangeInMillis() { + return new ImmutablePair<>(getMillis(range.getKey()), getMillis(range.getValue())); + } + + /** + * get a list of work units, with each coded as a name : value pair. + * + * The internal storage of work units string should be a JsonArray string + * @return list of work units + */ + public List getUnits() { + List unitList = Lists.newArrayList(); + JsonArray unitArray = GSON.fromJson(units, JsonArray.class); + for (JsonElement unit: unitArray) { + unitList.add(unit.toString()); + } + return unitList; + } + + public String getLongName() { + return "watermark." + name; + } +} diff --git a/dil/src/main/java/com/linkedin/dil/util/WorkUnitPartitionTypes.java b/dil/src/main/java/com/linkedin/dil/util/WorkUnitPartitionTypes.java new file mode 100644 index 0000000..5ef8641 --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/WorkUnitPartitionTypes.java @@ -0,0 +1,244 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import com.google.common.collect.Lists; +import com.google.gson.JsonObject; +import java.util.Comparator; +import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import com.linkedin.dil.configuration.MultistageProperties; +import org.joda.time.DateTime; + +import static com.linkedin.dil.configuration.StaticConstants.*; + + +/** + * A work unit that takes a range of days can be sub-divided into partitions + *
+ * hourly partition will generate a work unit for each hour + *
+ * daily partition will generate a work unit for each day + *
+ * weekly partition will generate a work unit for each 7 days + *
+ * monthly partition will generate a work unit for each 1 month + *
+ * composite partitioning will generate a series of work units for each sub-type + *
+ * The last partition can be partial if allow partial flag is true. + * + * TODO: when other types of watermarks are supported, the sub-partition might need + * support hash partition or even space partitions + * + * @author chrli + * + */ +@Slf4j +public enum WorkUnitPartitionTypes { + NONE("none", 0) { + @Override + protected DateTime getNext(DateTime start, DateTime end) { + return end; + } + }, + HOURLY("hourly", 1) { + @Override + protected DateTime getNext(DateTime start, DateTime end) { + return start.plusHours(interval); + } + }, + DAILY("daily", 1) { + @Override + protected DateTime getNext(DateTime start, DateTime end) { + return start.plusDays(interval); + } + }, + WEEKLY("weekly", 7, true) { + @Override + protected DateTime getNext(DateTime start, DateTime end) { + return start.plusDays(interval); + } + }, + MONTHLY("monthly", 1, true) { + @Override + protected DateTime getNext(DateTime start, DateTime end) { + return start.plusMonths(interval); + } + }, + YEARLY("yearly", 1, true) { + @Override + protected DateTime getNext(DateTime start, DateTime end) { + return start.plusYears(interval); + } + }, + COMPOSITE("composite", 0, true) { + @Override + protected DateTime getNext(DateTime start, DateTime end) { + throw new RuntimeException("Composite should never call itself!"); + } + + /** + * Refer to the specifications and constraints for "ms.work.unit.partition" in MultistageProperties. + * The ranges should be continuous with no gaps or overlaps. + * + */ + @Override + protected Pair getNext(DateTime start, DateTime end, boolean allowPartial) { + // Start from the lowest range that has the smallest start date time + // Get the first range that matches - it would also satisfy the partial partitioning config. + Pair nextDateTime; + for (Pair> subRange: subRanges) { + if (subRange.getLeft().getMillis() <= end.getMillis() + && subRange.getRight().getLeft().getMillis() > start.getMillis()) { + WorkUnitPartitionTypes subRangeWorkUnitPartitionType = subRange.getRight().getValue(); + /* + Handling a case where the previous subrange's last partial partition was ignored because + partial partitioning is false. + */ + DateTime startToUse = subRange.getLeft().getMillis() > start.getMillis() ? 
subRange.getLeft() : start; + nextDateTime = subRangeWorkUnitPartitionType.getNext(startToUse, subRange.getRight().getLeft(), allowPartial); + if (nextDateTime != null) { + return nextDateTime; + } + } + } + return null; + } + }; + + final private String partitionType; + final protected Integer interval; + final private Boolean isMultiDayPartitioned; + final protected SortedSet>> subRanges; + + WorkUnitPartitionTypes(String partitionType, Integer interval) { + this(partitionType, interval, false); + } + + WorkUnitPartitionTypes(String partitionType, Integer interval, Boolean isMultiDayPartitioned) { + this(partitionType, interval, isMultiDayPartitioned, new TreeSet<>(Comparator.comparing(Pair::getLeft))); + } + + WorkUnitPartitionTypes(String partitionType, Integer interval, Boolean isMultiDayPartitioned, + SortedSet>> subRanges) { + this.partitionType = partitionType; + this.interval = interval; + this.isMultiDayPartitioned = isMultiDayPartitioned; + this.subRanges = subRanges; + } + + /** + * + * Static method to parse a string and return the partition type + * @param partitionType specified partition types or a JsonObject + * @return specified partition types or COMPOSITE + * + */ + public static WorkUnitPartitionTypes fromString(String partitionType) { + for (WorkUnitPartitionTypes workUnitPartitionType : WorkUnitPartitionTypes.values()) { + if (workUnitPartitionType.partitionType.equalsIgnoreCase(partitionType)) { + return workUnitPartitionType; + } + } + try { + JsonObject jsonObject = GSON.fromJson(partitionType, JsonObject.class); + if (jsonObject.entrySet().size() > 0) { + return WorkUnitPartitionTypes.COMPOSITE; + } + } catch (Exception e) { + log.error("Error parsing the partition type string, please check job property: " + + MultistageProperties.MSTAGE_WORK_UNIT_PARTITION.toString(), e); + } + return null; + } + + public static boolean isMultiDayPartitioned(WorkUnitPartitionTypes workUnitPartitionType) { + return (workUnitPartitionType != null && workUnitPartitionType.isMultiDayPartitioned); + } + + @Override + public String toString() { + return partitionType; + } + + protected abstract DateTime getNext(DateTime start, DateTime end); + + protected Pair getNext(DateTime start, DateTime end, boolean allowPartial) { + DateTime next = getNext(start, end); + if (next.getMillis() <= end.getMillis()) { + return Pair.of(start, next); + } else if (allowPartial) { + return Pair.of(start, end); + } else { + return null; + } + } + + public List> getRanges(Pair range) { + return getRanges(range, false); + } + + public List> getRanges(Pair range, boolean allowPartial) { + return getRanges(range.getLeft(), range.getRight(), allowPartial); + } + + public List> getRanges(DateTime start, DateTime end) { + return getRanges(start, end, false); + } + + /** + * Convert DateTime range to a list of milli-second ranges. 
+ * + * @param start start date time with time zone enclosed + * @param end end date time with time zone enclosed + * @param allowPartial whether the last partition can be partial + * @return a list of milli-second ranges + */ + public List> getRanges(DateTime start, DateTime end, boolean allowPartial) { + List> list = Lists.newArrayList(); + DateTime tempStart = start; + while (tempStart.getMillis() < end.getMillis()) { + Pair nextDateTimeRange = getNext(tempStart, end, allowPartial); + if (nextDateTimeRange != null) { + list.add(new ImmutablePair<>(nextDateTimeRange.getLeft().getMillis(), nextDateTimeRange.getRight().getMillis())); + /* + Composite partitioning could choose next subrange's start date + => because previous subrange's last partial partition was ignored + => because partial partitioning is false. + */ + tempStart = nextDateTimeRange.getRight(); + } else { + tempStart = end; + } + } + return list; + } + + /** + * Add a sub range and its partition method + * @param start the start date time of the range + * @param end the end date time of the range + * @param partitionTypes the partition type + * @return the object itself so that operation can be chained if needed + */ + public WorkUnitPartitionTypes addSubRange(DateTime start, DateTime end, WorkUnitPartitionTypes partitionTypes) { + this.subRanges.add(ImmutablePair.of(start, ImmutablePair.of(end, partitionTypes))); + return this; + } + + /** + * Clear the sub ranges + * @return the object itself + */ + public WorkUnitPartitionTypes resetSubRange() { + this.subRanges.clear(); + return this; + } +} \ No newline at end of file diff --git a/dil/src/main/java/com/linkedin/dil/util/WorkUnitStatus.java b/dil/src/main/java/com/linkedin/dil/util/WorkUnitStatus.java new file mode 100644 index 0000000..fb21dfd --- /dev/null +++ b/dil/src/main/java/com/linkedin/dil/util/WorkUnitStatus.java @@ -0,0 +1,225 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; +import org.apache.commons.lang.StringUtils; +import org.slf4j.Logger; + + +/** + * Define a structure for data interchange between a Source and a Extractor. 
+ * + * @author chrli + */ +public class WorkUnitStatus { + private static final Logger log = org.slf4j.LoggerFactory.getLogger(WorkUnitStatus.class); + private long totalCount; + private long setCount; + private long pageNumber = 0; + private long pageStart = 0; + private long pageSize = 0; + private InputStream buffer; + + private Map messages; + private String sessionKey; + + WorkUnitStatus(long totalCount, long setCount, long pageNumber, long pageStart, long pageSize, InputStream buffer, + Map messages, String sessionKey) { + this.totalCount = totalCount; + this.setCount = setCount; + this.pageNumber = pageNumber; + this.pageStart = pageStart; + this.pageSize = pageSize; + this.buffer = buffer; + this.messages = messages; + this.sessionKey = sessionKey; + } + + public static WorkUnitStatusBuilder builder() { + return new WorkUnitStatusBuilder(); + } + + /** + * retrieve source schema if provided + * + * @return source schema if provided + */ + public JsonArray getSchema() { + if (messages != null && messages.containsKey("schema")) { + try { + return new Gson().fromJson(messages.get("schema"), JsonArray.class); + } catch (Exception e) { + log.warn("Error reading source schema", e); + } + } + return new JsonArray(); + } + + public void logDebugAll() { + log.debug("These are values in WorkUnitStatus"); + log.debug("Total count: {}", totalCount); + log.debug("Chunk count: {}", setCount); + log.debug("Pagination: {},{},{}", pageStart, pageSize, pageNumber); + log.debug("Session Status: {}", sessionKey); + } + + public long getTotalCount() { + return this.totalCount; + } + + public long getSetCount() { + return this.setCount; + } + + public long getPageNumber() { + return this.pageNumber; + } + + public long getPageStart() { + return this.pageStart; + } + + public long getPageSize() { + return this.pageSize; + } + + public InputStream getBuffer() { + return this.buffer; + } + + public Map getMessages() { + return this.messages == null ? new HashMap<>() : messages; + } + + public String getSessionKey() { + return this.sessionKey == null ? 
StringUtils.EMPTY : sessionKey; + } + + public WorkUnitStatus setTotalCount(long totalCount) { + this.totalCount = totalCount; + return this; + } + + public WorkUnitStatus setSetCount(long setCount) { + this.setCount = setCount; + return this; + } + + public WorkUnitStatus setPageNumber(long pageNumber) { + this.pageNumber = pageNumber; + return this; + } + + public WorkUnitStatus setPageStart(long pageStart) { + this.pageStart = pageStart; + return this; + } + + public WorkUnitStatus setPageSize(long pageSize) { + this.pageSize = pageSize; + return this; + } + + public WorkUnitStatus setBuffer(InputStream buffer) { + this.buffer = buffer; + return this; + } + + public WorkUnitStatus setMessages(Map messages) { + this.messages = messages; + return this; + } + + public WorkUnitStatus setSessionKey(String sessionKey) { + this.sessionKey = sessionKey; + return this; + } + + public String toString() { + return "WorkUnitStatus(totalCount=" + this.getTotalCount() + ", setCount=" + this.getSetCount() + ", pageNumber=" + + this.getPageNumber() + ", pageStart=" + this.getPageStart() + ", pageSize=" + this.getPageSize() + ", buffer=" + + this.getBuffer() + ", messages=" + this.getMessages() + ", sessionKey=" + this.getSessionKey() + ")"; + } + + public WorkUnitStatusBuilder toBuilder() { + return new WorkUnitStatusBuilder().totalCount(this.totalCount) + .setCount(this.setCount) + .pageNumber(this.pageNumber) + .pageStart(this.pageStart) + .pageSize(this.pageSize) + .buffer(this.buffer) + .messages(this.messages) + .sessionKey(this.sessionKey); + } + + public static class WorkUnitStatusBuilder { + private long totalCount; + private long setCount; + private long pageNumber; + private long pageStart; + private long pageSize; + private InputStream buffer; + private Map messages; + private String sessionKey; + + WorkUnitStatusBuilder() { + } + + public WorkUnitStatus.WorkUnitStatusBuilder totalCount(long totalCount) { + this.totalCount = totalCount; + return this; + } + + public WorkUnitStatus.WorkUnitStatusBuilder setCount(long setCount) { + this.setCount = setCount; + return this; + } + + public WorkUnitStatus.WorkUnitStatusBuilder pageNumber(long pageNumber) { + this.pageNumber = pageNumber; + return this; + } + + public WorkUnitStatus.WorkUnitStatusBuilder pageStart(long pageStart) { + this.pageStart = pageStart; + return this; + } + + public WorkUnitStatus.WorkUnitStatusBuilder pageSize(long pageSize) { + this.pageSize = pageSize; + return this; + } + + public WorkUnitStatus.WorkUnitStatusBuilder buffer(InputStream buffer) { + this.buffer = buffer; + return this; + } + + public WorkUnitStatus.WorkUnitStatusBuilder messages(Map messages) { + this.messages = messages; + return this; + } + + public WorkUnitStatus.WorkUnitStatusBuilder sessionKey(String sessionKey) { + this.sessionKey = sessionKey; + return this; + } + + public WorkUnitStatus build() { + return new WorkUnitStatus(totalCount, setCount, pageNumber, pageStart, pageSize, buffer, messages, sessionKey); + } + + public String toString() { + return "WorkUnitStatus.WorkUnitStatusBuilder(totalCount=" + this.totalCount + ", setCount=" + this.setCount + + ", pageNumber=" + this.pageNumber + ", pageStart=" + this.pageStart + ", pageSize=" + this.pageSize + + ", buffer=" + this.buffer + ", messages=" + this.messages + ", sessionKey=" + this.sessionKey + ")"; + } + } +} diff --git a/dil/src/test/java/com/linkedin/dil/configuration/MultistagePropertiesTest.java b/dil/src/test/java/com/linkedin/dil/configuration/MultistagePropertiesTest.java new file 
mode 100644 index 0000000..27268c7 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/configuration/MultistagePropertiesTest.java @@ -0,0 +1,283 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.configuration; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import org.apache.commons.lang.StringUtils; +import org.apache.gobblin.configuration.SourceState; +import org.apache.gobblin.configuration.State; +import org.mockito.Mockito; +import org.testng.Assert; +import org.testng.annotations.Test; + +import static org.mockito.Mockito.*; + + +@Test +public class MultistagePropertiesTest { + private final Gson gson = new Gson(); + + @Test + void validateNonblankWithDefault() { + SourceState state = new SourceState(); + Assert.assertEquals(MultistageProperties.MSTAGE_PARAMETERS.getValidNonblankWithDefault(state), new JsonArray()); + Assert.assertEquals(MultistageProperties.MSTAGE_DATA_FIELD.getValidNonblankWithDefault(state), ""); + Assert.assertEquals(MultistageProperties.MSTAGE_ABSTINENT_PERIOD_DAYS.getValidNonblankWithDefault(state), new Integer(0)); + Assert.assertEquals(MultistageProperties.MSTAGE_AUTHENTICATION.getValidNonblankWithDefault(state), new JsonObject()); + Assert.assertEquals(MultistageProperties.MSTAGE_HTTP_STATUSES.getValidNonblankWithDefault(state).toString(), + "{\"success\":[200,201,202],\"pagination_error\":[401]}"); + Assert.assertEquals(MultistageProperties.MSTAGE_PAGINATION.getValidNonblankWithDefault(state), new JsonObject()); + Assert.assertFalse(MultistageProperties.MSTAGE_PAGINATION.validateNonblank(state)); + state.setProp(MultistageProperties.MSTAGE_PAGINATION.getConfig(), "[]"); + Assert.assertFalse(MultistageProperties.MSTAGE_PAGINATION.validateNonblank(state)); + state.setProp(MultistageProperties.MSTAGE_PAGINATION.getConfig(), "{}"); + Assert.assertFalse(MultistageProperties.MSTAGE_PAGINATION.validateNonblank(state)); + state.setProp(MultistageProperties.MSTAGE_PAGINATION.getConfig(), "{null}}"); + Assert.assertFalse(MultistageProperties.MSTAGE_PAGINATION.validateNonblank(state)); + } + + /** + * Test ms.wait.timeout.seconds under 2 scenarios + * Scenario 1: test default value + * Scenario 2: test user defined value + */ + @Test + void validateWaitTimeoutProperty() { + SourceState state = new SourceState(); + + // Scenario 1: test default value + // + // Input: State object without setting ms.wait.time.seconds + // Output: 600 seconds, or 10 minutes, or 600,000 milli-seconds + + Assert.assertEquals(MultistageProperties.MSTAGE_WAIT_TIMEOUT_SECONDS.getMillis(state).longValue(), 600000L); + + // Scenario 2: test user defined value + // + // Input: State object by setting ms.wait.time.seconds = 1000 + // Output: 1000 seconds, or 1,000,000 milli-seconds + state.setProp(MultistageProperties.MSTAGE_WAIT_TIMEOUT_SECONDS.toString(), 1000); + Assert.assertEquals(MultistageProperties.MSTAGE_WAIT_TIMEOUT_SECONDS.getMillis(state).longValue(), 1000000L); + } + + /** + * Test getDefaultValue for MSTAGE_RETENTION + */ + @Test + public void testGetDefaultValue1() { + JsonObject expected = gson.fromJson("{\"state.store\":\"P90D\",\"publish.dir\":\"P731D\",\"log\":\"P30D\"}", JsonObject.class); + Assert.assertEquals(MultistageProperties.MSTAGE_RETENTION.getDefaultValue(), expected); + } + + /** + * Test getDefaultValue for MSTAGE_ENABLE_DYNAMIC_FULL_LOAD + */ + @Test + public 
void testGetDefaultValue2() { + Assert.assertEquals(MultistageProperties.MSTAGE_ENABLE_DYNAMIC_FULL_LOAD.getDefaultValue(), Boolean.TRUE); + } + + /** + * Test getDefaultValue for MSTAGE_ENABLE_SCHEMA_BASED_FILTERING + */ + @Test + public void testGetDefaultValue3() { + Assert.assertEquals(MultistageProperties.MSTAGE_ENABLE_SCHEMA_BASED_FILTERING.getDefaultValue(), Boolean.TRUE); + } + + /** + * Test getDefaultValue for MSTAGE_SOURCE_FILES_PATTERN + */ + @Test + public void testGetDefaultValue4() { + Assert.assertEquals(MultistageProperties.MSTAGE_SOURCE_FILES_PATTERN.getDefaultValue(), ".*"); + } + + /** + * Test getDefaultValue for EXTRACT_IS_FULL + */ + @Test + public void testGetDefaultValue5() { + Assert.assertEquals(MultistageProperties.EXTRACT_IS_FULL.getDefaultValue(), (Boolean) false); + } + + /** + * Test getDefaultValue for MSTAGE_ENDCODING + */ + @Test + public void testGetDefaultValue6() { + Assert.assertEquals(MultistageProperties.MSTAGE_ENCODING.getDefaultValue(), "UTF-8"); + } + + /** + * Test getDefaultValue + */ + @Test + public void testGetDefaultValue7() { + Assert.assertEquals(MultistageProperties.MSTAGE_WORKUNIT_STARTTIME_KEY.getDefaultValue(), new Long(0L)); + } + + /** + * Test getValidNonblankWithDefault + */ + @Test + public void testGetValidNonblankWithDefault1() { + State state = Mockito.mock(State.class); + when(state.getPropAsInt(MultistageProperties.MSTAGE_ABSTINENT_PERIOD_DAYS.getConfig(), 0)).thenReturn(0); + Assert.assertEquals(MultistageProperties.MSTAGE_ABSTINENT_PERIOD_DAYS.getValidNonblankWithDefault(state), new Integer(0)); + + when(state.getPropAsInt(MultistageProperties.MSTAGE_ABSTINENT_PERIOD_DAYS.getConfig(), 0)).thenReturn(1); + Assert.assertEquals(MultistageProperties.MSTAGE_ABSTINENT_PERIOD_DAYS.getValidNonblankWithDefault(state), new Integer(1)); + } + + /** + * Test getValidNonblankWithDefault for MSTAGE_EXTRACTOR_TARGET_FILE_PERMISSION + */ + @Test + public void testGetValidNonblankWithDefault2() { + State state = Mockito.mock(State.class); + String expected = "input"; + when(state.getProp(MultistageProperties.MSTAGE_EXTRACTOR_TARGET_FILE_PERMISSION.getConfig(), StringUtils.EMPTY)).thenReturn(expected); + Assert.assertEquals(MultistageProperties.MSTAGE_EXTRACTOR_TARGET_FILE_PERMISSION.getValidNonblankWithDefault(state), expected.toUpperCase()); + + when(state.getProp(MultistageProperties.MSTAGE_EXTRACTOR_TARGET_FILE_PERMISSION.getConfig(), StringUtils.EMPTY)).thenReturn(""); + Assert.assertEquals(MultistageProperties.MSTAGE_EXTRACTOR_TARGET_FILE_PERMISSION.getValidNonblankWithDefault(state), "755"); + } + + /** + * Test getMillis for MSTAGE_GRACE_PERIOD_DAYS + */ + @Test + public void testGetMillis1() { + State state = new State(); + Long expected = 24L * 3600L * 1000L * (Integer) MultistageProperties.MSTAGE_GRACE_PERIOD_DAYS.getProp(state); + Assert.assertEquals(MultistageProperties.MSTAGE_GRACE_PERIOD_DAYS.getMillis(state), expected); + + Assert.assertEquals(MultistageProperties.MSTAGE_SOURCE_FILES_PATTERN.getMillis(state), (Long) 0L); + } + + /** + * Test getMillis for MSTAGE_ABSTINENT_PERIOD_DAYS + */ + @Test + public void testGetMillis2() { + State state = new State(); + Long expected = 24L * 3600L * 1000L * (Integer) MultistageProperties.MSTAGE_ABSTINENT_PERIOD_DAYS.getProp(new State()); + Assert.assertEquals(MultistageProperties.MSTAGE_ABSTINENT_PERIOD_DAYS.getMillis(state), expected); + } + + /** + * Test validate for MSTAGE_ACTIVATION_PROPERTY + */ + @Test + public void testValidate1() { + State state = Mockito.mock(State.class); + 
when(state.getProp(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.getConfig(), new JsonObject().toString())).thenReturn(""); + Assert.assertTrue(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.validate(state)); + + when(state.getProp(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.getConfig(), new JsonObject().toString())).thenReturn("{\"state.store\":\"P90D\"}"); + Assert.assertTrue(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.validate(state)); + } + + /** + * Test validate for MSTAGE_DERIVED_FIELDS + */ + @Test + public void testValidate2() { + State state = Mockito.mock(State.class); + when(state.getProp(MultistageProperties.MSTAGE_DERIVED_FIELDS.getConfig(), new JsonArray().toString())).thenReturn(""); + Assert.assertTrue(MultistageProperties.MSTAGE_DERIVED_FIELDS.validate(state)); + + when(state.getProp(MultistageProperties.MSTAGE_DERIVED_FIELDS.getConfig(), new JsonArray().toString())).thenReturn("[]"); + Assert.assertTrue(MultistageProperties.MSTAGE_DERIVED_FIELDS.validate(state)); + + when(state.getProp(MultistageProperties.MSTAGE_DERIVED_FIELDS.getConfig(), new JsonArray().toString())).thenReturn("[{\"random\":\"value\"}]"); + Assert.assertFalse(MultistageProperties.MSTAGE_DERIVED_FIELDS.validate(state)); + + when(state.getProp(MultistageProperties.MSTAGE_DERIVED_FIELDS.getConfig(), new JsonArray().toString())).thenReturn("[{\"name\":\"value\"}]"); + Assert.assertFalse(MultistageProperties.MSTAGE_DERIVED_FIELDS.validate(state)); + + when(state.getProp(MultistageProperties.MSTAGE_DERIVED_FIELDS.getConfig(), new JsonArray().toString())).thenReturn("[{\"name\":\"value\", \"formula\":\"formulaValue\"}]"); + Assert.assertTrue(MultistageProperties.MSTAGE_DERIVED_FIELDS.validate(state)); + } + + /** + * Test validate for MSTAGE_SECONDARY_INPUT + */ + @Test + public void testValidate3() { + State state = Mockito.mock(State.class); + when(state.getProp(MultistageProperties.MSTAGE_SECONDARY_INPUT.getConfig(), new JsonArray().toString())).thenReturn(null); + Assert.assertFalse(MultistageProperties.MSTAGE_SECONDARY_INPUT.validate(state)); + + when(state.getProp(MultistageProperties.MSTAGE_SECONDARY_INPUT.getConfig(), new JsonArray().toString())).thenReturn("[{\"name\":\"value\"}]"); + Assert.assertTrue(MultistageProperties.MSTAGE_SECONDARY_INPUT.validate(state)); + } + + /** + * Test validate for MSTAGE_SECONDARY_INPUT + */ + @Test + public void testValidate4() { + Assert.assertTrue(MultistageProperties.MSTAGE_ABSTINENT_PERIOD_DAYS.validate(new State())); + } + + /** + * Test validateNonblank for MSTAGE_AUTHENTICATION + */ + @Test + public void testValidateNonblank1() { + State state = Mockito.mock(State.class); + JsonObject obj = new JsonObject(); + when(state.getProp(MultistageProperties.MSTAGE_AUTHENTICATION.getConfig(), new JsonObject().toString())).thenReturn(obj.toString()); + Assert.assertFalse(MultistageProperties.MSTAGE_AUTHENTICATION.validateNonblank(state)); + + obj.addProperty("test", "testValue"); + when(state.getProp(MultistageProperties.MSTAGE_AUTHENTICATION.getConfig(), new JsonObject().toString())).thenReturn(obj.toString()); + Assert.assertFalse(MultistageProperties.MSTAGE_AUTHENTICATION.validateNonblank(state)); + + obj.addProperty("method", "testMethodValue"); + when(state.getProp(MultistageProperties.MSTAGE_AUTHENTICATION.getConfig(), new JsonObject().toString())).thenReturn(obj.toString()); + Assert.assertFalse(MultistageProperties.MSTAGE_AUTHENTICATION.validateNonblank(state)); + + obj.addProperty("encryption", "testEncryptionValue"); + 
when(state.getProp(MultistageProperties.MSTAGE_AUTHENTICATION.getConfig(), new JsonObject().toString())).thenReturn(obj.toString()); + Assert.assertTrue(MultistageProperties.MSTAGE_AUTHENTICATION.validateNonblank(state)); + } + + /** + * Test validateNonblank for MSTAGE_CSV_COLUMN_PROJECTION + */ + @Test + public void testValidateNonblank2() { + State state = Mockito.mock(State.class); + when(state.getProp(MultistageProperties.MSTAGE_CSV_COLUMN_PROJECTION.getConfig(), StringUtils.EMPTY)).thenReturn(null); + Assert.assertFalse(MultistageProperties.MSTAGE_CSV_COLUMN_PROJECTION.validateNonblank(state)); + + when(state.getProp(MultistageProperties.MSTAGE_CSV_COLUMN_PROJECTION.getConfig(), StringUtils.EMPTY)).thenReturn("test"); + Assert.assertTrue(MultistageProperties.MSTAGE_CSV_COLUMN_PROJECTION.validateNonblank(state)); + + when(state.getProp(MultistageProperties.MSTAGE_CSV_COLUMN_PROJECTION.getConfig(), StringUtils.EMPTY)).thenReturn("test1,test2"); + Assert.assertTrue(MultistageProperties.MSTAGE_CSV_COLUMN_PROJECTION.validateNonblank(state)); + } + + /** + * Test validateNonblank + */ + @Test + public void testValidateNonblank3() { + State state = Mockito.mock(State.class); + when(state.getProp(MultistageProperties.MSTAGE_BACKFILL.getConfig(), StringUtils.EMPTY)).thenReturn("non-validate"); + Assert.assertFalse(MultistageProperties.MSTAGE_BACKFILL.validateNonblank(state)); + + when(state.getProp(MultistageProperties.MSTAGE_BACKFILL.getConfig(), StringUtils.EMPTY)).thenReturn("false"); + Assert.assertTrue(MultistageProperties.MSTAGE_BACKFILL.validateNonblank(state)); + + when(state.getProp(MultistageProperties.MSTAGE_BACKFILL.getConfig(), StringUtils.EMPTY)).thenReturn("true"); + Assert.assertTrue(MultistageProperties.MSTAGE_BACKFILL.validateNonblank(state)); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/connection/HdfsReadConnectionTest.java b/dil/src/test/java/com/linkedin/dil/connection/HdfsReadConnectionTest.java new file mode 100644 index 0000000..5c707cd --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/connection/HdfsReadConnectionTest.java @@ -0,0 +1,101 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
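+// Unit tests for HdfsConnection; a mocked HadoopFsHelper stands in for a real HDFS file system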
+ +package com.linkedin.dil.connection; + +import java.util.ArrayList; +import java.util.List; +import org.apache.gobblin.configuration.SourceState; +import org.apache.gobblin.configuration.State; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.keys.ExtractorKeys; +import com.linkedin.dil.source.HdfsSource; +import com.linkedin.dil.util.WorkUnitStatus; +import org.apache.gobblin.source.extractor.filebased.FileBasedHelperException; +import org.apache.gobblin.source.extractor.filebased.TimestampAwareFileBasedHelper; +import org.apache.gobblin.source.extractor.hadoop.HadoopFsHelper; +import org.powermock.api.mockito.PowerMockito; +import org.powermock.core.classloader.annotations.PrepareForTest; +import org.powermock.modules.testng.PowerMockTestCase; +import org.testng.Assert; +import org.testng.annotations.Test; + +import static org.mockito.Matchers.*; +import static org.powermock.api.mockito.PowerMockito.*; + + +@Test +@PrepareForTest({HadoopFsHelper.class, TimestampAwareFileBasedHelper.class}) +public class HdfsReadConnectionTest extends PowerMockTestCase { + @Test + public void testGetFileList() throws Exception { + PowerMockito.mockStatic(HadoopFsHelper.class); + HadoopFsHelper fsHelper = PowerMockito.mock(HadoopFsHelper.class); + + HdfsSource hdfsSource = new HdfsSource(); + SourceState sourceState = new SourceState(); + sourceState.setProp("ms.extractor.class", "com.linkedin.dil.extractor.CsvExtractor"); + sourceState.setProp("ms.source.uri", "/data/test?RE=.*"); + WorkUnitState state = new WorkUnitState(hdfsSource.getWorkunits(sourceState).get(0), new State()); + + state.setProp("ms.extractor.class", "com.linkedin.dil.extractor.CsvExtractor"); + state.setProp("ms.source.uri", "/data/test?RE=.*"); + hdfsSource.getExtractor(state); + + HdfsConnection conn = new HdfsConnection(state, hdfsSource.getHdfsKeys(), new ExtractorKeys()); + + // getHdfsClient would fail as there is no real HDFS + Assert.assertNull(conn.getHdfsClient()); + + // use the mocked helper instead + conn.setFsHelper(fsHelper); + + doNothing().when(fsHelper).close(); + doNothing().when(fsHelper).connect(); + when(fsHelper.ls(any())).thenReturn(new ArrayList<>()); + when(fsHelper.getFileStream(any())).thenReturn(null); + + Assert.assertNull(conn.executeFirst(WorkUnitStatus.builder().build()).getBuffer()); + + when(fsHelper.ls(any())).thenThrow(new FileBasedHelperException("error")); + Assert.assertNull(conn.executeFirst(WorkUnitStatus.builder().build()).getBuffer()); + + conn.closeAll(""); + } + + @Test + public void testGetFileInputStream() throws Exception { + PowerMockito.mockStatic(HadoopFsHelper.class); + HadoopFsHelper fsHelper = PowerMockito.mock(HadoopFsHelper.class); + + HdfsSource hdfsSource = new HdfsSource(); + SourceState sourceState = new SourceState(); + sourceState.setProp("ms.extractor.class", "com.linkedin.dil.extractor.CsvExtractor"); + sourceState.setProp("ms.source.uri", "/jobs/exttest/data/external/snapshots/test"); + WorkUnitState state = new WorkUnitState(hdfsSource.getWorkunits(sourceState).get(0), new State()); + + state.setProp("ms.extractor.class", "com.linkedin.dil.extractor.CsvExtractor"); + state.setProp("ms.source.uri", "/jobs/exttest/data/external/snapshots/test"); + hdfsSource.getExtractor(state); + + HdfsConnection conn = new HdfsConnection(state, hdfsSource.getHdfsKeys(), new ExtractorKeys()); + conn.setFsHelper(fsHelper); + + doNothing().when(fsHelper).close(); + doNothing().when(fsHelper).connect(); + + List<String> files = new ArrayList<>(); + files.add("dummy"); + + 
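+    // a non-empty listing forces the connection to attempt opening a file stream for the first file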
when(fsHelper.ls(any())).thenReturn(files); + when(fsHelper.getFileStream(any())).thenReturn(null); + + Assert.assertNull(conn.executeFirst(WorkUnitStatus.builder().build()).getBuffer()); + + when(fsHelper.getFileStream(any())).thenThrow(new FileBasedHelperException("error")); + Assert.assertNull(conn.executeFirst(WorkUnitStatus.builder().build()).getBuffer()); + + conn.closeAll(""); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/connection/HttpConnectionTest.java b/dil/src/test/java/com/linkedin/dil/connection/HttpConnectionTest.java new file mode 100644 index 0000000..6614d8c --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/connection/HttpConnectionTest.java @@ -0,0 +1,378 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.connection; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import gobblin.runtime.JobState; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.List; +import org.apache.commons.lang.StringUtils; +import org.apache.gobblin.configuration.SourceState; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.exception.RetriableAuthenticationException; +import com.linkedin.dil.extractor.JsonExtractor; +import com.linkedin.dil.extractor.MultistageExtractor; +import com.linkedin.dil.keys.ExtractorKeys; +import com.linkedin.dil.keys.HttpKeys; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.source.HttpSource; +import com.linkedin.dil.util.HttpRequestMethod; +import com.linkedin.dil.util.WorkUnitStatus; +import org.apache.gobblin.source.extractor.extract.LongWatermark; +import org.apache.gobblin.source.workunit.WorkUnit; +import org.apache.http.Header; +import org.apache.http.HeaderElement; +import org.apache.http.HttpEntity; +import org.apache.http.HttpResponse; +import org.apache.http.StatusLine; +import org.apache.http.client.HttpClient; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpUriRequest; +import org.apache.http.impl.client.AutoRetryHttpClient; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.util.EntityUtils; +import org.mockito.Mockito; +import org.powermock.api.mockito.PowerMockito; +import org.powermock.core.classloader.annotations.PrepareForTest; +import org.powermock.modules.testng.PowerMockTestCase; +import org.powermock.reflect.Whitebox; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static com.linkedin.dil.configuration.MultistageProperties.*; +import static org.mockito.Mockito.*; + + +@Test +@PrepareForTest({EntityUtils.class, CloseableHttpClient.class}) +public class HttpConnectionTest extends PowerMockTestCase { + private Gson gson; + private WorkUnitState state; + private JobKeys jobKeys; + private SourceState sourceState; + private String token; + private JsonObject pagination; + private JsonObject sessionKeyField; + private String totalCountField; + private JsonArray parameters; + private JsonArray encryptionFields; + private String dataField; + private Long callInterval; + private Long waitTimeoutSeconds; + private Boolean enableCleansing; + private Boolean workUnitPartialPartition; + private JsonArray watermark; + private JsonArray secondaryInput; + 
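+  // the fields above and below hold configuration values loaded from /json/sample-data-for-source.json in initializeHelper()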
private String httpClientFactory; + private JsonObject httpRequestHeaders; + private String sourceUri; + private String httpRequestMethod; + private String extractorClass; + private JsonObject authentication; + private JsonObject httpStatus; + private JsonObject httpStatusReasons; + + @BeforeMethod + public void setUp() { + gson = new Gson(); + state = Mockito.mock(WorkUnitState.class); + jobKeys = Mockito.mock(JobKeys.class); + sourceState = Mockito.mock(SourceState.class); + } + + /** + * Test Execute + * @throws IOException + */ + @Test(expectedExceptions = RetriableAuthenticationException.class) + public void testExecute() throws IOException, RetriableAuthenticationException { + initializeHelper(); + + // the source getExtractor() method will initialize source Keys + HttpSource source = new HttpSource(); + HttpConnection conn = new HttpConnection(state, source.getHttpSourceKeys(), + ((MultistageExtractor)source.getExtractor(state)).getExtractorKeys()); + + CloseableHttpClient client = mock(CloseableHttpClient.class); + CloseableHttpResponse response = mock(CloseableHttpResponse.class); + + conn.setHttpClient(client); + when(client.execute(any())).thenReturn(response); + + WorkUnit workUnit = mock(WorkUnit.class); + LongWatermark lowWatermark = mock(LongWatermark.class); + LongWatermark highWatermark = mock(LongWatermark.class); + + long lowWaterMark = 1590994800000L; //2020-06-01 + long highWaterMark = 1591513200000L; //2020-06-07 + when(workUnit.getLowWatermark(LongWatermark.class)).thenReturn(lowWatermark); + when(lowWatermark.getValue()).thenReturn(lowWaterMark); + when(workUnit.getExpectedHighWatermark(LongWatermark.class)).thenReturn(highWatermark); + when(highWatermark.getValue()).thenReturn(highWaterMark); + when(state.getWorkunit()).thenReturn(workUnit); + + HttpRequestMethod command = mock(HttpRequestMethod.class); + WorkUnitStatus status = mock(WorkUnitStatus.class); + + JsonObject parameters = new JsonObject(); + parameters.addProperty("param1", "dummy"); + parameters.add("payload", new JsonObject()); + + when(command.toString()).thenReturn("Some http method"); + conn.getExtractorKeys().setDynamicParameters(parameters); + + StatusLine statusLine = mock(StatusLine.class); + when(response.getStatusLine()).thenReturn(statusLine); + when(statusLine.getStatusCode()).thenReturn(200); + when(statusLine.getReasonPhrase()).thenReturn("reason1 for success"); + Assert.assertNotNull(conn.execute(command, status)); + + HttpEntity entity = mock(HttpEntity.class); + Header header = mock(Header.class); + when(response.getEntity()).thenReturn(entity); + when(entity.getContentType()).thenReturn(header); + + HeaderElement element = mock(HeaderElement.class); + when(header.getElements()).thenReturn(new HeaderElement[]{element}); + when(element.getName()).thenReturn("application/json"); + PowerMockito.mockStatic(EntityUtils.class); + when(EntityUtils.toString(entity)).thenReturn("dummy error reason"); + Assert.assertNotNull(conn.execute(command, status)); + + when(response.getEntity()).thenReturn(null); + + when(statusLine.getStatusCode()).thenReturn(204); + Assert.assertNotNull(conn.execute(command, status)); + + when(statusLine.getStatusCode()).thenReturn(302); + when(statusLine.getReasonPhrase()).thenReturn("reason1 for warning"); + Assert.assertNull(conn.execute(command, status)); + + when(statusLine.getStatusCode()).thenReturn(405); + Assert.assertNull(conn.execute(command, status)); + + when(statusLine.getReasonPhrase()).thenReturn("reason1 for error"); + 
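+    // error statuses and failures while reading the entity below should all translate into a null execution result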
Assert.assertNull(conn.execute(command, status)); + + when(statusLine.getStatusCode()).thenReturn(408); + Assert.assertNull(conn.execute(command, status)); + + when(response.getEntity()).thenReturn(entity); + doThrow(new RuntimeException()).when(entity).getContentType(); + Assert.assertNull(conn.execute(command, status)); + } + + + /** + * Test getNext + */ + @Test + public void testGetNext() throws RetriableAuthenticationException { + HttpKeys httpSourceKeys = Mockito.mock(HttpKeys.class); + when(httpSourceKeys.getCallInterval()).thenReturn(1L); + ExtractorKeys extractorKeys = new ExtractorKeys(); + WorkUnitStatus workUnitStatus = Mockito.mock(WorkUnitStatus.class); + WorkUnitStatus.WorkUnitStatusBuilder builder = Mockito.mock(WorkUnitStatus.WorkUnitStatusBuilder.class); + HttpConnection conn = new HttpConnection(null, httpSourceKeys, extractorKeys); + + extractorKeys.setSignature("testSignature"); + extractorKeys.setActivationParameters(new JsonObject()); + when(builder.build()).thenReturn(workUnitStatus); + when(workUnitStatus.toBuilder()).thenReturn(builder); + when(httpSourceKeys.getHttpRequestMethod()).thenReturn("GET"); + + Assert.assertNull(conn.executeNext(workUnitStatus)); + } + + /** + * Test closeStream + */ + @Test + public void testCloseStream() throws IOException { + HttpConnection conn = new HttpConnection(null, new HttpKeys(), new ExtractorKeys()); + MultistageExtractor extractor = mock(MultistageExtractor.class); + ExtractorKeys keys = mock(ExtractorKeys.class); + String testSignature = "test_signature"; + when(extractor.getExtractorKeys()).thenReturn(keys); + when(keys.getSignature()).thenReturn(testSignature); + conn.closeStream(); + + CloseableHttpResponse httpResponse = mock(CloseableHttpResponse.class); + conn.setResponse(httpResponse); + doThrow(new RuntimeException()).when(httpResponse).close(); + conn.closeStream(); + } + + /** + * Test shutdown + */ + @Test + public void testShutdown() throws IOException { + HttpConnection conn = new HttpConnection(null, new HttpKeys(), null); + CloseableHttpClient client = mock(CloseableHttpClient.class); + conn.setHttpClient(client); + + doNothing().when(client).close(); + conn.closeAll(""); + + client = mock(CloseableHttpClient.class); + conn.setHttpClient(client); + doThrow(IOException.class).when(client).close(); + conn.closeAll(""); + + client = mock(CloseableHttpClient.class); + conn.setHttpClient(client); + AutoRetryHttpClient retryHttpClient = mock(AutoRetryHttpClient.class); + conn.setHttpClient(retryHttpClient); + conn.closeAll(""); + } + + @Test(enabled=true) + public void retriesTest() throws IOException { + + HttpClient mockHttpClient = mock(CloseableHttpClient.class); + HttpResponse httpResponse = mock(CloseableHttpResponse.class); + StatusLine statusLine = mock(StatusLine.class); + HttpEntity entity = mock(HttpEntity.class); + SourceState state = mock(SourceState.class); + + when(entity.getContent()).thenReturn(null); + when(httpResponse.getEntity()).thenReturn(entity); + when(statusLine.getStatusCode()).thenReturn(401); + when(statusLine.getReasonPhrase()).thenReturn("pagination error"); + when(httpResponse.getStatusLine()).thenReturn(statusLine); + when(mockHttpClient.execute(any(HttpUriRequest.class))).thenReturn(httpResponse); + + when(state.getProp("ms.watermark", "")).thenReturn("[{\"name\": \"system\",\"type\": \"datetime\", \"range\": {\"from\": \"2017-01-01\", \"to\": \"-\"}}]"); + when(state.getProp("extract.table.type", "SNAPSHOT_ONLY")).thenReturn("SNAPSHOT_ONLY"); + 
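+    // stub the remaining job properties needed for HttpSource to build work units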
when(state.getProp("extract.namespace", "")).thenReturn("test"); + when(state.getProp("extract.table.name", "")).thenReturn("table1"); + when(state.getProp("source.conn.username", "")).thenReturn("X7CWBD5V4T6DR77WY23YSHACH55K2OXA"); + when(state.getProp("source.conn.password", "")).thenReturn(""); + when(state.getProp("ms.source.uri", "")).thenReturn("https://host/v2/users"); + when(state.getProp("ms.authentication", new JsonObject().toString())).thenReturn("{\"method\":\"basic\",\"encryption\":\"base64\", \"header\": \"Authorization\"}"); + when(state.getProp("ms.http.request.headers", new JsonObject().toString())).thenReturn("{\"Content-Type\": \"application/json\"}"); + when(state.getProp("ms.http.request.method", "")).thenReturn("GET"); + when(state.getProp("ms.session.key.field", new JsonObject().toString())).thenReturn("{\"name\": \"records.cursor\"}"); + when(state.getProp("ms.parameters", new JsonArray().toString())).thenReturn("[{\"name\":\"cursor\",\"type\":\"session\"}]"); + when(state.getProp("ms.data.field", "")).thenReturn("users"); + when(state.getProp("ms.total.count.field", "")).thenReturn("records.totalRecords"); + when(state.getProp("ms.work.unit.partition", "")).thenReturn(""); + when(state.getProp("ms.pagination", new JsonObject().toString())).thenReturn("{}"); + when(state.getProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), "")).thenReturn(""); + HttpSource httpSource = new HttpSource(); + List workUnits = httpSource.getWorkunits(state); + WorkUnitState unitState = new WorkUnitState(workUnits.get(0), new JobState()); + + HttpConnection conn = new HttpConnection(null, httpSource.getJobKeys(), new ExtractorKeys()); + conn.setHttpClient(mockHttpClient); + JsonExtractor extractor = new JsonExtractor(unitState, httpSource.getHttpSourceKeys()); + extractor.setConnection(conn); + + JsonObject record = extractor.readRecord(new JsonObject()); + // since we are setting the buffer to null, the final record object will be null + Assert.assertEquals(null, record); + } + + /** + * Test getResponseContentType + */ + @Test + public void testGetResponseContentType() throws Exception { + HttpConnection conn = new HttpConnection(null, new HttpKeys(), null); + HttpResponse response = mock(HttpResponse.class); + String methodName = "getResponseContentType"; + when(response.getEntity()).thenReturn(null); + Assert.assertEquals(Whitebox.invokeMethod(conn, methodName, response), StringUtils.EMPTY); + + HttpEntity entity = mock(HttpEntity.class); + when(response.getEntity()).thenReturn(entity); + when(entity.getContentType()).thenReturn(null); + Assert.assertEquals(Whitebox.invokeMethod(conn, methodName, response), StringUtils.EMPTY); + + Header contentType = mock(Header.class); + when(entity.getContentType()).thenReturn(contentType); + + HeaderElement[] headerElements = new HeaderElement[]{}; + when(contentType.getElements()).thenReturn(headerElements); + Assert.assertEquals(Whitebox.invokeMethod(conn, methodName, response), StringUtils.EMPTY); + + String type = "some_type"; + HeaderElement element = mock(HeaderElement.class); + when(element.getName()).thenReturn(type); + headerElements = new HeaderElement[]{element}; + when(contentType.getElements()).thenReturn(headerElements); + Assert.assertEquals(Whitebox.invokeMethod(conn, methodName, response), type); + } + + private void initializeHelper() { + JsonObject allKeys = gson.fromJson(new InputStreamReader(this.getClass().getResourceAsStream("/json/sample-data-for-source.json")), JsonObject.class); + pagination = 
allKeys.get(MSTAGE_PAGINATION.getConfig()).getAsJsonObject(); + when(state.getProp(MSTAGE_PAGINATION.getConfig(), new JsonObject().toString())).thenReturn(pagination.toString()); + + sessionKeyField = allKeys.get(MSTAGE_SESSION_KEY_FIELD.getConfig()).getAsJsonObject(); + when(state.getProp(MSTAGE_SESSION_KEY_FIELD.getConfig(), new JsonObject().toString())).thenReturn(sessionKeyField.toString()); + + totalCountField = allKeys.get(MSTAGE_TOTAL_COUNT_FIELD.getConfig()).getAsString(); + when(state.getProp(MSTAGE_TOTAL_COUNT_FIELD.getConfig(), StringUtils.EMPTY)).thenReturn(totalCountField); + + parameters = allKeys.get(MSTAGE_PARAMETERS.getConfig()).getAsJsonArray(); + when(state.getProp(MSTAGE_PARAMETERS.getConfig(), new JsonArray().toString())).thenReturn(parameters.toString()); + + encryptionFields = allKeys.get(MSTAGE_ENCRYPTION_FIELDS.getConfig()).getAsJsonArray(); + when(state.getProp(MSTAGE_ENCRYPTION_FIELDS.getConfig(), new JsonArray().toString())).thenReturn(encryptionFields.toString()); + + dataField = allKeys.get(MSTAGE_DATA_FIELD.getConfig()).getAsString(); + when(state.getProp(MSTAGE_DATA_FIELD.getConfig(), StringUtils.EMPTY)).thenReturn(dataField); + + callInterval = allKeys.get(MSTAGE_CALL_INTERVAL.getConfig()).getAsLong(); + when(state.getPropAsLong(MSTAGE_CALL_INTERVAL.getConfig(), 0L)).thenReturn(callInterval); + + waitTimeoutSeconds = allKeys.get(MSTAGE_WAIT_TIMEOUT_SECONDS.getConfig()).getAsLong(); + when(state.getPropAsLong(MSTAGE_WAIT_TIMEOUT_SECONDS.getConfig(), 0L)).thenReturn(waitTimeoutSeconds); + + enableCleansing = allKeys.get(MSTAGE_ENABLE_CLEANSING.getConfig()).getAsBoolean(); + when(state.getPropAsBoolean(MSTAGE_ENABLE_CLEANSING.getConfig())).thenReturn(enableCleansing); + + workUnitPartialPartition = allKeys.get(MSTAGE_WORK_UNIT_PARTIAL_PARTITION.getConfig()).getAsBoolean(); + when(state.getPropAsBoolean(MSTAGE_WORK_UNIT_PARTIAL_PARTITION.getConfig())).thenReturn(workUnitPartialPartition); + + watermark = allKeys.get(MSTAGE_WATERMARK.getConfig()).getAsJsonArray(); + when(state.getProp(MSTAGE_WATERMARK.getConfig(), new JsonArray().toString())).thenReturn(watermark.toString()); + + secondaryInput = allKeys.get(MSTAGE_SECONDARY_INPUT.getConfig()).getAsJsonArray(); + when(state.getProp(MSTAGE_SECONDARY_INPUT.getConfig(), new JsonArray().toString())).thenReturn(secondaryInput.toString()); + + httpClientFactory = allKeys.get(MSTAGE_HTTP_CLIENT_FACTORY.getConfig()).getAsString(); + when(state.getProp(MSTAGE_HTTP_CLIENT_FACTORY.getConfig(), StringUtils.EMPTY)).thenReturn(httpClientFactory); + + httpRequestHeaders = allKeys.get(MSTAGE_HTTP_REQUEST_HEADERS.getConfig()).getAsJsonObject(); + when(state.getProp(MSTAGE_HTTP_REQUEST_HEADERS.getConfig(), new JsonObject().toString())).thenReturn(httpRequestHeaders.toString()); + + sourceUri = allKeys.get(MSTAGE_SOURCE_URI.getConfig()).getAsString(); + when(state.getProp(MSTAGE_SOURCE_URI.getConfig(), StringUtils.EMPTY)).thenReturn(sourceUri); + + httpRequestMethod = allKeys.get(MSTAGE_HTTP_REQUEST_METHOD.getConfig()).getAsString(); + when(state.getProp(MSTAGE_HTTP_REQUEST_METHOD.getConfig(), StringUtils.EMPTY)).thenReturn(httpRequestMethod); + + extractorClass = allKeys.get(MSTAGE_EXTRACTOR_CLASS.getConfig()).getAsString(); + when(state.getProp(MSTAGE_EXTRACTOR_CLASS.getConfig(), StringUtils.EMPTY)).thenReturn(extractorClass); + + authentication = allKeys.get(MSTAGE_AUTHENTICATION.getConfig()).getAsJsonObject(); + token = authentication.get("token").getAsString(); + when(state.getProp(MSTAGE_AUTHENTICATION.getConfig(), new 
JsonObject().toString())).thenReturn(authentication.toString()); + + httpStatus = allKeys.get(MSTAGE_HTTP_STATUSES.getConfig()).getAsJsonObject(); + when(state.getProp(MSTAGE_HTTP_STATUSES.getConfig(), new JsonObject().toString())).thenReturn(httpStatus.toString()); + + httpStatusReasons = allKeys.get(MSTAGE_HTTP_STATUS_REASONS.getConfig()).getAsJsonObject(); + when(state.getProp(MSTAGE_HTTP_STATUS_REASONS.getConfig(), new JsonObject().toString())).thenReturn(httpStatusReasons.toString()); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/connection/JdbcReadConnectionTest.java b/dil/src/test/java/com/linkedin/dil/connection/JdbcReadConnectionTest.java new file mode 100644 index 0000000..191605c --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/connection/JdbcReadConnectionTest.java @@ -0,0 +1,212 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.connection; + +import com.google.common.collect.ImmutableMap; +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import java.io.UnsupportedEncodingException; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import javax.sql.rowset.JdbcRowSet; +import javax.sql.rowset.RowSetMetaDataImpl; +import org.apache.commons.lang3.StringEscapeUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.MutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.exception.RetriableAuthenticationException; +import com.linkedin.dil.extractor.MultistageExtractor; +import com.linkedin.dil.util.Database; +import com.linkedin.dil.keys.ExtractorKeys; +import com.linkedin.dil.keys.JdbcKeys; +import com.linkedin.dil.util.JsonParameter; +import com.linkedin.dil.util.ParameterTypes; +import com.linkedin.dil.util.VariableUtils; +import com.linkedin.dil.util.WorkUnitStatus; +import org.mockito.Mockito; +import org.powermock.api.mockito.PowerMockito; +import org.powermock.core.classloader.annotations.PrepareForTest; +import org.powermock.modules.testng.PowerMockTestCase; +import org.powermock.reflect.Whitebox; +import org.testng.Assert; +import org.testng.annotations.Test; + +import static com.linkedin.dil.configuration.MultistageProperties.*; +import static org.mockito.Matchers.*; +import static org.mockito.Mockito.any; +import static org.mockito.Mockito.anyInt; +import static org.mockito.Mockito.*; + + +@PrepareForTest({Database.class, JsonParameter.class, StringEscapeUtils.class, VariableUtils.class}) +public class JdbcReadConnectionTest extends PowerMockTestCase { + /** + * Test getFirst: + * Scenario 1: Fail to get jdbcConnection + * Scenario 2: Fail to execute statement + * @throws UnsupportedEncodingException + */ + @Test + public void testGetFirst() throws UnsupportedEncodingException, RetriableAuthenticationException { + PowerMockito.mockStatic(JsonParameter.class); + PowerMockito.mockStatic(VariableUtils.class); + PowerMockito.mockStatic(Database.class); + when(JsonParameter.getParametersAsJson(any(), any(), any())).thenReturn(new JsonObject()); + MultistageExtractor extractor = mock(MultistageExtractor.class); + ExtractorKeys extractorKeys = mock(ExtractorKeys.class); + when(extractor.getExtractorKeys()).thenReturn(extractorKeys); + when(extractorKeys.getActivationParameters()).thenReturn(new JsonObject()); + 
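+    // with parameter substitution mocked out, executeFirst still fails to obtain a real JDBC connection and returns null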
when(VariableUtils.replace(any(), any(), any())).thenReturn(new JsonObject()); + + WorkUnitState state = mock(WorkUnitState.class); + when(extractor.getState()).thenReturn(state); + JdbcKeys jdbcSourceKeys = Mockito.mock(JdbcKeys.class); + when(jdbcSourceKeys.getSourceParameters()).thenReturn(new JsonArray()); + + JdbcConnection conn = new JdbcConnection(state, jdbcSourceKeys, extractorKeys); + Assert.assertNull(conn.executeFirst(WorkUnitStatus.builder().build())); + } + + /** + * Test getNext + */ + @Test + public void testGetNext() throws UnsupportedEncodingException, SQLException, RetriableAuthenticationException { + PowerMockito.mockStatic(Database.class); + PowerMockito.mockStatic(VariableUtils.class); + PowerMockito.mockStatic(JsonParameter.class); + + JdbcKeys jdbcSourceKeys = mock(JdbcKeys.class); + when(jdbcSourceKeys.getCallInterval()).thenReturn(1L); + ExtractorKeys extractorKeys = new ExtractorKeys(); + WorkUnitState state = Mockito.mock(WorkUnitState.class); + WorkUnitStatus workUnitStatus = Mockito.mock(WorkUnitStatus.class); + WorkUnitStatus.WorkUnitStatusBuilder builder = Mockito.mock(WorkUnitStatus.WorkUnitStatusBuilder.class); + + JdbcConnection conn = new JdbcConnection(state, jdbcSourceKeys, extractorKeys); + + String jdbcStatement = "select * from linkedin.someTable limit 1000"; + when(jdbcSourceKeys.getJdbcStatement()).thenReturn(jdbcStatement); + extractorKeys.setSignature("testSignature"); + extractorKeys.setActivationParameters(new JsonObject()); + when(builder.build()).thenReturn(workUnitStatus); + when(workUnitStatus.toBuilder()).thenReturn(builder); + + String uri = "jdbc:mysql://odbcva01.clientx.com:3630/linkedin?useSSL=true"; + String username = "username"; + String password = "password"; + when(jdbcSourceKeys.getSourceUri()).thenReturn(uri); + when(state.getProp(SOURCE_CONN_USERNAME.getConfig(), StringUtils.EMPTY)).thenReturn(username); + when(state.getProp(SOURCE_CONN_PASSWORD.getConfig(), StringUtils.EMPTY)).thenReturn(password); + + Pair res = new MutablePair<>(jdbcStatement, new JsonObject()); + when(VariableUtils.replaceWithTracking(any(), any(), any())).thenReturn(res); + when(VariableUtils.replace(any(), any())).thenReturn(new JsonObject()); + when(VariableUtils.replace(any(), any(), any())).thenReturn(new JsonObject()); + + when(jdbcSourceKeys.isPaginationEnabled()).thenReturn(true); + when(jdbcSourceKeys.getSourceParameters()).thenReturn(new JsonArray()); + java.sql.Connection jdbcConnection = PowerMockito.mock(java.sql.Connection.class); + Statement statement = PowerMockito.mock(Statement.class); + PowerMockito.when(jdbcConnection.createStatement()).thenReturn(statement); + ResultSet resultSet = PowerMockito.mock(ResultSet.class); + when(statement.getResultSet()).thenReturn(resultSet); + when(statement.execute(any())).thenReturn(true); + doNothing().when(statement).setFetchSize(anyInt()); + + String unSupportedExtractor = "com.linkedin.dil.extractor.SomeExtractor"; + when(state.getProp(MSTAGE_EXTRACTOR_CLASS.getConfig(), StringUtils.EMPTY)).thenReturn(unSupportedExtractor); + when(JsonParameter.getParametersAsJson(any(), any(), any())).thenReturn(new JsonObject()); + Assert.assertNull(conn.executeNext(workUnitStatus)); + + when(jdbcSourceKeys.getPaginationInitValues()).thenReturn(ImmutableMap.of(ParameterTypes.PAGESIZE, 100L)); + conn.setJdbcConnection(jdbcConnection); + Assert.assertNull(conn.executeNext(workUnitStatus)); + + String supportedExtractor = "com.linkedin.dil.extractor.CsvExtractor"; + 
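+    // switching to a supported extractor class lets the result set be processed, so executeNext returns the same status object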
when(state.getProp(MSTAGE_EXTRACTOR_CLASS.getConfig(), StringUtils.EMPTY)).thenReturn(supportedExtractor); + when(jdbcSourceKeys.hasSourceSchema()).thenReturn(true); + Assert.assertEquals(conn.executeNext(workUnitStatus), workUnitStatus); + + when(statement.execute(any())).thenReturn(false); + Assert.assertEquals(conn.executeNext(workUnitStatus), workUnitStatus); + } + + /** + * Test closeAll + * Scenario: throw an exception + */ + @Test + public void testCloseAll() throws SQLException { + ExtractorKeys extractorKeys = mock(ExtractorKeys.class); + String testSignature = "test_signature"; + when(extractorKeys.getSignature()).thenReturn(testSignature); + JdbcConnection conn = new JdbcConnection(null, new JdbcKeys(), extractorKeys); + conn.closeAll(""); + + java.sql.Connection jdbcConnection = mock(java.sql.Connection.class); + conn.setJdbcConnection(jdbcConnection); + doThrow(new RuntimeException()).when(jdbcConnection).close(); + conn.closeAll(""); + } + + @Test + public void testToCsv() throws Exception { + PowerMockito.mockStatic(StringEscapeUtils.class); + when(StringEscapeUtils.escapeCsv(anyString())).thenReturn("test_data"); + RowSetMetaDataImpl rowSetMetaData = mock(RowSetMetaDataImpl.class); + JdbcRowSet jdbcRowSet = PowerMockito.mock(JdbcRowSet.class); + PowerMockito.when(jdbcRowSet.next()).thenReturn(true).thenReturn(false); + when(rowSetMetaData.getColumnCount()).thenReturn(2); + JdbcConnection conn = new JdbcConnection(null, new JdbcKeys(), null); + Assert.assertEquals(Whitebox.invokeMethod(conn, "toCsv", jdbcRowSet, rowSetMetaData).toString(), + "test_data,test_data" + System.lineSeparator()); + } + + @Test + public void testToJson() throws Exception { + PowerMockito.mockStatic(StringEscapeUtils.class); + when(StringEscapeUtils.escapeCsv(anyString())).thenReturn("test_data"); + RowSetMetaDataImpl rowSetMetaData = mock(RowSetMetaDataImpl.class); + JdbcRowSet jdbcRowSet = PowerMockito.mock(JdbcRowSet.class); + PowerMockito.when(jdbcRowSet.next()) + .thenReturn(true).thenReturn(false) + .thenReturn(true).thenReturn(false) + .thenReturn(true).thenReturn(false); + + when(rowSetMetaData.getColumnCount()).thenReturn(2); + when(rowSetMetaData.getColumnName(1)).thenReturn("column0"); + when(rowSetMetaData.getColumnName(2)).thenReturn("column1"); + JdbcConnection conn = new JdbcConnection(null, new JdbcKeys(), null); + + Assert.assertEquals(Whitebox.invokeMethod(conn, "toJson", jdbcRowSet, rowSetMetaData).toString(), + "[{\"column0\":null,\"column1\":null}]"); + + conn.getJdbcSourceKeys().setSchemaRefactorFunction("toupper"); + Assert.assertEquals(Whitebox.invokeMethod(conn, "toJson", jdbcRowSet, rowSetMetaData).toString(), + "[{\"COLUMN0\":null,\"COLUMN1\":null}]"); + + conn.getJdbcSourceKeys().setSchemaRefactorFunction("tolower"); + Assert.assertEquals(Whitebox.invokeMethod(conn, "toJson", jdbcRowSet, rowSetMetaData).toString(), + "[{\"column0\":null,\"column1\":null}]"); + } + + /** + * Test retrieveSchema + */ + @Test + public void testRetrieveSchema() throws Exception { + RowSetMetaDataImpl rowSetMetaData = mock(RowSetMetaDataImpl.class); + when(rowSetMetaData.getColumnCount()).thenReturn(1); + when(rowSetMetaData.isNullable(1)).thenReturn(2); + when(rowSetMetaData.getColumnName(1)).thenReturn("columnValue"); + when(rowSetMetaData.getColumnType(1)).thenReturn(1); + JdbcConnection conn = new JdbcConnection(null, new JdbcKeys(), null); + Assert.assertEquals(Whitebox.invokeMethod(conn, "retrieveSchema", rowSetMetaData).toString(), + 
"[{\"columnName\":\"columnValue\",\"isNullable\":false,\"dataType\":{\"type\":\"string\"}}]"); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/connection/MulstistageReadConnectionTest.java b/dil/src/test/java/com/linkedin/dil/connection/MulstistageReadConnectionTest.java new file mode 100644 index 0000000..5b3e969 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/connection/MulstistageReadConnectionTest.java @@ -0,0 +1,64 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.connection; + +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import gobblin.configuration.SourceState; +import java.io.UnsupportedEncodingException; +import com.linkedin.dil.exception.RetriableAuthenticationException; +import com.linkedin.dil.keys.ExtractorKeys; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.util.VariableUtils; +import com.linkedin.dil.util.WorkUnitStatus; +import org.mockito.Mockito; +import org.powermock.api.mockito.PowerMockito; +import org.powermock.core.classloader.annotations.PrepareForTest; +import org.powermock.modules.testng.PowerMockTestCase; +import org.testng.Assert; +import org.testng.annotations.Test; + +import static org.powermock.api.mockito.PowerMockito.*; + + +@Test +@PrepareForTest(VariableUtils.class) +public class MulstistageReadConnectionTest extends PowerMockTestCase { + @Test + public void testGetNext() throws RetriableAuthenticationException { + MultistageConnection conn = new MultistageConnection(new SourceState(), new JobKeys(), new ExtractorKeys()); + conn.getExtractorKeys().setSignature("testSignature"); + conn.getExtractorKeys().setActivationParameters(new JsonObject()); + + WorkUnitStatus workUnitStatus = Mockito.mock(WorkUnitStatus.class); + WorkUnitStatus.WorkUnitStatusBuilder builder = Mockito.mock(WorkUnitStatus.WorkUnitStatusBuilder.class); + when(builder.build()).thenReturn(workUnitStatus); + when(workUnitStatus.toBuilder()).thenReturn(builder); + Assert.assertEquals(conn.executeNext(workUnitStatus), workUnitStatus); + + // cover the exception branch + JobKeys jobKeys = Mockito.mock(JobKeys.class); + when(jobKeys.getCallInterval()).thenReturn(1L); + conn.setJobKeys(jobKeys); + when(jobKeys.getSourceParameters()).thenReturn(new JsonArray()); + when(jobKeys.getCallInterval()).thenThrow(Mockito.mock(IllegalArgumentException.class)); + conn.executeNext(workUnitStatus); + Assert.assertEquals(conn.executeNext(workUnitStatus), workUnitStatus); + } + + @Test + public void testGetWorkUnitSpecificString() throws UnsupportedEncodingException { + // Test normal case + MultistageConnection conn = new MultistageConnection(new SourceState(), new JobKeys(), new ExtractorKeys()); + String template = "test_template"; + JsonObject obj = new JsonObject(); + Assert.assertEquals(conn.getWorkUnitSpecificString(template, obj), template); + + // Test exception by PowerMock + PowerMockito.mockStatic(VariableUtils.class); + when(VariableUtils.replaceWithTracking(template, obj, false)).thenThrow(UnsupportedEncodingException.class); + Assert.assertEquals(conn.getWorkUnitSpecificString(template, obj), template); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/connection/S3ReadConnectionOnlineTest.java b/dil/src/test/java/com/linkedin/dil/connection/S3ReadConnectionOnlineTest.java new file mode 100644 index 0000000..5264f51 --- /dev/null +++ 
b/dil/src/test/java/com/linkedin/dil/connection/S3ReadConnectionOnlineTest.java @@ -0,0 +1,32 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.connection; + +import org.apache.gobblin.runtime.embedded.EmbeddedGobblin; +import org.testng.Assert; +import org.testng.annotations.Test; + + +/** + * These online tests are true Gobblin jobs. Their execution depends on: + * 1. a complete pull file + * 2. the online resource that is being pulled + * + * To execute these tests or debug with them, please enable the test and ensure + * above conditions are met. + */ +public class S3ReadConnectionOnlineTest { + @Test(enabled = false) + void testS3ClientWithApacheHttpClient() throws Exception { + EmbeddedGobblin job = new EmbeddedGobblin("test"); + Assert.assertTrue(job.jobFile(getClass().getResource("/pull/s3-csv.pull").getPath()).run().isSuccessful()); + } + + @Test(enabled = false) + void testS3WithFileDump() throws Exception { + EmbeddedGobblin job = new EmbeddedGobblin("test"); + Assert.assertTrue(job.jobFile(getClass().getResource("/pull/s3-filedump.pull").getPath()).run().isSuccessful()); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/connection/S3ReadConnectionTest.java b/dil/src/test/java/com/linkedin/dil/connection/S3ReadConnectionTest.java new file mode 100644 index 0000000..9344525 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/connection/S3ReadConnectionTest.java @@ -0,0 +1,40 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.connection; + +import gobblin.runtime.JobState; +import java.util.List; +import org.apache.gobblin.configuration.SourceState; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.keys.ExtractorKeys; +import com.linkedin.dil.source.MultistageSource; +import com.linkedin.dil.source.S3SourceV2; +import org.apache.gobblin.source.workunit.WorkUnit; +import org.testng.Assert; +import org.testng.annotations.Test; + + +@Test +public class S3ReadConnectionTest { + @Test + public void testGetS3HttpClient() { + List wus = new MultistageSource().getWorkunits(new SourceState()); + WorkUnitState wuState = new WorkUnitState(wus.get(0), new JobState()); + wuState.setProp(MultistageProperties.MSTAGE_HTTP_CLIENT_FACTORY.getConfig(), "com.linkedin.dil.factory.ApacheHttpClientFactory"); + + S3SourceV2 source = new S3SourceV2(); + SourceState sourceState = new SourceState(); + sourceState.setProp(MultistageProperties.MSTAGE_SOURCE_URI.getConfig(), "https://nonexist.s3.amazonaws.com/data"); + source.getWorkunits(sourceState); + + S3Connection conn = new S3Connection(wuState, source.getS3SourceV2Keys(), new ExtractorKeys()); + Assert.assertNotNull(conn.getS3HttpClient(wuState)); + + conn = new S3Connection(wuState, source.getS3SourceV2Keys(), new ExtractorKeys()); + conn.getS3SourceV2Keys().setConnectionTimeout(10); + Assert.assertNotNull(conn.getS3HttpClient(wuState)); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/converter/AvroNormalizerConverterTest.java b/dil/src/test/java/com/linkedin/dil/converter/AvroNormalizerConverterTest.java new file mode 100644 index 0000000..da57651 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/converter/AvroNormalizerConverterTest.java @@ -0,0 
+1,81 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.converter; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import lombok.SneakyThrows; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.gobblin.configuration.WorkUnitState; +import org.apache.gobblin.converter.DataConversionException; +import org.apache.gobblin.converter.SchemaConversionException; +import org.apache.gobblin.converter.avro.UnsupportedDateTypeException; +import com.linkedin.dil.util.AvroSchemaUtils; +import org.apache.gobblin.source.workunit.Extract; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static org.mockito.Mockito.*; + + +public class AvroNormalizerConverterTest { + AvroNormalizerConverter _avroNormalizerConverter; + String sourceSchema = "[{\"columnName\":\"asIs\",\"isNullable\":\"false\",\"dataType\":{\"type\":\"string\"}}," + + "{\"columnName\":\"toBeNormalized1\",\"isNullable\":\"false\",\"dataType\":{\"type\":\"string\"}}," + + "{\"columnName\":\"toBeNormalized2\",\"isNullable\":\"false\",\"dataType\":{\"type\":\"string\"}}]"; + String targetSchema = "[{\"columnName\":\"asIs\",\"isNullable\":\"false\",\"dataType\":{\"type\":\"string\"}}," + + "{\"columnName\":\"normalized\",\"isNullable\":\"false\",\"dataType\":{\"type\":\"string\"}}]"; + Schema inputSchema; + Schema outputSchema; + WorkUnitState state; + + @SneakyThrows + @BeforeMethod + public void beforeMethod() { + _avroNormalizerConverter = new AvroNormalizerConverter(); + state = mock(WorkUnitState.class); + Extract extract = new Extract(Extract.TableType.SNAPSHOT_ONLY, "com.linkedin.test", "test"); + when(state.getProp("ms.target.schema", new JsonArray().toString())).thenReturn(targetSchema); + when(state.getPropAsLong("ms.normalizer.batch.size", 0L)).thenReturn(2L); + when(state.getExtract()).thenReturn(extract); + _avroNormalizerConverter.init(state); + inputSchema = AvroSchemaUtils.fromJsonSchema(new Gson().fromJson(sourceSchema, JsonArray.class), state); + outputSchema = AvroSchemaUtils.fromJsonSchema(new Gson().fromJson(targetSchema, JsonArray.class), state); + } + + @Test + public void testConvertSchema() throws SchemaConversionException, UnsupportedDateTypeException { + Schema schema = _avroNormalizerConverter.convertSchema(inputSchema, state); + + Assert.assertEquals(schema, + AvroSchemaUtils.fromJsonSchema(new Gson().fromJson(targetSchema, JsonArray.class), state)); + } + + @Test + public void testConvertRecord() throws SchemaConversionException, DataConversionException { + _avroNormalizerConverter.convertSchema(inputSchema, state); + GenericRecord inputRecord = new GenericData.Record(inputSchema); + inputRecord.put("asIs", "dummy"); + inputRecord.put("toBeNormalized1", "dummy"); + inputRecord.put("toBeNormalized2", "dummy"); + // Call twice to make sure the resulting record gives JsonArray size 2 + _avroNormalizerConverter.convertRecord(outputSchema, inputRecord, state); + Iterable recordIterable = _avroNormalizerConverter.convertRecord(outputSchema, inputRecord, state); + GenericRecord record = recordIterable.iterator().next(); + GenericData.Array normalized = (GenericData.Array) record.get("normalized"); + Assert.assertEquals(normalized.size(), 2); + // There's 1 record in the buffer before before passing eof 
+ _avroNormalizerConverter.convertRecord(outputSchema, record, state); + GenericRecord eof = AvroSchemaUtils.createEOF(state); + record = _avroNormalizerConverter.convertRecord(outputSchema, eof, state).iterator().next(); + normalized = (GenericData.Array) record.get("normalized"); + Assert.assertEquals(normalized.size(), 1); + // When there are no records in the buffer calling before eof + Assert.assertFalse(_avroNormalizerConverter.convertRecord(outputSchema, eof, state).iterator().hasNext()); + } +} \ No newline at end of file diff --git a/dil/src/test/java/com/linkedin/dil/converter/JsonNormalizerConverterTest.java b/dil/src/test/java/com/linkedin/dil/converter/JsonNormalizerConverterTest.java new file mode 100644 index 0000000..25b09d9 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/converter/JsonNormalizerConverterTest.java @@ -0,0 +1,65 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.converter; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import org.apache.gobblin.configuration.WorkUnitState; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + + +@Test +public class JsonNormalizerConverterTest { + JsonNormalizerConverter underTest; + String sourceSchema = "[{\"columnName\":\"asIs\",\"isNullable\":\"false\",\"dataType\":{\"type\":\"string\"}}," + + "{\"columnName\":\"toBeNormalized1\",\"isNullable\":\"false\",\"dataType\":{\"type\":\"string\"}}," + + "{\"columnName\":\"toBeNormalized2\",\"isNullable\":\"false\",\"dataType\":{\"type\":\"string\"}}]"; + String targetSchema = "[{\"columnName\":\"asIs\",\"isNullable\":\"false\",\"dataType\":{\"type\":\"string\"}}," + + "{\"columnName\":\"normalized\",\"isNullable\":\"false\",\"dataType\":{\"type\":\"string\"}}]"; + JsonArray inputSchema; + JsonArray outputSchema; + WorkUnitState state; + + @BeforeMethod + public void beforeMethod() { + underTest = new JsonNormalizerConverter(); + state = new WorkUnitState(); + state.setProp("ms.target.schema", targetSchema); + state.setProp("ms.normalizer.batch.size", 2); + underTest.init(state); + inputSchema = new Gson().fromJson(sourceSchema, JsonArray.class); + outputSchema = new Gson().fromJson(targetSchema, JsonArray.class); + } + + @Test + public void testConvertSchema() { + Assert.assertTrue(outputSchema.equals(underTest.convertSchema(inputSchema, state))); + } + + @Test + public void testConvertRecord() { + underTest.convertSchema(inputSchema, state); + JsonObject record = new JsonObject(); + record.addProperty("asIs", "dummy"); + record.addProperty("toBeNormalized1", "dummy"); + record.addProperty("toBeNormalized2", "dummy"); + // Call twice to make sure the resulting record gives JsonArray size 2 + underTest.convertRecord(outputSchema, record, state); + Iterable recordIterable = underTest.convertRecord(outputSchema, record, state); + JsonObject jsonObject = recordIterable.iterator().next(); + Assert.assertEquals(jsonObject.getAsJsonArray("normalized").size(), 2); + // There's 1 record in the buffer before before passing eof + underTest.convertRecord(outputSchema, record,state); + JsonObject eof = new JsonObject(); + eof.addProperty("EOF", "EOF"); + jsonObject = underTest.convertRecord(outputSchema, eof, state).iterator().next(); + Assert.assertEquals(jsonObject.getAsJsonArray("normalized").size(), 1); + // When there are 
no records in the buffer calling before eof + Assert.assertFalse(underTest.convertRecord(outputSchema, eof, state).iterator().hasNext()); + } +} \ No newline at end of file diff --git a/dil/src/test/java/com/linkedin/dil/extractor/AvroExtractorTest.java b/dil/src/test/java/com/linkedin/dil/extractor/AvroExtractorTest.java new file mode 100644 index 0000000..0180764 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/extractor/AvroExtractorTest.java @@ -0,0 +1,332 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.extractor; + +import com.google.common.base.Optional; +import com.google.common.collect.ImmutableMap; +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import java.io.InputStream; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import lombok.extern.slf4j.Slf4j; +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.commons.lang.StringUtils; +import org.apache.gobblin.configuration.SourceState; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.connection.MultistageConnection; +import com.linkedin.dil.exception.RetriableAuthenticationException; +import com.linkedin.dil.keys.AvroExtractorKeys; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.source.HttpSource; +import com.linkedin.dil.source.MultistageSource; +import com.linkedin.dil.util.JsonUtils; +import com.linkedin.dil.util.WorkUnitStatus; +import org.apache.gobblin.source.workunit.Extract; +import org.apache.gobblin.source.workunit.WorkUnit; +import org.apache.gobblin.util.AvroUtils; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.joda.time.Period; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; +import org.mockito.Mockito; +import org.powermock.reflect.Whitebox; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.Test; + +import static com.linkedin.dil.configuration.MultistageProperties.*; +import static com.linkedin.dil.configuration.StaticConstants.*; +import static org.mockito.Mockito.*; + + +@Test +@Slf4j +public class AvroExtractorTest { + + private final static String DATA_SET_URN_KEY = "com.linkedin.somecase.SeriesCollection"; + private final static String ACTIVATION_PROP = "{\"name\": \"survey\", \"type\": \"unit\", \"units\": \"id1,id2\"}"; + private final static String DATA_FINAL_DIR = "/jobs/testUser/gobblin/useCaseRoot"; + private final static String FILE_PERMISSION = "775"; + private final static long ONE_HOUR_IN_MILLS = 3600000L; + private final static long WORK_UNIT_START_TIME_KEY = 1590994800000L; + JsonArray outputJsonSchema; + private WorkUnitState state; + private SourceState sourceState; + private MultistageSource multiStageSource; + private HttpSource httpSource; + private HttpSource realHttpSource; + private WorkUnit workUnit; + private JobKeys jobKeys; + private AvroExtractor avroExtractor; + private WorkUnitStatus workUnitStatus; + private AvroExtractorKeys avroExtractorKeys; + private MultistageConnection multistageConnection; + + @BeforeMethod + public void setUp() throws RetriableAuthenticationException { + 
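+    // mock the work unit state, sources, and extractor keys shared by every test in this class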
state = mock(WorkUnitState.class); + sourceState = mock(SourceState.class); + multiStageSource = mock(MultistageSource.class); + httpSource = mock(HttpSource.class); + realHttpSource = new HttpSource(); + + List wus = new MultistageSource().getWorkunits(new SourceState()); + workUnit = wus.get(0); + workUnit.setProp(MultistageProperties.DATASET_URN_KEY.getConfig(), DATA_SET_URN_KEY); + + jobKeys = mock(JobKeys.class); + workUnitStatus = mock(WorkUnitStatus.class); + + avroExtractorKeys = mock(AvroExtractorKeys.class); + when(avroExtractorKeys.getActivationParameters()).thenReturn(new JsonObject()); + + outputJsonSchema = new JsonArray(); + + // mock for state + when(state.getWorkunit()).thenReturn(workUnit); + when(state.getProp(MSTAGE_ACTIVATION_PROPERTY.getConfig(), new JsonObject().toString())).thenReturn(ACTIVATION_PROP); + when(state.getPropAsLong(MSTAGE_WORKUNIT_STARTTIME_KEY.getConfig(), 0L)).thenReturn(WORK_UNIT_START_TIME_KEY); + when(state.getProp(DATA_PUBLISHER_FINAL_DIR.getConfig(), StringUtils.EMPTY)).thenReturn(DATA_FINAL_DIR); + when(state.getProp(MSTAGE_EXTRACTOR_TARGET_FILE_PERMISSION.getConfig(), StringUtils.EMPTY)).thenReturn(FILE_PERMISSION); + Extract extract = new Extract(Extract.TableType.SNAPSHOT_ONLY, "com.linkedin.test", "test"); + when(state.getExtract()).thenReturn(extract); + // mock for source state + when(sourceState.getProp("extract.table.type", "SNAPSHOT_ONLY")).thenReturn("SNAPSHOT_ONLY"); + when(sourceState.contains("source.conn.use.proxy.url")).thenReturn(true); + + + // mock for source + when(multiStageSource.getJobKeys()).thenReturn(jobKeys); + + // mock for source keys + when(jobKeys.getOutputSchema()).thenReturn(outputJsonSchema); + when(jobKeys.getDerivedFields()).thenReturn(new HashMap<>()); + + avroExtractor = new AvroExtractor(state, multiStageSource.getJobKeys()); + avroExtractor.setAvroExtractorKeys(avroExtractorKeys); + avroExtractor.jobKeys = jobKeys; + + multistageConnection = Mockito.mock(MultistageConnection.class); + when(multistageConnection.executeFirst(workUnitStatus)).thenReturn(workUnitStatus); + when(multistageConnection.executeNext(workUnitStatus)).thenReturn(workUnitStatus); + avroExtractor.setConnection(multistageConnection); } + + @BeforeTest + public void setup() { + if (System.getProperty("hadoop.home.dir") == null) { + System.setProperty("hadoop.home.dir", "/tmp"); + } + } + + /** + * testing vanilla Avro Extractor with a blank file + */ + @Test + public void testExtractAvroWithEmptyFile() throws RetriableAuthenticationException { + InputStream inputStream = getClass().getResourceAsStream("/avro/empty_file.avro"); + WorkUnitStatus status = WorkUnitStatus.builder().buffer(inputStream).build(); + + when(sourceState.getProp("ms.output.schema", "" )).thenReturn(""); + + // replace mocked keys with default keys + realHttpSource.getWorkunits(sourceState); + avroExtractor.jobKeys = realHttpSource.getJobKeys(); + avroExtractor.setAvroExtractorKeys(new AvroExtractorKeys()); + + when(multistageConnection.executeFirst(avroExtractor.workUnitStatus)).thenReturn(status); + + // schema should be a minimum schema with no field + Schema schema = avroExtractor.getSchema(); + Assert.assertEquals(0, schema.getFields().size()); + + // there should be 0 records processed + GenericRecord rst = avroExtractor.readRecord(null); + Assert.assertNull(rst); + while (avroExtractor.hasNext()) { + avroExtractor.readRecord(null); + } + Assert.assertEquals(0, avroExtractor.getAvroExtractorKeys().getProcessedCount()); + } + + private GenericRecord 
createSingletonRecordWithString(String val) { + return createSingletonRecordWithString("test", val); + } + + private GenericRecord createSingletonRecordWithString(String key, String val) { + Schema schema = SchemaBuilder.record("Test").namespace("com.linkedin.test") + .doc("Test record").fields() + .name(key).doc("test").type().stringType() + .noDefault().endRecord(); + GenericRecord record = new GenericData.Record(schema); + record.put(key, val); + return record; + } + + @Test + public void testAddDerivedFields() throws Exception { + avroExtractor.setTimezone("America/Los_Angeles"); + + // derived field is in unsupported type and the source is non-existent + Map> derivedFields = ImmutableMap.of("formula", + ImmutableMap.of("type", "non-epoc", "source", "start_time", "format", "yyyy-MM-dd")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + GenericRecord row = createSingletonRecordWithString("testVal"); + GenericRecord res = Whitebox.invokeMethod(avroExtractor, "addDerivedFields", row); + Assert.assertEquals(res.getSchema().getFields().size(), 1); + Optional fieldValue = AvroUtils.getFieldValue(res, "test"); + Assert.assertTrue(fieldValue.isPresent()); + Assert.assertEquals(fieldValue.get().toString(), "testVal"); + + // derived field is empty early exit + derivedFields = ImmutableMap.of(); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + row = createSingletonRecordWithString("testVal"); + res = Whitebox.invokeMethod(avroExtractor, "addDerivedFields", row); + Assert.assertEquals(res.getSchema().getFields().size(), 1); + fieldValue = AvroUtils.getFieldValue(res, "test"); + Assert.assertTrue(fieldValue.isPresent()); + Assert.assertEquals(fieldValue.get().toString(), "testVal"); + + // derived field is currentdate + derivedFields = ImmutableMap.of("current_date", + ImmutableMap.of("type", "epoc", "source", "currentdate")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + row = createSingletonRecordWithString("testVal"); + res = Whitebox.invokeMethod(avroExtractor, "addDerivedFields", row); + Assert.assertEquals(res.getSchema().getFields().size(), 2); + fieldValue = AvroUtils.getFieldValue(res, "test"); + Assert.assertTrue(fieldValue.isPresent()); + Assert.assertEquals(fieldValue.get().toString(), "testVal"); + fieldValue = AvroUtils.getFieldValue(res, "current_date"); + Assert.assertTrue(fieldValue.isPresent()); + Assert.assertTrue(Math.abs((Long)fieldValue.get() - DateTime.now().getMillis()) < ONE_HOUR_IN_MILLS); + + // derived field is P1D + derivedFields = ImmutableMap.of("current_date", + ImmutableMap.of("type", "epoc", "source", "P1D")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + row = createSingletonRecordWithString("testVal"); + res = Whitebox.invokeMethod(avroExtractor, "addDerivedFields", row); + Assert.assertEquals(res.getSchema().getFields().size(), 2); + fieldValue = AvroUtils.getFieldValue(res, "test"); + Assert.assertTrue(fieldValue.isPresent()); + Assert.assertEquals(fieldValue.get().toString(), "testVal"); + DateTimeZone timeZone = DateTimeZone.forID("America/Los_Angeles"); + Period period = Period.parse("P1D"); + long p1d = DateTime.now().withZone(timeZone).minus(period).dayOfMonth().roundFloorCopy().getMillis(); + fieldValue = AvroUtils.getFieldValue(res, "current_date"); + Assert.assertTrue(fieldValue.isPresent()); + Assert.assertTrue(Math.abs((Long)fieldValue.get() - p1d) < ONE_HOUR_IN_MILLS); + + // derived field is in the specified format + derivedFields = ImmutableMap.of("current_date", + 
ImmutableMap.of("type", "epoc", "source", "start_time", "format", "yyyy-MM-dd")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + row = createSingletonRecordWithString("start_time", "2020-06-01"); + res = Whitebox.invokeMethod(avroExtractor, "addDerivedFields", row); + Assert.assertEquals(res.getSchema().getFields().size(), 2); + fieldValue = AvroUtils.getFieldValue(res, "start_time"); + Assert.assertTrue(fieldValue.isPresent()); + Assert.assertEquals(fieldValue.get().toString(), "2020-06-01"); + DateTimeFormatter datetimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd"); + fieldValue = AvroUtils.getFieldValue(res, "current_date"); + Assert.assertTrue(fieldValue.isPresent()); + Assert.assertEquals((long)fieldValue.get(), datetimeFormatter.parseDateTime("2020-06-01").getMillis()); + + // derived field is NOT in the specified format + derivedFields = ImmutableMap.of("current_date", + ImmutableMap.of("type", "epoc", "source", "start_time", "format", "yyyy-MM-dd")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + row = createSingletonRecordWithString("start_time", "notdatetime"); + res = Whitebox.invokeMethod(avroExtractor, "addDerivedFields", row); + // Since the type is supported, we created a new record with new columns. + // In reality, the work unit will fail when processing the derived field's value. + Assert.assertEquals(res.getSchema().getFields().size(), 2); + fieldValue = AvroUtils.getFieldValue(res, "start_time"); + Assert.assertTrue(fieldValue.isPresent()); + Assert.assertEquals(fieldValue.get().toString(), "notdatetime"); + + // derived field is boolean + derivedFields = ImmutableMap.of("partial_failure", + ImmutableMap.of("type", "boolean", "value", "true")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + row = createSingletonRecordWithString("testVal"); + res = Whitebox.invokeMethod(avroExtractor, "addDerivedFields", row); + Assert.assertEquals(res.getSchema().getFields().size(), 2); + fieldValue = AvroUtils.getFieldValue(res, "test"); + Assert.assertTrue(fieldValue.isPresent()); + Assert.assertEquals(fieldValue.get().toString(), "testVal"); + fieldValue = AvroUtils.getFieldValue(res, "partial_failure"); + Assert.assertTrue(fieldValue.isPresent()); + Assert.assertTrue((boolean) fieldValue.get()); + + // derived fields are from variables + JsonObject parameters = new JsonObject(); + parameters.addProperty("dateString", "2019-11-01 12:00:00"); + parameters.addProperty("someInteger", 123456); + parameters.addProperty("someNumber", 123.456); + parameters.addProperty("someEpoc", 1601038688000L); + parameters.addProperty("someBoolean", true); + avroExtractor.currentParameters = parameters; + + derivedFields = ImmutableMap.of("dateString", + ImmutableMap.of("type", "string", "source", "{{dateString}}"), + "someInteger", + ImmutableMap.of("type", "integer", "source", "{{someInteger}}"), + "someEpoc", + ImmutableMap.of("type", "epoc", "source", "{{someEpoc}}"), + "someNumber", + ImmutableMap.of("type", "number", "source", "{{someNumber}}"), + "someBoolean", + ImmutableMap.of("type", "boolean", "source", "{{someBoolean}}")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + row = createSingletonRecordWithString("start_time", "2020-06-01"); + res = Whitebox.invokeMethod(avroExtractor, "addDerivedFields", row); + Assert.assertEquals(res.getSchema().getFields().size(), 6); + fieldValue = AvroUtils.getFieldValue(res, "start_time"); + Assert.assertTrue(fieldValue.isPresent()); + Assert.assertEquals((String)fieldValue.get(), 
"2020-06-01"); + fieldValue = AvroUtils.getFieldValue(res, "dateString"); + Assert.assertTrue(fieldValue.isPresent()); + Assert.assertEquals((String)fieldValue.get(), "2019-11-01 12:00:00"); + fieldValue = AvroUtils.getFieldValue(res, "someInteger"); + Assert.assertTrue(fieldValue.isPresent()); + Assert.assertEquals((int)fieldValue.get(), 123456); + fieldValue = AvroUtils.getFieldValue(res, "someNumber"); + Assert.assertTrue(fieldValue.isPresent()); + Assert.assertEquals((double)fieldValue.get(), 123.456); + fieldValue = AvroUtils.getFieldValue(res, "someEpoc"); + Assert.assertTrue(fieldValue.isPresent()); + Assert.assertEquals((long)fieldValue.get(), 1601038688000L); + fieldValue = AvroUtils.getFieldValue(res, "someBoolean"); + Assert.assertTrue(fieldValue.isPresent()); + Assert.assertTrue((boolean) fieldValue.get()); + } + + @Test + public void testGetSchema() throws Exception { + Schema avroSchema; + String schemaString = "[{\"columnName\":\"id0\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, " + + "{\"columnName\":\"id1\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, " + + "{\"columnName\":\"id2\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}]"; + JsonArray schemaArray = GSON.fromJson(schemaString, JsonArray.class); + JsonArray schema = JsonUtils.deepCopy(schemaArray).getAsJsonArray(); + when(jobKeys.hasOutputSchema()).thenReturn(true); + when(jobKeys.getOutputSchema()).thenReturn(schema); + when(jobKeys.getOutputSchema()).thenReturn(schemaArray); + avroSchema = Whitebox.invokeMethod(avroExtractor, "getSchema"); + Assert.assertEquals(avroSchema.getFields().size(), 3); + Assert.assertEquals(avroSchema.getName(), "test"); + Assert.assertEquals(avroSchema.getNamespace(), "com.linkedin.test"); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/extractor/CsvExtractorTest.java b/dil/src/test/java/com/linkedin/dil/extractor/CsvExtractorTest.java new file mode 100644 index 0000000..a53a232 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/extractor/CsvExtractorTest.java @@ -0,0 +1,734 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.extractor; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; +import com.opencsv.CSVParserBuilder; +import com.opencsv.CSVReader; +import com.opencsv.CSVReaderBuilder; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.lang.reflect.Method; +import java.nio.charset.StandardCharsets; +import java.util.Deque; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang.StringUtils; +import org.apache.gobblin.configuration.SourceState; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.connection.MultistageConnection; +import com.linkedin.dil.exception.RetriableAuthenticationException; +import com.linkedin.dil.filter.JsonSchemaBasedFilter; +import com.linkedin.dil.keys.CsvExtractorKeys; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.source.HttpSource; +import com.linkedin.dil.source.MultistageSource; +import com.linkedin.dil.util.WorkUnitStatus; +import org.apache.gobblin.source.extractor.utils.InputStreamCSVReader; +import org.apache.gobblin.source.workunit.WorkUnit; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.joda.time.Period; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; +import org.mockito.Mockito; +import org.powermock.api.mockito.PowerMockito; +import org.powermock.reflect.Whitebox; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.Test; + +import static com.linkedin.dil.configuration.MultistageProperties.*; +import static org.mockito.Mockito.*; + + +@Test +@Slf4j +public class CsvExtractorTest { + + private final static String DATA_SET_URN_KEY = "com.linkedin.somecase.SeriesCollection"; + private final static String ACTIVATION_PROP = "{\"name\": \"survey\", \"type\": \"unit\", \"units\": \"id1,id2\"}"; + private final static String DATA_FINAL_DIR = "/jobs/testUser/gobblin/useCaseRoot"; + private final static String FILE_PERMISSION = "775"; + private final static long ONE_HOUR_IN_MILLS = 3600000L; + private final static long WORK_UNIT_START_TIME_KEY = 1590994800000L; + JsonArray outputJsonSchema; + JsonObject schema; + private WorkUnitState state; + private SourceState sourceState; + private MultistageSource multiStageSource; + private HttpSource httpSource; + private HttpSource realHttpSource; + private WorkUnit workUnit; + private JobKeys jobKeys; + private CsvExtractor csvExtractor; + private WorkUnitStatus workUnitStatus; + private CsvExtractorKeys csvExtractorKeys; + private MultistageConnection multistageConnection; + + @BeforeMethod + public void setUp() throws RetriableAuthenticationException { + state = mock(WorkUnitState.class); + sourceState = mock(SourceState.class); + multiStageSource = mock(MultistageSource.class); + httpSource = mock(HttpSource.class); + realHttpSource = new HttpSource(); + + List wus = new MultistageSource().getWorkunits(new SourceState()); + workUnit = wus.get(0); + workUnit.setProp(MultistageProperties.DATASET_URN_KEY.getConfig(), DATA_SET_URN_KEY); + + jobKeys = 
mock(JobKeys.class); + workUnitStatus = mock(WorkUnitStatus.class); + + csvExtractorKeys = mock(CsvExtractorKeys.class); + when(csvExtractorKeys.getActivationParameters()).thenReturn(new JsonObject()); + + + outputJsonSchema = new JsonArray(); + schema = new JsonObject(); + + // mock for state + when(state.getWorkunit()).thenReturn(workUnit); + when(state.getProp(MSTAGE_ACTIVATION_PROPERTY.getConfig(), new JsonObject().toString())).thenReturn(ACTIVATION_PROP); + when(state.getPropAsLong(MSTAGE_WORKUNIT_STARTTIME_KEY.getConfig(), 0L)).thenReturn(WORK_UNIT_START_TIME_KEY); + when(state.getProp(DATA_PUBLISHER_FINAL_DIR.getConfig(), StringUtils.EMPTY)).thenReturn(DATA_FINAL_DIR); + when(state.getProp(MSTAGE_EXTRACTOR_TARGET_FILE_PERMISSION.getConfig(), StringUtils.EMPTY)).thenReturn(FILE_PERMISSION); + + // mock for source state + when(sourceState.getProp("extract.table.type", "SNAPSHOT_ONLY")).thenReturn("SNAPSHOT_ONLY"); + when(sourceState.contains("source.conn.use.proxy.url")).thenReturn(true); + + + // mock for source + when(multiStageSource.getJobKeys()).thenReturn(jobKeys); + + // mock for source keys + when(jobKeys.getOutputSchema()).thenReturn(outputJsonSchema); + when(jobKeys.getDerivedFields()).thenReturn(new HashMap<>()); + + csvExtractor = new CsvExtractor(state, multiStageSource.getJobKeys()); + csvExtractor.setCsvExtractorKeys(csvExtractorKeys); + csvExtractor.jobKeys = jobKeys; + + multistageConnection = Mockito.mock(MultistageConnection.class); + when(multistageConnection.executeFirst(workUnitStatus)).thenReturn(workUnitStatus); + when(multistageConnection.executeNext(workUnitStatus)).thenReturn(workUnitStatus); + csvExtractor.setConnection(multistageConnection); } + + @BeforeTest + public void setup() { + if (System.getProperty("hadoop.home.dir") == null) { + System.setProperty("hadoop.home.dir", "/tmp"); + } + } + + /** + * testing vanilla CSV Extractor + */ + @Test + void testExtractCSV1() throws RetriableAuthenticationException { + InputStream inputStream = getClass().getResourceAsStream("/csv/common-crawl-files.csv"); + WorkUnitStatus status = WorkUnitStatus.builder().buffer(inputStream).build(); + + when(sourceState.getProp("ms.output.schema", "" )).thenReturn(""); + when(state.getProp("ms.csv.separator", "")).thenReturn("u0009"); + + // replace mocked keys with default keys + realHttpSource.getWorkunits(sourceState); + csvExtractor.jobKeys = realHttpSource.getJobKeys(); + csvExtractor.setCsvExtractorKeys(new CsvExtractorKeys()); + + when(multistageConnection.executeFirst(csvExtractor.workUnitStatus)).thenReturn(status); + + csvExtractor.readRecord(null); + while (csvExtractor.hasNext()) { + String[] rst = csvExtractor.readRecord(null); + } + Assert.assertEquals(10, csvExtractor.getCsvExtractorKeys().getProcessedCount()); + } + + /** + * testing u0004 + */ + @Test + void testExtractCSV2() throws RetriableAuthenticationException { + InputStream inputStream = getClass().getResourceAsStream("/csv/ctl_d_text.dat"); + WorkUnitStatus status = WorkUnitStatus.builder().buffer(inputStream).build(); + + when(sourceState.getProp("ms.output.schema", "" )).thenReturn(""); + when(state.getProp("ms.csv.separator", "")).thenReturn("u0004"); + + realHttpSource.getWorkunits(sourceState); + CsvExtractor extractor = new CsvExtractor(state, realHttpSource.getHttpSourceKeys()); + when(multistageConnection.executeFirst(extractor.workUnitStatus)).thenReturn(status); + extractor.setConnection(multistageConnection); + + extractor.readRecord(null); + while (extractor.hasNext()) { + String[] x = 
extractor.readRecord(null); + Assert.assertNotNull(x); + Assert.assertEquals(15, x.length); + } + Assert.assertEquals(2, extractor.getCsvExtractorKeys().getProcessedCount()); + } + + @Test + void testExtractCSV3() throws RetriableAuthenticationException { + InputStream inputStream = getClass().getResourceAsStream("/csv/comma-separated.csv"); + WorkUnitStatus status = WorkUnitStatus.builder().buffer(inputStream).build(); + + when(sourceState.getProp("ms.output.schema", "" )).thenReturn(""); + when(MultistageProperties.MSTAGE_CSV_SEPARATOR.getValidNonblankWithDefault(state)).thenReturn("u002c"); + + realHttpSource.getWorkunits(sourceState); + CsvExtractor extractor = new CsvExtractor(state, realHttpSource.getHttpSourceKeys()); + when(multistageConnection.executeFirst(extractor.workUnitStatus)).thenReturn(status); + extractor.setConnection(multistageConnection); + + extractor.readRecord(null); + while (extractor.hasNext()) { + String[] record = extractor.readRecord(null); + Assert.assertNotNull(record); + Assert.assertEquals(record.length, 2); + } + Assert.assertEquals(5, extractor.getCsvExtractorKeys().getProcessedCount()); + } + + /** + * testing CSV extractor with Gunzip processor + */ + @Test + void testExtractGzippedCSV() throws RetriableAuthenticationException { + InputStream inputStream = getClass().getResourceAsStream("/gzip/cc-index.paths.gz"); + WorkUnitStatus status = WorkUnitStatus.builder().buffer(inputStream).build(); + + when(sourceState.getProp("ms.output.schema", "" )).thenReturn(""); + when(state.getProp("ms.extract.preprocessors", "")).thenReturn("com.linkedin.dil.preprocessor.GunzipProcessor"); + + realHttpSource.getWorkunits(sourceState); + CsvExtractor extractor = new CsvExtractor(state, realHttpSource.getHttpSourceKeys()); + when(multistageConnection.executeFirst(extractor.workUnitStatus)).thenReturn(status); + extractor.setConnection(multistageConnection); + + extractor.readRecord(null); + while (extractor.hasNext()) { + extractor.readRecord(null); + } + Assert.assertEquals(302, extractor.getCsvExtractorKeys().getProcessedCount()); + } + + + /** + * testing CSV Extractor schema inference + * In this case, a column name contains an illegal character. Since ms.enable.cleansing is enabled by default, + * "$" in the header should be converted to "_" but the actual data will not be cleansed. 
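+ * For example, a header column such as "id$0" is expected to surface in the schema as "id_0",
+ * while a data value like "497766636$" is returned unchanged.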
+ */ + @Test + void testExtractCSVSchemaInference() throws RetriableAuthenticationException { + InputStream inputStream = getClass().getResourceAsStream("/csv/ids_need_cleansing.csv"); + WorkUnitStatus status = WorkUnitStatus.builder().buffer(inputStream).build(); + + when(sourceState.getProp("ms.derived.fields", new JsonArray().toString())).thenReturn("[{\"name\": \"snapshotDate\", \"formula\": {\"type\": \"epoc\", \"source\": \"currentdate\"}}]"); + // The following line is added intentionally to make sure that column projection is not enabled when user does not specify the output schema + when(state.getProp("ms.csv.column.projection", StringUtils.EMPTY)).thenReturn("0,4,2-3"); + when(state.getProp("ms.csv.column.header", StringUtils.EMPTY)).thenReturn("true"); + when(state.getProp(MSTAGE_ENABLE_CLEANSING.getConfig(), StringUtils.EMPTY)).thenReturn(""); + when(state.getProp("ms.csv.column.header", StringUtils.EMPTY)).thenReturn("true"); + when(state.getPropAsBoolean("ms.csv.column.header")).thenReturn(true); + when(sourceState.getProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), "")).thenReturn(""); + + realHttpSource.getWorkunits(sourceState); + CsvExtractor extractor = new CsvExtractor(state, realHttpSource.getHttpSourceKeys()); + when(multistageConnection.executeFirst(extractor.workUnitStatus)).thenReturn(status); + extractor.setConnection(multistageConnection); + + JsonParser parser = new JsonParser(); + JsonArray schema = parser.parse(extractor.getSchema()).getAsJsonArray(); + Assert.assertEquals(schema.get(0).getAsJsonObject().get("columnName").getAsString(), "id_0"); + Assert.assertEquals(schema.size(), 2); + + // check if schema has been added + String[] row; + row = extractor.readRecord(null); + Assert.assertNotNull(row); + Assert.assertEquals(row[0], "497766636$"); + Assert.assertEquals(row.length, 2); + while (extractor.hasNext()) { + row = extractor.readRecord(null); + Assert.assertNotNull(row); + Assert.assertEquals(row.length, 2); + } + Assert.assertEquals(10, extractor.getCsvExtractorKeys().getProcessedCount()); + } + + /** + * Various tests for column projection + */ + @Test + void testColumnProjection() throws RetriableAuthenticationException { + // testing column projection with schema and ms.csv.column.projection specified + testColumnProjectionHelper("/csv/flat.csv", + "[{\"columnName\":\"col1\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, {\"columnName\":\"col3\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, {\"columnName\":\"col4\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, {\"columnName\":\"col5\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}]", + true, + false, null, "0,4,2-3"); + + // testing column projection with schema and header, but without ms.csv.column.projection specified + testColumnProjectionHelper("/csv/flat.csv", + "[{\"columnName\":\"col1\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, {\"columnName\":\"col3\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, {\"columnName\":\"col4\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, {\"columnName\":\"col5\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}]", + true, + false, null, ""); + + // testing column projection with schema, but without header and ms.csv.column.projection specified + testColumnProjectionHelper("/csv/flat_without_header.csv", + "[{\"columnName\":\"col1\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, 
{\"columnName\":\"col2\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, {\"columnName\":\"col3\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, {\"columnName\":\"col4\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}]", + false, + false, null, ""); + + // testing column projection with schema and header, but schema contains some fields not in the header + testColumnProjectionHelper("/csv/flat.csv", + "[{\"columnName\":\"col11\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, {\"columnName\":\"col3\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, {\"columnName\":\"col4\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, {\"columnName\":\"col5\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}]", + true, + true, new String[][]{{"val1", "val2", "val3", "val4"}, {"val6", "val7", "val8", "val9"}}, ""); + + // testing column projection with schema and header, but headers are in upper case + testColumnProjectionHelper("/csv/flat_uppercase_header.csv", + "[{\"columnName\":\"col1\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, {\"columnName\":\"col3\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, {\"columnName\":\"col4\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, {\"columnName\":\"col5\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}]", + true, + true, new String[][]{{"val1", "val3", "val4", "val5"}, {"val6", "val8", "val9", "val10"}}, ""); + } + + /** + * Utility function to test column projection + * @param filePath csv file path string + * @param outputSchema output schema + * @param hasHeader flag for having header + * @param shouldValidateContent flag for checking content explicitly + * @param contents array of contents for explicit checking + * @param columnProjection explicit column projection string + */ + private void testColumnProjectionHelper(String filePath, String outputSchema, boolean hasHeader, + boolean shouldValidateContent, String[][] contents, String columnProjection) + throws RetriableAuthenticationException { + InputStream inputStream = getClass().getResourceAsStream(filePath); + WorkUnitStatus status = WorkUnitStatus.builder().buffer(inputStream).build(); + when(sourceState.getProp("ms.output.schema", new JsonArray().toString())).thenReturn(outputSchema); + when(state.getProp("ms.csv.column.projection", StringUtils.EMPTY)).thenReturn(columnProjection); + when(state.getProp("ms.csv.column.header", StringUtils.EMPTY)).thenReturn(String.valueOf(hasHeader)); + when(state.getPropAsBoolean("ms.csv.column.header")).thenReturn(hasHeader); + + realHttpSource.getWorkunits(sourceState); + CsvExtractor extractor = new CsvExtractor(state, realHttpSource.getHttpSourceKeys()); + when(multistageConnection.executeFirst(extractor.workUnitStatus)).thenReturn(status); + extractor.setConnection(multistageConnection); + + // check if schema has been added + JsonParser parser = new JsonParser(); + String schema = extractor.getSchema(); + Assert.assertEquals(parser.parse(schema).getAsJsonArray().size(), 4); + String[] row; + int index = 0; + row = extractor.readRecord(null); + Assert.assertNotNull(row); + Assert.assertEquals(row.length, 4); + if(shouldValidateContent) { + Assert.assertEquals(row, contents[index++]); + } + while (extractor.hasNext()) { + row = extractor.readRecord(null); + Assert.assertNotNull(row); + Assert.assertEquals(row.length, 4); + if(shouldValidateContent) { + Assert.assertEquals(row, contents[index++]); + } + } + 
Assert.assertEquals(2, extractor.getCsvExtractorKeys().getProcessedCount()); + } + + /** + * testing the interaction between add derived field with column projection + * column projection defined and the column excluded in the middle + */ + @Test + void testAddDerivedFieldWithColumnProjection1() throws RetriableAuthenticationException { + InputStream inputStream = getClass().getResourceAsStream("/csv/ids_flat.csv"); + WorkUnitStatus status = WorkUnitStatus.builder().buffer(inputStream).build(); + + when(sourceState.getProp("ms.output.schema", new JsonArray().toString())).thenReturn("[{\"columnName\":\"id0\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}},{\"columnName\":\"date\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}},{\"columnName\":\"id2\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}},{\"columnName\":\"id3\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}},{\"columnName\":\"id4\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}},{\"columnName\":\"id5\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}},{\"columnName\":\"id6\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}},{\"columnName\":\"id7\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}},{\"columnName\":\"id8\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}]"); + when(sourceState.getProp("ms.derived.fields", new JsonArray().toString())).thenReturn("[{\"name\": \"date\", \"formula\": {\"type\": \"epoc\", \"source\": \"date\", \"format\": \"yyyy-MM-dd Z\"}}]"); + when(state.getProp("ms.csv.column.projection", StringUtils.EMPTY)).thenReturn("0,2-9"); + when(state.getProp("ms.csv.column.header", StringUtils.EMPTY)).thenReturn("true"); + when(state.getPropAsBoolean("ms.csv.column.header")).thenReturn(true); + + realHttpSource.getWorkunits(sourceState); + CsvExtractor extractor = new CsvExtractor(state, realHttpSource.getHttpSourceKeys()); + extractor.setConnection(multistageConnection); + extractor.setJobKeys(realHttpSource.getJobKeys()); + when(multistageConnection.executeFirst(extractor.workUnitStatus)).thenReturn(status); + + // check if schema has been added + JsonParser parser = new JsonParser(); + String schema = extractor.getSchema(); + Assert.assertEquals(parser.parse(schema).getAsJsonArray().size(), 10); + + int index = 0; + long[] dates = new long[]{1586502000000L, 1586588400000L, 1586674800000L, 1586761200000L, 1586847600000L, + 1586934000000L, 1587020400000L, 1587106800000L, 1587193200000L, 1587279600000L}; + String[] row; + row = extractor.readRecord(null); + Assert.assertNotNull(row); + Assert.assertEquals(row.length, 10); + Assert.assertEquals(Long.parseLong(row[9]), dates[index++]); + while (extractor.hasNext()) { + row = extractor.readRecord(null); + Assert.assertNotNull(row); + Assert.assertEquals(row.length, 10); + Assert.assertEquals(Long.parseLong(row[9]), dates[index++]); + } + Assert.assertEquals(10, extractor.getCsvExtractorKeys().getProcessedCount()); + + } + + /** + * testing the interaction between add derived field with column projection + * header exists and the column excluded in the middle + */ + @Test + void testAddDerivedFieldWithColumnProjection2() throws RetriableAuthenticationException { + InputStream inputStream = getClass().getResourceAsStream("/csv/ids_flat.csv"); + WorkUnitStatus status = WorkUnitStatus.builder().buffer(inputStream).build(); + + when(sourceState.getProp("ms.output.schema", new 
JsonArray().toString())).thenReturn("[{\"columnName\":\"id0\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}},{\"columnName\":\"date\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}},{\"columnName\":\"id2\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}},{\"columnName\":\"id3\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}},{\"columnName\":\"id4\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}},{\"columnName\":\"id5\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}},{\"columnName\":\"id6\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}},{\"columnName\":\"id7\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}},{\"columnName\":\"id8\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}]"); + when(sourceState.getProp("ms.derived.fields", new JsonArray().toString())).thenReturn("[{\"name\": \"date\", \"formula\": {\"type\": \"epoc\", \"source\": \"date\", \"format\": \"yyyy-MM-dd Z\"}}]"); + when(state.getProp("ms.csv.column.header", StringUtils.EMPTY)).thenReturn("true"); + when(state.getPropAsBoolean("ms.csv.column.header")).thenReturn(true); + + realHttpSource.getWorkunits(sourceState); + CsvExtractor extractor = new CsvExtractor(state, realHttpSource.getHttpSourceKeys()); + extractor.setConnection(multistageConnection); + extractor.setJobKeys(realHttpSource.getJobKeys()); + when(multistageConnection.executeFirst(extractor.workUnitStatus)).thenReturn(status); + + // check if schema has been added + JsonParser parser = new JsonParser(); + String schema = extractor.getSchema(); + Assert.assertEquals(parser.parse(schema).getAsJsonArray().size(), 10); + + int index = 0; + long[] dates = new long[]{1586502000000L, 1586588400000L, 1586674800000L, 1586761200000L, 1586847600000L, + 1586934000000L, 1587020400000L, 1587106800000L, 1587193200000L, 1587279600000L}; + String[] row; + row = extractor.readRecord(null); + Assert.assertNotNull(row); + Assert.assertEquals(row.length, 10); + Assert.assertEquals(Long.parseLong(row[9]), dates[index++]); + while (extractor.hasNext()) { + row = extractor.readRecord(null); + Assert.assertNotNull(row); + Assert.assertEquals(row.length, 10); + Assert.assertEquals(Long.parseLong(row[9]), dates[index++]); + } + Assert.assertEquals(10, extractor.getCsvExtractorKeys().getProcessedCount()); + + } + + @Test + void testCSVParser() { + String input = "S1234\u001AS12345\u001ATrue\u001Atest@gmail.com\u001Atest\u001AAtar-תיווך ושיווק נדל\"ן\u001AONLINE"; + InputStream stream = new ByteArrayInputStream(input.getBytes()); + CSVReader reader = new CSVReaderBuilder(new InputStreamReader(stream, StandardCharsets.UTF_8)) + .withCSVParser(new CSVParserBuilder().withSeparator("\u001A".charAt(0)).withQuoteChar("\u0000".charAt(0)).build()) + .build(); + Assert.assertEquals(7,reader.iterator().next().length); + } + + @Test + void testInputStreamCSVReader () throws IOException { + String input = "S1234\u001AS12345\u001ATrue\u001Atest@gmail.com\u001Atest\u001AAtar-תיווך ושיווק נדל\"ן\u001AONLINE"; + InputStreamCSVReader reader = new InputStreamCSVReader(input,"\u001A".charAt(0),"\u0000".charAt(0)); + Assert.assertEquals(7,reader.splitRecord().size()); + } + + @Test + public void testProcessInputStream() throws RetriableAuthenticationException { + Iterator csvIterator = Mockito.mock(Iterator.class); + when(csvExtractorKeys.getCsvIterator()).thenReturn(csvIterator); + CsvExtractor extractor = new CsvExtractor(state, multiStageSource.getJobKeys()); + 
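+ // wire the mocked connection so each scenario below can control what executeNext() returns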
extractor.setConnection(multistageConnection); + extractor.setJobKeys(new JobKeys()); + when(multistageConnection.executeNext(extractor.workUnitStatus)).thenReturn(null); + doNothing().when(state).setWorkingState(WorkUnitState.WorkingState.FAILED); + Assert.assertFalse(extractor.processInputStream(10)); + + when(multistageConnection.executeNext(extractor.workUnitStatus)).thenReturn(workUnitStatus); + Map messages = ImmutableMap.of("contentType", "non-text/csv"); + when(workUnitStatus.getMessages()).thenReturn(messages); + Assert.assertFalse(extractor.processInputStream(10)); + + messages = ImmutableMap.of("contentType", "text/csv", "schema", "test_schema"); + when(workUnitStatus.getMessages()).thenReturn(messages); + when(workUnitStatus.getSchema()).thenReturn(new JsonArray()); + when(workUnitStatus.getBuffer()).thenReturn(null); + Assert.assertFalse(extractor.processInputStream(10)); + + when(workUnitStatus.getBuffer()).thenReturn(new ByteArrayInputStream("test_string".getBytes())); + when(csvExtractorKeys.getCsvIterator()).thenReturn(null); + when(multistageConnection.executeFirst(extractor.workUnitStatus)).thenReturn(workUnitStatus); + when(csvExtractorKeys.getSeparator()).thenReturn(","); + when(csvExtractorKeys.getQuoteCharacter()).thenReturn("\""); + when(csvExtractorKeys.getEscapeCharacter()).thenReturn("u005C"); + Assert.assertTrue(extractor.processInputStream(10)); + + schema.addProperty("someKey", "someValue"); + //when(outputJsonSchema.getSchema()).thenReturn(schema); + doThrow(new RuntimeException()).when(csvExtractorKeys).setCsvIterator(any()); + Assert.assertFalse(extractor.processInputStream(10)); + } + + @Test + public void testExpandColumnProjection() throws Exception { + state = new WorkUnitState(); + WorkUnitState workUnitState = PowerMockito.spy(state); + initExtractor(workUnitState); + + csvExtractor = new CsvExtractor(workUnitState, multiStageSource.getJobKeys()); + Method method = CsvExtractor.class.getDeclaredMethod("expandColumnProjection", String.class, int.class); + method.setAccessible(true); + Assert.assertEquals(method.invoke(csvExtractor, "0,4,2-3", 4).toString(), "[0, 2, 3, 4]"); + + Assert.assertEquals(method.invoke(csvExtractor, "0,4,2-3", 3).toString(), "[0, 2, 3, 4]"); + Assert.assertEquals(method.invoke(csvExtractor, null, 4).toString(), "[]"); + Assert.assertEquals(method.invoke(csvExtractor, "", 4).toString(), "[]"); + Assert.assertEquals(method.invoke(csvExtractor, "-1-4", 4).toString(), "[]"); + Assert.assertEquals(method.invoke(csvExtractor, "2--2", 4).toString(), "[]"); + Assert.assertEquals(method.invoke(csvExtractor, "-2--3", 4).toString(), "[]"); + Assert.assertEquals(method.invoke(csvExtractor, "3-3", 4).toString(), "[]"); + verify(workUnitState, atLeast(7)).setWorkingState(WorkUnitState.WorkingState.FAILED); + + when(workUnitState.getProp(MSTAGE_CSV_COLUMN_HEADER.getConfig(), StringUtils.EMPTY)).thenReturn("false"); + csvExtractor = new CsvExtractor(workUnitState, multiStageSource.getJobKeys()); + method = CsvExtractor.class.getDeclaredMethod("expandColumnProjection", String.class, int.class); + method.setAccessible(true); + Assert.assertEquals(method.invoke(csvExtractor, "3-1", 4).toString(), "[]"); + Assert.assertEquals(method.invoke(csvExtractor, "-1", 4).toString(), "[]"); + Assert.assertEquals(method.invoke(csvExtractor, "abc", 4).toString(), "[]"); + verify(workUnitState, atLeast(3)).setWorkingState(WorkUnitState.WorkingState.FAILED); + } + + @Test + public void testProcessGzipInputStream() throws RetriableAuthenticationException { + 
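+ // content type application/gzip is not accepted as CSV here, so processing should fail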
CsvExtractor extractor = new CsvExtractor(state, multiStageSource.getJobKeys()); + extractor.setConnection(multistageConnection); + extractor.setCsvExtractorKeys(new CsvExtractorKeys()); + extractor.setJobKeys(new JobKeys()); + when(multistageConnection.executeFirst(extractor.workUnitStatus)).thenReturn(workUnitStatus); + when(multistageConnection.executeNext(extractor.workUnitStatus)).thenReturn(workUnitStatus); + + Map messages = ImmutableMap.of("contentType", "application/gzip", "schema", "test_schema"); + when(workUnitStatus.getMessages()).thenReturn(messages); + Assert.assertFalse(extractor.processInputStream(10)); + } + + @Test + public void testAddDerivedFields() throws Exception { + initExtractor(state); + csvExtractor.setTimezone("America/Los_Angeles"); + + // derived field is in unsupported type + Map> derivedFields = ImmutableMap.of("formula", + ImmutableMap.of("type", "non-epoc", "source", "start_time", "format", "yyyy-MM-dd")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + when(csvExtractorKeys.getColumnToIndexMap()).thenReturn(ImmutableMap.of()); + Object[] row = new Object[]{new String[1]}; + String[] res = Whitebox.invokeMethod(csvExtractor, "addDerivedFields", row); + // Since the type is supported, we created a new record with new columns. + // In reality, the work unit will fail when processing the derived field's value. + Assert.assertEquals(res.length, 2); + Assert.assertNull(res[0]); + + // derived field is empty early exit + derivedFields = ImmutableMap.of(); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + when(csvExtractorKeys.getColumnToIndexMap()).thenReturn(ImmutableMap.of()); + row = new Object[]{new String[1]}; + res = Whitebox.invokeMethod(csvExtractor, "addDerivedFields", row); + Assert.assertEquals(res.length, 1); + Assert.assertNull(res[0]); + + // derived field is currentdate + derivedFields = ImmutableMap.of("current_date", + ImmutableMap.of("type", "epoc", "source", "currentdate")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + when(csvExtractorKeys.getColumnToIndexMap()).thenReturn(ImmutableMap.of("a", 0)); + row = new Object[]{new String[]{"a"}}; + res = Whitebox.invokeMethod(csvExtractor, "addDerivedFields", row); + Assert.assertEquals(res.length, 2); + Assert.assertEquals(res[0], "a"); + Assert.assertTrue(Math.abs(Long.parseLong(res[1]) - DateTime.now().getMillis()) < ONE_HOUR_IN_MILLS); + + // derived field is P1D + derivedFields = ImmutableMap.of("current_date", + ImmutableMap.of("type", "epoc", "source", "P1D")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + when(csvExtractorKeys.getColumnToIndexMap()).thenReturn(ImmutableMap.of("a", 0)); + row = new Object[]{new String[]{"a"}}; + res = Whitebox.invokeMethod(csvExtractor, "addDerivedFields", row); + Assert.assertEquals(res.length, 2); + Assert.assertEquals(res[0], "a"); + DateTimeZone timeZone = DateTimeZone.forID("America/Los_Angeles"); + Period period = Period.parse("P1D"); + long p1d = DateTime.now().withZone(timeZone).minus(period).dayOfMonth().roundFloorCopy().getMillis(); + Assert.assertTrue(Math.abs(Long.parseLong(res[1]) - p1d) < ONE_HOUR_IN_MILLS); + + // derived field is in the specified format + derivedFields = ImmutableMap.of("current_date", + ImmutableMap.of("type", "epoc", "source", "start_time", "format", "yyyy-MM-dd")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + when(csvExtractorKeys.getColumnToIndexMap()).thenReturn(ImmutableMap.of("start_time", 0)); + row = new Object[]{new 
String[]{"2020-06-01"}}; + res = Whitebox.invokeMethod(csvExtractor, "addDerivedFields", row); + Assert.assertEquals(res.length, 2); + Assert.assertEquals(res[0], "2020-06-01"); + DateTimeFormatter datetimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd"); + Assert.assertEquals(Long.parseLong(res[1]), datetimeFormatter.parseDateTime("2020-06-01").getMillis()); + + // derived field is NOT in the specified format + derivedFields = ImmutableMap.of("current_date", + ImmutableMap.of("type", "epoc", "source", "start_time", "format", "yyyy-MM-dd")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + when(csvExtractorKeys.getColumnToIndexMap()).thenReturn(ImmutableMap.of("start_time", 0)); + row = new Object[]{new String[]{"notdatatime"}}; + res = Whitebox.invokeMethod(csvExtractor, "addDerivedFields", row); + // Since the type is supported, we created a new record with new columns. + // In reality, the work unit will fail when processing the derived field's value. + Assert.assertEquals(res.length, 2); + Assert.assertEquals(res[0], "notdatatime"); + Assert.assertEquals(res[1], ""); + + // derived fields are from variables + JsonObject parameters = new JsonObject(); + parameters.addProperty("dateString", "2019-11-01 12:00:00"); + parameters.addProperty("someInteger", 123456); + parameters.addProperty("someNumber", 123.456); + parameters.addProperty("someEpoc", 1601038688000L); + csvExtractor.currentParameters = parameters; + + derivedFields = ImmutableMap.of("dateString", + ImmutableMap.of("type", "string", "source", "{{dateString}}"), + "someInteger", + ImmutableMap.of("type", "integer", "source", "{{someInteger}}"), + "someEpoc", + ImmutableMap.of("type", "epoc", "source", "{{someEpoc}}"), + "someNumber", + ImmutableMap.of("type", "number", "source", "{{someNumber}}")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + when(csvExtractorKeys.getColumnToIndexMap()).thenReturn(ImmutableMap.of("start_time", 0)); + row = new Object[]{new String[]{"2020-06-01"}}; + res = Whitebox.invokeMethod(csvExtractor, "addDerivedFields", row); + Assert.assertEquals(res.length, 5); + Assert.assertEquals(res[0], "2020-06-01"); + Assert.assertEquals(res[1], "2019-11-01 12:00:00"); + Assert.assertEquals(res[2], "123456"); + Assert.assertEquals(res[3], "1601038688000"); + Assert.assertEquals(res[4], "123.456"); + } + + @Test + public void testSkipRowAndSaveHeader() throws Exception { + initExtractor(state); + String[] someData = new String[]{"some_date"}; + String[] moreData = new String[]{"more_data"}; + List rows = ImmutableList.of(someData, moreData); + CsvExtractorKeys csvExtractorKeys = new CsvExtractorKeys(); + CsvExtractorKeys spy = spy(csvExtractorKeys); + csvExtractor.setCsvExtractorKeys(spy); + when(spy.getRowsToSkip()).thenReturn(2); + when(spy.getColumnHeader()).thenReturn(false); + Whitebox.invokeMethod(csvExtractor, "skipRowAndSaveHeader", rows.iterator()); + verify(spy, atMost(0)).setHeaderRow(someData); + verify(spy, atMost(0)).setHeaderRow(moreData); + } + + @Test + public void testInferSchemaWithSample() throws Exception { + initExtractor(state); + + String[] someData = new String[]{"some_date"}; + String[] moreData = new String[]{"more_data"}; + List rows = ImmutableList.of(someData, moreData); + csvExtractorKeys = new CsvExtractorKeys(); + CsvExtractorKeys spy = spy(csvExtractorKeys); + csvExtractor.setCsvExtractorKeys(spy); + when(spy.getRowsToSkip()).thenReturn(0); + Deque deque = new LinkedList(); + when(spy.getSampleRows()).thenReturn(deque); + + 
when(spy.getHeaderRow()).thenReturn(new String[]{"col1", "col2"}); + Assert.assertEquals( + Whitebox.invokeMethod(csvExtractor, "inferSchemaWithSample", rows.iterator()).toString(), + "[{\"columnName\":\"col0\",\"isNullable\":false,\"dataType\":{\"type\":\"string\"}}]"); + + when(spy.getHeaderRow()).thenReturn(null); + Whitebox.invokeMethod(csvExtractor, "inferSchemaWithSample", rows.iterator()); + Assert.assertEquals( + Whitebox.invokeMethod(csvExtractor, "inferSchemaWithSample", rows.iterator()).toString(), + "[{\"columnName\":\"col0\",\"isNullable\":false,\"dataType\":{\"type\":\"string\"}}]" + ); + } + + @Test + public void testSetRowFilter() { + JsonSchemaBasedFilter filter = Mockito.mock(JsonSchemaBasedFilter.class); + JsonArray schema = new JsonArray(); + csvExtractor.rowFilter = filter; + csvExtractor.setRowFilter(schema); + + csvExtractor.rowFilter = null; + when(state.getProp(MultistageProperties.MSTAGE_ENABLE_SCHEMA_BASED_FILTERING.getConfig(), StringUtils.EMPTY)).thenReturn("false"); + csvExtractor.setRowFilter(new JsonArray()); + Assert.assertNull(csvExtractor.rowFilter); + } + + @Test + public void testAddParsedCSVData() throws Exception { + initExtractor(state); + Method method = CsvExtractor.class.getDeclaredMethod("addParsedCSVData", String.class, String.class, JsonObject.class); + method.setAccessible(true); + method.invoke(csvExtractor, "key1", "true", schema); + Assert.assertEquals(schema.get("key1").getAsBoolean(), true); + + method.invoke(csvExtractor, "key2", "false", schema); + Assert.assertEquals(schema.get("key2").getAsBoolean(), false); + + method.invoke(csvExtractor, "key3", "1.234F", schema); + Assert.assertEquals(schema.get("key3").getAsFloat(), 1.234F); + + method.invoke(csvExtractor, "key4", "something else", schema); + Assert.assertEquals(schema.get("key4").getAsString(), "something else"); + } + + private void initExtractor(WorkUnitState state) { + when(state.getProp(MSTAGE_CSV_COLUMN_HEADER.getConfig(), StringUtils.EMPTY)).thenReturn("true"); + when(state.getPropAsBoolean(MSTAGE_CSV_COLUMN_HEADER.getConfig())).thenReturn(true); + when(state.getPropAsInt(MSTAGE_CSV_SKIP_LINES.getConfig(), 0)).thenReturn(2); + when(state.getProp(MSTAGE_CSV_SEPARATOR.getConfig(), StringUtils.EMPTY)).thenReturn(","); + when(state.getProp(MSTAGE_CSV_QUOTE_CHARACTER.getConfig(), StringUtils.EMPTY)).thenReturn("\""); + when(state.getProp(MSTAGE_CSV_ESCAPE_CHARACTER.getConfig(), StringUtils.EMPTY)).thenReturn("u005C"); + when(state.getProp(MSTAGE_EXTRACT_PREPROCESSORS_PARAMETERS.getConfig(), new JsonObject().toString())).thenReturn(StringUtils.EMPTY); + when(state.getProp(MSTAGE_EXTRACT_PREPROCESSORS.getConfig(), StringUtils.EMPTY)).thenReturn(StringUtils.EMPTY); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/extractor/FileDumpExtractorTest.java b/dil/src/test/java/com/linkedin/dil/extractor/FileDumpExtractorTest.java new file mode 100644 index 0000000..36f4d5c --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/extractor/FileDumpExtractorTest.java @@ -0,0 +1,80 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.extractor; + +import java.io.File; +import java.io.FileFilter; +import java.io.InputStream; +import java.util.List; +import org.apache.gobblin.configuration.SourceState; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.connection.MultistageConnection; +import com.linkedin.dil.exception.RetriableAuthenticationException; +import com.linkedin.dil.source.HttpSource; +import com.linkedin.dil.util.WorkUnitStatus; +import org.apache.gobblin.runtime.JobState; +import org.apache.gobblin.source.workunit.WorkUnit; +import org.mockito.Mockito; +import org.testng.Assert; +import org.testng.annotations.Test; + +import static org.mockito.Mockito.*; + + +/** + * Test FileDumpExtractor under following scenarios: + * + * Scenario 1: download a file and save to /tmp + */ +@Test +public class FileDumpExtractorTest { + + /** + * Test for scenario 1: download a file and save to /tmp + * + * Input: a mocked InputStream from a resource file + * Output: a non-empty schema, and a file saved in /tmp + */ + @Test + void testSaveCsvFile() throws RetriableAuthenticationException { + InputStream inputStream = getClass().getResourceAsStream("/csv/common-crawl-files.csv"); + WorkUnitStatus status = WorkUnitStatus.builder().buffer(inputStream).build(); + + SourceState sourceState = new SourceState(); + HttpSource source = new HttpSource(); + source.getWorkunits(sourceState); + + List wus = source.getWorkunits(new SourceState()); + WorkUnitState state = new WorkUnitState(wus.get(0), new JobState()); + state.setProp("fs.uri", "file://localhost/"); + state.setProp("state.store.fs.uri", "file://localhost"); + state.setProp("data.publisher.final.dir", "/tmp/gobblin/job-output"); + state.setProp("ms.extractor.target.file.name", "common-crawl-files.csv"); + + FileDumpExtractor extractor = new FileDumpExtractor(state, source.getHttpSourceKeys()); + MultistageConnection connection = Mockito.mock(MultistageConnection.class); + extractor.setConnection(connection); + extractor.workUnitStatus = WorkUnitStatus.builder().build(); + when(connection.executeFirst(extractor.workUnitStatus)).thenReturn(status); + + String schema = extractor.getSchema(); + Assert.assertNotEquals(schema.length(), 0); + + extractor.readRecord(null); + + File[] dbFiles = new File("/tmp/gobblin/job-output").listFiles(new FileFilter() { + @Override + public boolean accept(File pathname) { + return pathname.isFile() && pathname.toString().matches(".*common-crawl-files.csv"); + } + }); + Assert.assertNotEquals(dbFiles.length, 0); + if (dbFiles != null) { + for (File file : dbFiles) { + file.delete(); + } + } + } +} diff --git a/dil/src/test/java/com/linkedin/dil/extractor/FileDumpExtractorTest2.java b/dil/src/test/java/com/linkedin/dil/extractor/FileDumpExtractorTest2.java new file mode 100644 index 0000000..80cbe0d --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/extractor/FileDumpExtractorTest2.java @@ -0,0 +1,199 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.extractor; + +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.List; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.tuple.MutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.gobblin.configuration.SourceState; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.connection.MultistageConnection; +import com.linkedin.dil.exception.RetriableAuthenticationException; +import com.linkedin.dil.keys.FileDumpExtractorKeys; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.source.MultistageSource; +import com.linkedin.dil.util.VariableUtils; +import com.linkedin.dil.util.WorkUnitStatus; +import org.apache.gobblin.source.workunit.WorkUnit; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.mockito.Mockito; +import org.powermock.api.mockito.PowerMockito; +import org.powermock.core.classloader.annotations.PrepareForTest; +import org.powermock.modules.testng.PowerMockTestCase; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static com.linkedin.dil.configuration.MultistageProperties.*; +import static org.mockito.Mockito.*; + + +/** + * Test FileDumpExtractor under following scenarios: + * + * Scenario 1: download a file and save to /tmp + */ +@PrepareForTest({VariableUtils.class, FileSystem.class}) +public class FileDumpExtractorTest2 extends PowerMockTestCase { + + private final static String DATA_SET_URN_KEY = "com.linkedin.SeriesCollection"; + private final static String ACTIVATION_PROP = "{\"name\": \"survey\", \"type\": \"unit\", \"units\": \"id1,id2\"}"; + private final static String DATA_FINAL_DIR = "/jobs/testUser/gobblin/useCaseRoot"; + private final static String FILE_PERMISSION = "775"; + private final static long WORK_UNIT_START_TIME_KEY = 1590994800000L; + + private WorkUnitState state; + private MultistageSource source; + private WorkUnit workUnit; + private FileDumpExtractorKeys fileDumpExtractorKeys; + private WorkUnitStatus workUnitStatus; + private JobKeys jobKeys; + private FileDumpExtractor fileDumpExtractor; + private MultistageConnection multistageConnection; + + @BeforeMethod + public void setUp() { + state = Mockito.mock(WorkUnitState.class); + source = Mockito.mock(MultistageSource.class); + + List wus = new MultistageSource().getWorkunits(new SourceState()); + workUnit = wus.get(0); + workUnit.setProp(MultistageProperties.DATASET_URN_KEY.getConfig(), DATA_SET_URN_KEY); + + fileDumpExtractorKeys = Mockito.mock(FileDumpExtractorKeys.class); + workUnitStatus = Mockito.mock(WorkUnitStatus.class); + jobKeys = Mockito.mock(JobKeys.class); + + when(state.getProp(MSTAGE_ACTIVATION_PROPERTY.getConfig(), new JsonObject().toString())).thenReturn(ACTIVATION_PROP); + when(state.getPropAsLong(MSTAGE_WORKUNIT_STARTTIME_KEY.getConfig(), 0L)).thenReturn(WORK_UNIT_START_TIME_KEY); + when(state.getWorkunit()).thenReturn(workUnit); + when(state.getProp(DATA_PUBLISHER_FINAL_DIR.getConfig(), StringUtils.EMPTY)).thenReturn(DATA_FINAL_DIR); + when(state.getProp(MSTAGE_EXTRACTOR_TARGET_FILE_PERMISSION.getConfig(), 
StringUtils.EMPTY)).thenReturn(FILE_PERMISSION); + + fileDumpExtractor = new FileDumpExtractor(state, source.getJobKeys()); + fileDumpExtractor.setFileDumpExtractorKeys(fileDumpExtractorKeys); + fileDumpExtractor.jobKeys = jobKeys; + + multistageConnection = Mockito.mock(MultistageConnection.class); + fileDumpExtractor.setConnection(multistageConnection); + } + + /** + * Test FileDumpExtractor Constructor with a happy path + */ + @Test + public void testFileDumpExtractorConstructor() { + FileDumpExtractor fileDumpExtractor = new FileDumpExtractor(state, source.getJobKeys()); + Assert.assertEquals(fileDumpExtractor.getFileDumpExtractorKeys().getFileName(), StringUtils.EMPTY); + Assert.assertEquals(fileDumpExtractor.getFileDumpExtractorKeys().getFileWritePermissions(), FILE_PERMISSION); + Assert.assertEquals(fileDumpExtractor.getFileDumpExtractorKeys().getFileDumpLocation(), DATA_FINAL_DIR); + Assert.assertEquals(fileDumpExtractor.getFileDumpExtractorKeys().getCurrentFileNumber(), 0); + } + + /** + * Test FileDumpExtractor Constructor when a RuntimeException is thrown + */ + @Test(expectedExceptions = RuntimeException.class) + public void testFileDumpExtractorConstructorWithException() { + doThrow(new RuntimeException()).when(state).getProp(DATA_PUBLISHER_FINAL_DIR.getConfig(), StringUtils.EMPTY); + new FileDumpExtractor(state, source.getJobKeys()); + } + + /** + * Test readRecord + */ + @Test(expectedExceptions = RuntimeException.class) + public void testReadRecord() throws IOException, RetriableAuthenticationException { + when(jobKeys.getPaginationFields()).thenReturn(new HashMap<>()); + when(jobKeys.getPaginationInitValues()).thenReturn(new HashMap<>()); + when(jobKeys.isPaginationEnabled()).thenReturn(false); + when(multistageConnection.executeNext(fileDumpExtractor.workUnitStatus)).thenReturn(workUnitStatus); + when(workUnitStatus.getBuffer()).thenReturn(new ByteArrayInputStream("test_string".getBytes())); + when(fileDumpExtractorKeys.getCurrentFileNumber()).thenReturn(Long.valueOf(10)); + when(fileDumpExtractorKeys.getFileName()).thenReturn("file_name"); + when(fileDumpExtractorKeys.getFileDumpLocation()).thenReturn("dir"); + when(fileDumpExtractorKeys.getFileWritePermissions()).thenReturn("775"); + when(fileDumpExtractorKeys.getCurrentFileNumber()).thenReturn(Long.valueOf(1)); + PowerMockito.mockStatic(FileSystem.class); + FSDataOutputStream out = Mockito.mock(FSDataOutputStream.class); + PowerMockito.when(FileSystem.create(any(), any(), any())).thenReturn(out); + PowerMockito.doNothing().when(out).flush(); + PowerMockito.doNothing().when(out).close(); + + Assert.assertNull(fileDumpExtractor.readRecord("")); + + when(jobKeys.isPaginationEnabled()).thenReturn(true); + + doThrow(new RuntimeException()).when(fileDumpExtractorKeys).incrCurrentFileNumber(); + fileDumpExtractor.readRecord(""); + } + + /** + * Test processInputStream with two scenarios + * 1: Happy path + * 2: Invalid file name provided + * + * @throws IOException + */ + @Test + public void testProcessInputStream() throws RetriableAuthenticationException { + // replace mocked source key with default source key + fileDumpExtractor.jobKeys = new JobKeys(); + + when(fileDumpExtractorKeys.getActivationParameters()).thenReturn(new JsonObject()); + when(fileDumpExtractorKeys.getPayloads()).thenReturn(new JsonArray()); + when(multistageConnection.executeNext(fileDumpExtractor.workUnitStatus)).thenReturn(null); + Assert.assertFalse(fileDumpExtractor.processInputStream(10)); + + WorkUnitStatus unitStatus = 
Mockito.mock(WorkUnitStatus.class); + when(multistageConnection.executeNext(fileDumpExtractor.workUnitStatus)).thenReturn(unitStatus); + fileDumpExtractor.getFileDumpExtractorKeys().setFileName(StringUtils.EMPTY); + Assert.assertFalse(fileDumpExtractor.processInputStream(10)); + + when(unitStatus.getBuffer()).thenReturn(null); + fileDumpExtractor.getFileDumpExtractorKeys().setFileName("test_file_name"); + Assert.assertFalse(fileDumpExtractor.processInputStream(10)); + + JobKeys keys = Mockito.mock(JobKeys.class); + when(source.getJobKeys()).thenReturn(keys); + when(keys.isPaginationEnabled()).thenReturn(true); + InputStream input = new ByteArrayInputStream("test_string".getBytes()); + when(unitStatus.getBuffer()).thenReturn(input); + Assert.assertFalse(fileDumpExtractor.processInputStream(10)); + } + /** + * Test getFileName with two scenarios + * 1: Happy path + * 2: Unresolved placeholder + */ + @Test + public void testGetFileName() throws Exception { + PowerMockito.mockStatic(VariableUtils.class); + String fileNameTemplate = "testFileTemplate"; + when(state.getProp(MSTAGE_EXTRACTOR_TARGET_FILE_NAME.getConfig(), StringUtils.EMPTY)).thenReturn(fileNameTemplate); + String fileName = IOUtils.toString(this.getClass().getResourceAsStream("/other/sample-data-include-long-file-name.txt"), StandardCharsets.UTF_8.name()); + String filePath = String.join("/", "dir", fileName); + Pair pair = new MutablePair<>(filePath, new JsonObject()); + PowerMockito.when(VariableUtils.replaceWithTracking(any(), any())).thenReturn(pair); + FileDumpExtractorKeys extractorKeys = new FileDumpExtractor(state, source.getJobKeys()).getFileDumpExtractorKeys(); + Assert.assertEquals(extractorKeys.getFileName(), String.join("/", "dir", fileName.substring(0, 255 - 1))); + + PowerMockito.doThrow(new UnsupportedEncodingException()).when(VariableUtils.class, "replaceWithTracking", any(), any()); + extractorKeys = new FileDumpExtractor(state, source.getJobKeys()).getFileDumpExtractorKeys(); + Assert.assertEquals(extractorKeys.getFileName(), fileNameTemplate); + } +} \ No newline at end of file diff --git a/dil/src/test/java/com/linkedin/dil/extractor/JsonExtractorTest.java b/dil/src/test/java/com/linkedin/dil/extractor/JsonExtractorTest.java new file mode 100644 index 0000000..05dae6d --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/extractor/JsonExtractorTest.java @@ -0,0 +1,561 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
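+// Covers JsonExtractor record reading, pagination and session-state handling, and JSON path edge cases.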
+ +package com.linkedin.dil.extractor; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonNull; +import com.google.gson.JsonObject; +import com.google.gson.JsonPrimitive; +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import org.apache.commons.lang.StringUtils; +import org.apache.gobblin.configuration.SourceState; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.connection.MultistageConnection; +import com.linkedin.dil.exception.RetriableAuthenticationException; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.keys.JsonExtractorKeys; +import com.linkedin.dil.source.HttpSource; +import com.linkedin.dil.source.MultistageSource; +import com.linkedin.dil.util.JsonUtils; +import com.linkedin.dil.util.ParameterTypes; +import com.linkedin.dil.util.SchemaBuilder; +import com.linkedin.dil.util.WorkUnitStatus; +import org.apache.gobblin.runtime.JobState; +import org.apache.gobblin.source.workunit.WorkUnit; +import org.joda.time.DateTime; +import org.mockito.Mockito; +import org.powermock.reflect.Whitebox; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static com.linkedin.dil.configuration.MultistageProperties.*; +import static org.mockito.Mockito.*; + + +@Test +public class JsonExtractorTest { + + // Matches to the total count field in the response json + private static final int TOTAL_COUNT = 2741497; + private final static String DATA_SET_URN_KEY = "com.apache.SeriesCollection"; + private final static String ACTIVATION_PROP = "{\"name\": \"survey\", \"type\": \"unit\", \"units\": \"id1,id2\"}"; + private final static long WORKUNIT_STARTTIME_KEY = 1590994800000L; + private final static long ONE_HOUR_IN_MILLS = 3600000L; + + private Gson gson; + private JobKeys jobKeys; + private WorkUnit workUnit; + private WorkUnitState state; + private WorkUnitStatus workUnitStatus; + private MultistageSource source; + private JsonExtractorKeys jsonExtractorKeys; + private JsonExtractor jsonExtractor; + private MultistageConnection multistageConnection; + + @BeforeMethod + public void setUp() throws RetriableAuthenticationException { + gson = new Gson(); + source = Mockito.mock(MultistageSource.class); + jobKeys = Mockito.mock(JobKeys.class); + + List wus = new MultistageSource().getWorkunits(new SourceState()); + workUnit = wus.get(0); + + workUnitStatus = Mockito.mock(WorkUnitStatus.class); + state = Mockito.mock(WorkUnitState.class); + when(state.getProp(MSTAGE_ACTIVATION_PROPERTY.getConfig(), new JsonObject().toString())).thenReturn(ACTIVATION_PROP); + when(state.getPropAsLong(MSTAGE_WORKUNIT_STARTTIME_KEY.getConfig(), 0L)).thenReturn(WORKUNIT_STARTTIME_KEY); + when(state.getWorkunit()).thenReturn(workUnit); + workUnit.setProp(MultistageProperties.DATASET_URN_KEY.getConfig(), DATA_SET_URN_KEY); + when(source.getJobKeys()).thenReturn(jobKeys); + when(jobKeys.getSourceParameters()).thenReturn(new JsonArray()); + when(jobKeys.getPaginationInitValues()).thenReturn(new HashMap<>()); + when(jobKeys.getSchemaCleansingPattern()).thenReturn("(\\s|\\$|@)"); + when(jobKeys.getSchemaCleansingReplacement()).thenReturn("_"); + 
when(jobKeys.getSchemaCleansingNullable()).thenReturn(false); + jsonExtractorKeys = Mockito.mock(JsonExtractorKeys.class); + jsonExtractor = new JsonExtractor(state, source.getJobKeys()); + jsonExtractor.setJsonExtractorKeys(jsonExtractorKeys); + jsonExtractor.jobKeys = jobKeys; + + multistageConnection = Mockito.mock(MultistageConnection.class); + when(multistageConnection.executeFirst(workUnitStatus)).thenReturn(workUnitStatus); + when(multistageConnection.executeNext(workUnitStatus)).thenReturn(workUnitStatus); + jsonExtractor.setConnection(multistageConnection); + } + + @Test + public void testReadRecord() throws RetriableAuthenticationException { + when(jobKeys.getTotalCountField()).thenReturn("totalRecords"); + + when(jsonExtractorKeys.getActivationParameters()).thenReturn(new JsonObject()); + when(jsonExtractorKeys.getTotalCount()).thenReturn(Long.valueOf(0)); + when(jsonExtractorKeys.getJsonElementIterator()).thenReturn(null); + when(jsonExtractorKeys.getPayloads()).thenReturn(new JsonArray()); + Assert.assertNull(jsonExtractor.readRecord(new JsonObject())); + + when(jobKeys.getTotalCountField()).thenReturn(StringUtils.EMPTY); + when(workUnitStatus.getMessages()).thenReturn(ImmutableMap.of("contentType", "application/json")); + when(multistageConnection.executeNext(jsonExtractor.workUnitStatus)).thenReturn(workUnitStatus); + InputStream stream = new ByteArrayInputStream("{\"key\":\"value\"}".getBytes()); + when(workUnitStatus.getBuffer()).thenReturn(stream); + when(jobKeys.getDataField()).thenReturn(StringUtils.EMPTY); + when(jobKeys.getSessionKeyField()).thenReturn(new JsonObject()); + JsonArray outputSchema = SchemaBuilder.fromJsonData("{\"key\":\"value\"}").buildAltSchema().getAsJsonArray(); + when(jobKeys.getOutputSchema()).thenReturn(outputSchema); + when(jsonExtractorKeys.getCurrentPageNumber()).thenReturn(Long.valueOf(0)); + when(jsonExtractorKeys.getSessionKeyValue()).thenReturn("session_key"); + workUnit.setProp(MultistageProperties.DATASET_URN_KEY.getConfig(), "com.linkedin.xxxxx.UserGroups"); + Iterator jsonElementIterator = ImmutableList.of().iterator(); + when(jsonExtractorKeys.getJsonElementIterator()).thenReturn(jsonElementIterator); + when(jsonExtractorKeys.getProcessedCount()).thenReturn(Long.valueOf(0)); + when(jsonExtractorKeys.getTotalCount()).thenReturn(Long.valueOf(20)); + Assert.assertNull(jsonExtractor.readRecord(new JsonObject())); + + JsonObject item = gson.fromJson("{\"key\":\"value\"}", JsonObject.class); + jsonElementIterator = ImmutableList.of(item).iterator(); + when(jsonExtractorKeys.getJsonElementIterator()).thenReturn(jsonElementIterator); + when(jsonExtractorKeys.getProcessedCount()).thenReturn(Long.valueOf(10)); + when(jobKeys.getEncryptionField()).thenReturn(null); + when(jobKeys.isEnableCleansing()).thenReturn(true); + when(jobKeys.getSchemaCleansingPattern()).thenReturn("(\\s|\\$|@)"); + when(jobKeys.getSchemaCleansingReplacement()).thenReturn("_"); + when(jobKeys.getSchemaCleansingNullable()).thenReturn(false); + Assert.assertEquals(jsonExtractor.readRecord(new JsonObject()).toString(), "{\"key\":\"value\"}"); + + jsonElementIterator = ImmutableList.of().iterator(); + when(jsonExtractorKeys.getJsonElementIterator()).thenReturn(jsonElementIterator); + when(jsonExtractorKeys.getProcessedCount()).thenReturn(Long.valueOf(10)); + when(jsonExtractorKeys.getTotalCount()).thenReturn(Long.valueOf(20)); + Assert.assertNull(jsonExtractor.readRecord(new JsonObject())); + + when(jsonExtractorKeys.getTotalCount()).thenReturn(Long.valueOf(10)); + 
when(jobKeys.isPaginationEnabled()).thenReturn(true); + when(jobKeys.isSessionStateEnabled()).thenReturn(false); + Assert.assertNull(jsonExtractor.readRecord(new JsonObject())); + + when(jobKeys.isPaginationEnabled()).thenReturn(false); + Assert.assertNull(jsonExtractor.readRecord(new JsonObject())); + + when(jobKeys.isPaginationEnabled()).thenReturn(true); + when(jobKeys.isSessionStateEnabled()).thenReturn(true); + when(jobKeys.getSessionStateCondition()).thenReturn("success|ready"); + when(jsonExtractorKeys.getSessionKeyValue()).thenReturn("success"); + Assert.assertNull(jsonExtractor.readRecord(new JsonObject())); + } + + @Test + public void testProcessInputStream() throws RetriableAuthenticationException { + // replaced mock'ed work unit status with default work unit status + jsonExtractor.workUnitStatus = WorkUnitStatus.builder().build(); + + when(jsonExtractorKeys.getActivationParameters()).thenReturn(new JsonObject()); + when(jsonExtractorKeys.getPayloads()).thenReturn(new JsonArray()); + when(jobKeys.getTotalCountField()).thenReturn(StringUtils.EMPTY); + Assert.assertFalse(jsonExtractor.processInputStream(10)); + + JsonElement item = new JsonObject(); + Iterator jsonElementIterator = ImmutableList.of(item).iterator(); + when(jsonExtractorKeys.getJsonElementIterator()).thenReturn(jsonElementIterator); + when(multistageConnection.executeNext(jsonExtractor.workUnitStatus)).thenReturn(null); + Assert.assertFalse(jsonExtractor.processInputStream(0)); + + when(workUnitStatus.getMessages()).thenReturn(ImmutableMap.of("contentType", "multipart/form-data")); + when(multistageConnection.executeNext(jsonExtractor.workUnitStatus)).thenReturn(workUnitStatus); + Assert.assertFalse(jsonExtractor.processInputStream(0)); + + when(workUnitStatus.getMessages()).thenReturn(null); + when(jobKeys.hasSourceSchema()).thenReturn(true); + Assert.assertFalse(jsonExtractor.processInputStream(0)); + + when(jobKeys.hasSourceSchema()).thenReturn(false); + when(jobKeys.hasOutputSchema()).thenReturn(true); + Assert.assertFalse(jsonExtractor.processInputStream(0)); + } + + @Test + public void testProcessInputStream2() { + jsonExtractor.setJsonExtractorKeys(new JsonExtractorKeys()); + jsonExtractor.setJobKeys(new JobKeys()); + + when(workUnitStatus.getMessages()).thenReturn(null); + when(workUnitStatus.getBuffer()).thenReturn(null); + Assert.assertFalse(jsonExtractor.processInputStream(0)); + + InputStream stream = new ByteArrayInputStream("null".getBytes()); + when(workUnitStatus.getBuffer()).thenReturn(stream); + Assert.assertFalse(jsonExtractor.processInputStream(0)); + + stream = new ByteArrayInputStream("primitive_string".getBytes()); + when(workUnitStatus.getBuffer()).thenReturn(stream); + Assert.assertFalse(jsonExtractor.processInputStream(0)); + } + + @Test + public void testGetElementByJsonPathWithEdgeCases() { + JsonObject row = new JsonObject(); + String jsonPath = StringUtils.EMPTY; + Assert.assertEquals(JsonUtils.get(row, jsonPath), JsonNull.INSTANCE); + + jsonPath = "key"; + Assert.assertEquals(JsonUtils.get(null, jsonPath), JsonNull.INSTANCE); + + row = gson.fromJson("{\"key\":\"some_primitive_value\"}", JsonObject.class); + jsonPath = "key.1"; + Assert.assertEquals(JsonUtils.get(row, jsonPath), JsonNull.INSTANCE); + + row = gson.fromJson("{\"key\":[\"some_primitive_value\"]}", JsonObject.class); + jsonPath = "key.a"; + Assert.assertEquals(JsonUtils.get(row, jsonPath), JsonNull.INSTANCE); + + jsonPath = "key.3"; + Assert.assertEquals(JsonUtils.get(row, jsonPath), JsonNull.INSTANCE); + } + + /** + * 
Test Extractor shall stop the session when total count of records is met + */ + @Test + void testStopConditionTotalCountMet() throws RetriableAuthenticationException { + InputStream inputStream = getClass().getResourceAsStream("/json/last-page-with-data.json"); + WorkUnitStatus status = WorkUnitStatus.builder().buffer(inputStream).build(); + status.setTotalCount(TOTAL_COUNT); + + SourceState sourceState = mock(SourceState.class); + when(sourceState.getProp("ms.data.field", "")).thenReturn("items"); + when(sourceState.getProp("ms.total.count.field", "")).thenReturn("totalResults"); + when(sourceState.getProp("ms.pagination", "")).thenReturn("{\"fields\": [\"offset\", \"limit\"], \"initialvalues\": [0, 5000]}"); + when(sourceState.getProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), "")).thenReturn(""); + MultistageSource source = new HttpSource(); + List<WorkUnit> wus = source.getWorkunits(sourceState); + WorkUnitState state = new WorkUnitState(wus.get(0), new JobState()); + + JsonExtractor extractor = new JsonExtractor(state, source.getJobKeys()); + extractor.setConnection(multistageConnection); + extractor.getJsonExtractorKeys().setTotalCount(TOTAL_COUNT); + + extractor.workUnitStatus = WorkUnitStatus.builder().build(); + when(multistageConnection.executeFirst(extractor.workUnitStatus)).thenReturn(status); + + Assert.assertFalse(extractor.processInputStream(TOTAL_COUNT)); + // If total count not reached, should not fail + Assert.assertTrue(extractor.processInputStream(TOTAL_COUNT-1)); + } + + @Test + public void testAddDerivedFields() throws Exception { + Map<String, Map<String, String>> derivedFields = ImmutableMap.of("formula", + ImmutableMap.of("type", "non-epoc", "source", "start_time", "format", "yyyy-MM-dd")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + jsonExtractor.setTimezone("America/Los_Angeles"); + JsonObject row = new JsonObject(); + JsonObject pushDowns = new JsonObject(); + JsonObject actual; + when(jsonExtractorKeys.getPushDowns()).thenReturn(pushDowns); + Assert.assertEquals(Whitebox.invokeMethod(jsonExtractor, "addDerivedFields", row).toString(), "{}"); + + derivedFields = ImmutableMap.of("formula", + ImmutableMap.of("type", "epoc", "source", "currentdate")); + pushDowns.addProperty("non-formula", "testValue"); + row.addProperty("start_time", "2020-06-01"); + when(jsonExtractorKeys.getPushDowns()).thenReturn(pushDowns); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + actual = Whitebox.invokeMethod(jsonExtractor, "addDerivedFields", row); + Assert.assertEquals(actual.entrySet().size(), 2); + Assert.assertTrue(actual.has("formula")); + Assert.assertTrue( + Math.abs(Long.parseLong(actual.get("formula").toString()) - DateTime.now().getMillis()) + < ONE_HOUR_IN_MILLS); + + derivedFields = ImmutableMap.of("formula", + ImmutableMap.of("type", "epoc", "source", "start_time", "format", "yyyy-MM-dd")); + pushDowns.addProperty("non-formula", "testValue"); + row.addProperty("start_time", "2020-06-01"); + when(jsonExtractorKeys.getPushDowns()).thenReturn(pushDowns); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + actual = Whitebox.invokeMethod(jsonExtractor, "addDerivedFields", row); + Assert.assertEquals(actual.entrySet().size(), 2); + Assert.assertTrue(actual.has("formula")); + Assert.assertEquals(actual.get("start_time").toString(), "\"2020-06-01\""); + + derivedFields = ImmutableMap.of("formula", + ImmutableMap.of("type", "string", "source", "P0D", "format", "yyyy-MM-dd")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); +
pushDowns.addProperty("non-formula", "testValue"); + Assert.assertEquals( + Whitebox.invokeMethod(jsonExtractor, "addDerivedFields", row).toString(), + "{\"start_time\":\"2020-06-01\",\"formula\":\"\"}"); + + derivedFields = ImmutableMap.of("formula", + ImmutableMap.of("type", "epoc", "source", "P0D", "format", "yyyy-MM-dd")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + pushDowns.addProperty("non-formula", "testValue"); + actual = Whitebox.invokeMethod(jsonExtractor, "addDerivedFields", row); + Assert.assertEquals(actual.entrySet().size(), 2); + Assert.assertTrue(actual.has("formula")); + Assert.assertEquals(actual.get("start_time").toString(), "\"2020-06-01\""); + + derivedFields = ImmutableMap.of("formula", + ImmutableMap.of("type", "epoc", "source", "start_time", "format", "yyyy-MM-dd")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + pushDowns.addProperty("non-formula", "testValue"); + row.addProperty("start_time", "1592809200000"); + actual = Whitebox.invokeMethod(jsonExtractor, "addDerivedFields", row); + Assert.assertEquals(actual.entrySet().size(), 2); + Assert.assertTrue(actual.has("formula")); + Assert.assertEquals(actual.get("start_time").toString(), "\"1592809200000\""); + // negative regex case + derivedFields = ImmutableMap.of("formula", + ImmutableMap.of("type", "regexp", "source", "uri", "format", "/syncs/([0-9]+)$")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + pushDowns.addProperty("non-formula", "testValue"); + row.addProperty("uri", "invalid_uri"); + Assert.assertEquals( + Whitebox.invokeMethod(jsonExtractor, "addDerivedFields", row).toString(), + "{\"start_time\":\"1592809200000\",\"formula\":\"no match\",\"uri\":\"invalid_uri\"}"); + actual = Whitebox.invokeMethod(jsonExtractor, "addDerivedFields", row); + Assert.assertEquals(actual.entrySet().size(), 3); + Assert.assertEquals(actual.get("start_time").toString(), "\"1592809200000\""); + Assert.assertEquals(actual.get("formula").toString(), "\"no match\""); + Assert.assertEquals(actual.get("uri").toString(), "\"invalid_uri\""); + // positive regex case + derivedFields = ImmutableMap.of("formula", + ImmutableMap.of("type", "regexp", "source", "uri", "format", "/syncs/([0-9]+)$")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + pushDowns.addProperty("formula", "/syncs/1234"); + row.addProperty("uri", "invalid_uri"); + Assert.assertEquals( + Whitebox.invokeMethod(jsonExtractor, "addDerivedFields", row).toString(), + "{\"start_time\":\"1592809200000\",\"formula\":\"1234\",\"uri\":\"invalid_uri\"}"); + actual = Whitebox.invokeMethod(jsonExtractor, "addDerivedFields", row); + Assert.assertEquals(actual.entrySet().size(), 3); + Assert.assertEquals(actual.get("start_time").toString(), "\"1592809200000\""); + Assert.assertEquals(actual.get("formula").getAsString(), "1234"); + Assert.assertEquals(actual.get("uri").toString(), "\"invalid_uri\""); + pushDowns.remove("formula"); + + derivedFields = ImmutableMap.of("formula", + ImmutableMap.of("type", "boolean", "value", "true")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + pushDowns.addProperty("non-formula", "testValue"); + actual = Whitebox.invokeMethod(jsonExtractor, "addDerivedFields", row); + Assert.assertEquals(actual.entrySet().size(), 3); + Assert.assertTrue(actual.has("formula")); + Assert.assertEquals(actual.get("formula").toString(), "true"); + + // Testing derived fields from variable + JsonObject parameters = new JsonObject(); + parameters.addProperty("dateString", "2019-11-01 
12:00:00"); + parameters.addProperty("someInteger", 123456); + parameters.addProperty("someNumber", 123.456); + parameters.addProperty("someEpoc", 1601038688000L); + jsonExtractor.currentParameters = parameters; + + derivedFields = ImmutableMap.of("dateString", + ImmutableMap.of("type", "string", "source", "{{dateString}}"), + "someInteger", + ImmutableMap.of("type", "integer", "source", "{{someInteger}}"), + "someEpoc", + ImmutableMap.of("type", "epoc", "source", "{{someEpoc}}"), + "someNumber", + ImmutableMap.of("type", "number", "source", "{{someNumber}}")); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); + pushDowns.addProperty("non-formula", "testValue"); + actual = Whitebox.invokeMethod(jsonExtractor, "addDerivedFields", row); + Assert.assertEquals(actual.entrySet().size(), 7); + Assert.assertEquals(actual.get("dateString").toString(), "\"2019-11-01 12:00:00\""); + Assert.assertEquals(actual.get("someInteger").toString(), "123456"); + Assert.assertEquals(actual.get("someNumber").toString(), "123.456"); + Assert.assertEquals(actual.get("start_time").toString(), "\"1592809200000\""); + } + + @Test + public void testGetNextPaginationValues() throws Exception { + Map paginationKeys = ImmutableMap.of( + ParameterTypes.PAGESTART, "page_start", + ParameterTypes.PAGESIZE, "page_size", + ParameterTypes.PAGENO, "page_number"); + + when(jobKeys.getPaginationFields()).thenReturn(paginationKeys); + JsonElement input = gson.fromJson("{\"page_start\":0, \"page_size\":100, \"page_number\":1}", JsonObject.class); + HashMap paginationValues = Whitebox.invokeMethod(jsonExtractor, "getNextPaginationValues", input); + Assert.assertEquals(paginationValues.size(), 3); + Assert.assertEquals(paginationValues.get(ParameterTypes.PAGESIZE), Long.valueOf(100)); + Assert.assertEquals(paginationValues.get(ParameterTypes.PAGESTART), Long.valueOf(100)); + + input = gson.fromJson("{\"page_size\":100, \"page_number\":1}", JsonObject.class); + paginationValues = Whitebox.invokeMethod(jsonExtractor, "getNextPaginationValues", input); + Assert.assertEquals(paginationValues.size(), 1); + Assert.assertEquals(paginationValues.get(ParameterTypes.PAGENO), Long.valueOf(2)); + + input = gson.fromJson("{\"page_start\":0, \"page_number\":1}", JsonObject.class); + paginationValues = Whitebox.invokeMethod(jsonExtractor, "getNextPaginationValues", input); + Assert.assertEquals(paginationValues.size(), 1); + Assert.assertEquals(paginationValues.get(ParameterTypes.PAGENO), Long.valueOf(2)); + + input = gson.fromJson("{\"page_number\":1}", JsonObject.class); + paginationValues = Whitebox.invokeMethod(jsonExtractor, "getNextPaginationValues", input); + Assert.assertEquals(paginationValues.size(), 1); + Assert.assertEquals(paginationValues.get(ParameterTypes.PAGENO), Long.valueOf(2)); + + gson.fromJson("{\"page_start\":null, \"page_size\":100, \"page_number\":1}", JsonObject.class); + paginationValues = Whitebox.invokeMethod(jsonExtractor, "getNextPaginationValues", input); + Assert.assertEquals(paginationValues.size(), 1); + Assert.assertEquals(paginationValues.get(ParameterTypes.PAGENO), Long.valueOf(2)); + + gson.fromJson("{\"page_start\":0, \"page_size\":null, \"page_number\":1}", JsonObject.class); + paginationValues = Whitebox.invokeMethod(jsonExtractor, "getNextPaginationValues", input); + Assert.assertEquals(paginationValues.size(), 1); + Assert.assertEquals(paginationValues.get(ParameterTypes.PAGENO), Long.valueOf(2)); + + input = gson.fromJson("test_primitive_value", JsonPrimitive.class); + paginationValues = 
Whitebox.invokeMethod(jsonExtractor, "getNextPaginationValues", input); + Assert.assertEquals(paginationValues.size(), 0); + } + + @Test + public void testRetrieveSessionKeyValue() throws Exception { + JsonObject sessionKeyField = gson.fromJson("{\"name\": \"hasMore\", \"condition\": {\"regexp\": \"false|False\"}}", JsonObject.class); + when(jobKeys.getSessionKeyField()).thenReturn(sessionKeyField); + JsonElement input = gson.fromJson("[{\"name\": \"hasMore\"}]", JsonArray.class); + Assert.assertEquals( + Whitebox.invokeMethod(jsonExtractor, "retrieveSessionKeyValue", input), StringUtils.EMPTY); + + input = gson.fromJson("{\"notMore\": \"someValue\"}", JsonObject.class); + Assert.assertEquals( + Whitebox.invokeMethod(jsonExtractor, "retrieveSessionKeyValue", input), StringUtils.EMPTY); + } + + /** + * Test getTotalCountValue with non-JsonArray payload + * Expect: RuntimeException + */ + @Test(expectedExceptions = RuntimeException.class) + public void testGetTotalCountValueWithJsonObjectPayload() throws Exception { + when(source.getJobKeys().getTotalCountField()).thenReturn(""); + when(source.getJobKeys().getDataField()).thenReturn("items"); + JsonObject data = gson.fromJson("{\"records\":{\"totalRecords\":2},\"items\":{\"callId\":\"001\"}}", JsonObject.class); + Assert.assertEquals(Whitebox.invokeMethod(jsonExtractor, "getTotalCountValue", data), Long.valueOf(0)); + } + + @Test + public void testLimitedCleanse() throws Exception { + JsonElement input; + input = gson.fromJson("{\"key\": \"value\"}", JsonObject.class); + Assert.assertEquals(Whitebox.invokeMethod(jsonExtractor, "limitedCleanse", input).toString() + , input.toString()); + + input = gson.fromJson("[{\"key\": \"value\"}]", JsonArray.class); + Assert.assertEquals(Whitebox.invokeMethod(jsonExtractor, "limitedCleanse", input).toString() + , input.toString()); + + input = gson.fromJson("test_primitive_value", JsonPrimitive.class); + Assert.assertEquals(Whitebox.invokeMethod(jsonExtractor, "limitedCleanse", input).toString() + , input.toString()); + } + + /** + * Test the timeout scenario: session timeout and condition is not met + * @throws Exception + */ + @Test (expectedExceptions = {RuntimeException.class}) + public void testWaitingBySessionKeyWithTimeout() throws Exception { + when(jobKeys.isSessionStateEnabled()).thenReturn(false); + Assert.assertTrue(Whitebox.invokeMethod(jsonExtractor, "waitingBySessionKeyWithTimeout")); + + when(jobKeys.isSessionStateEnabled()).thenReturn(true); + when(jobKeys.getSessionStateCondition()).thenReturn("success|ready"); + when(jsonExtractorKeys.getSessionKeyValue()).thenReturn("failed"); + long secondsBeforeCurrentTime = DateTime.now().minus(3000).getMillis(); + long timeout = 2000; + when(jsonExtractorKeys.getStartTime()).thenReturn(secondsBeforeCurrentTime); + when(source.getJobKeys().getSessionTimeout()).thenReturn(timeout); + Whitebox.invokeMethod(jsonExtractor, "waitingBySessionKeyWithTimeout"); + } + + /** + * Test the in-session scenario: session condition not met, but not timeout, therefore no exception + * @throws Exception + */ + @Test + public void testWaitingBySessionKeyWithTimeout2() throws Exception { + when(jobKeys.isSessionStateEnabled()).thenReturn(false); + Assert.assertTrue(Whitebox.invokeMethod(jsonExtractor, "waitingBySessionKeyWithTimeout")); + + when(jobKeys.isSessionStateEnabled()).thenReturn(true); + when(jobKeys.getSessionStateCondition()).thenReturn("success|ready"); + when(jobKeys.getSessionStateFailCondition()).thenReturn(StringUtils.EMPTY); + 
when(jsonExtractorKeys.getSessionKeyValue()).thenReturn("failed"); + long secondsBeforeCurrentTime = DateTime.now().minus(3000).getMillis(); + long timeout = 4000; + when(jsonExtractorKeys.getStartTime()).thenReturn(secondsBeforeCurrentTime); + when(jobKeys.getSessionTimeout()).thenReturn(timeout); + Assert.assertFalse(Whitebox.invokeMethod(jsonExtractor, "waitingBySessionKeyWithTimeout")); + } + + /** + * Test the in-session scenario: Exception when session failCondition met + * @throws Exception + */ + @Test (expectedExceptions = {RuntimeException.class}) + public void testWaitingBySessionKeyWithTimeoutWhenFailConditionIsMet() throws Exception { + when(jobKeys.isSessionStateEnabled()).thenReturn(true); + when(jobKeys.getSessionStateCondition()).thenReturn("success|ready"); + when(jobKeys.getSessionStateFailCondition()).thenReturn("failed"); + when(jsonExtractorKeys.getSessionKeyValue()).thenReturn("failed"); + + Whitebox.invokeMethod(jsonExtractor, "waitingBySessionKeyWithTimeout"); + } + + @Test + public void testIsSessionStateMatch() throws Exception { + when(jobKeys.isSessionStateEnabled()).thenReturn(false); + Assert.assertFalse(Whitebox.invokeMethod(jsonExtractor, "isSessionStateMatch")); + + when(jobKeys.isSessionStateEnabled()).thenReturn(true); + when(jobKeys.getSessionStateCondition()).thenReturn("success|ready"); + when(jsonExtractorKeys.getSessionKeyValue()).thenReturn("success"); + Assert.assertTrue(Whitebox.invokeMethod(jsonExtractor, "isSessionStateMatch")); + + when(jsonExtractorKeys.getSessionKeyValue()).thenReturn("failed"); + Assert.assertFalse(Whitebox.invokeMethod(jsonExtractor, "isSessionStateMatch")); + } + + @Test + public void testRetrievePushDowns() throws Exception { + Map<String, Map<String, String>> derivedFields = new HashMap<>(); + JsonElement response = null; + Assert.assertEquals( + Whitebox.invokeMethod(jsonExtractor, "retrievePushDowns", response, derivedFields), + new JsonObject()); + + response = JsonNull.INSTANCE; + Assert.assertEquals( + Whitebox.invokeMethod(jsonExtractor, "retrievePushDowns", response, derivedFields), + new JsonObject()); + + response = new JsonArray(); + Assert.assertEquals( + Whitebox.invokeMethod(jsonExtractor, "retrievePushDowns", response, derivedFields), + new JsonObject()); + } + + @Test + public void testExtractJson() throws Exception { + InputStream input = null; + Assert.assertNull(Whitebox.invokeMethod(jsonExtractor, "extractJson", input)); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/extractor/MultistageExtractorTest.java b/dil/src/test/java/com/linkedin/dil/extractor/MultistageExtractorTest.java new file mode 100644 index 0000000..5d4e46a --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/extractor/MultistageExtractorTest.java @@ -0,0 +1,509 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information.
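
The JsonExtractor session tests above exercise three outcomes: the configured condition ("success|ready") is evaluated against the session key value, a separate fail condition raises an error immediately, and an unmatched session only errors out once the timeout has elapsed. The sketch below captures that decision flow under the assumption that both conditions are applied as whole-string regular expressions; the class, method, and exception choices are illustrative, not the extractor's actual code.

    public class SessionStateSketch {
      // Returns true when the session reached a success state, false when the caller should keep
      // polling, and throws when the fail condition matches or the timeout has elapsed.
      static boolean checkSession(String sessionValue, String successCondition, String failCondition,
          long startMillis, long timeoutMillis) {
        if (!failCondition.isEmpty() && sessionValue.matches(failCondition)) {
          throw new RuntimeException("session fail condition met: " + sessionValue);
        }
        if (sessionValue.matches(successCondition)) {
          return true;
        }
        if (System.currentTimeMillis() - startMillis >= timeoutMillis) {
          throw new RuntimeException("session timed out");
        }
        return false;
      }

      public static void main(String[] args) {
        long start = System.currentTimeMillis();
        System.out.println(checkSession("success", "success|ready", "", start, 4000L)); // true
        System.out.println(checkSession("failed", "success|ready", "", start, 4000L));  // false, keep waiting
      }
    }
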
+ +package com.linkedin.dil.extractor; + +import com.google.common.collect.ImmutableMap; +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.lang.reflect.Method; +import java.nio.charset.Charset; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.gobblin.configuration.ConfigurationKeys; +import org.apache.gobblin.configuration.SourceState; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.filter.JsonSchemaBasedFilter; +import com.linkedin.dil.keys.ExtractorKeys; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.preprocessor.GpgDecryptProcessor; +import com.linkedin.dil.preprocessor.GunzipProcessor; +import com.linkedin.dil.preprocessor.InputStreamProcessor; +import com.linkedin.dil.source.HttpSource; +import com.linkedin.dil.source.MultistageSource; +import com.linkedin.dil.util.ParameterTypes; +import com.linkedin.dil.util.WorkUnitStatus; +import org.apache.gobblin.runtime.JobState; +import org.apache.gobblin.source.workunit.WorkUnit; +import org.joda.time.DateTime; +import org.mockito.Mockito; +import org.powermock.api.mockito.PowerMockito; +import org.powermock.core.classloader.annotations.PrepareForTest; +import org.powermock.modules.testng.PowerMockTestCase; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static org.mockito.Mockito.*; + + +@PrepareForTest({Thread.class, IOUtils.class}) +public class MultistageExtractorTest extends PowerMockTestCase { + private Gson gson; + private ExtractorKeys extractorKeys; + private MultistageExtractor multistageExtractor; + private MultistageSource source; + private WorkUnitState state; + private WorkUnitStatus workUnitStatus; + private JobKeys jobKeys; + private JsonArray jsonSchema; + private JsonArray outputSchema; + + @BeforeMethod + public void setUp() { + gson = new Gson(); + extractorKeys = Mockito.mock(ExtractorKeys.class); + state = mock(WorkUnitState.class); + workUnitStatus = Mockito.mock(WorkUnitStatus.class); + source = mock(MultistageSource.class); + jobKeys = Mockito.mock(JobKeys.class); + jsonSchema = new JsonArray(); + outputSchema = new JsonArray(); + multistageExtractor = new MultistageExtractor(state, source.getJobKeys()); + multistageExtractor.extractorKeys = extractorKeys; + multistageExtractor.jobKeys = jobKeys; + } + + @Test + public void testInitialization() { + WorkUnitState state = mock(WorkUnitState.class); + when(state.getProp("ms.derived.fields", new JsonArray().toString())).thenReturn("[{\"name\": \"activityDate\", \"formula\": {\"type\": \"epoc\", \"source\": \"fromDateTime\", \"format\": \"yyyy-MM-dd'T'HH:mm:ss'Z'\"}}]"); + when(state.getProp("ms.output.schema", new JsonArray().toString())).thenReturn(""); + when(state.getProp("ms.activation.property", new JsonObject().toString())).thenReturn("{\"a\":\"x\"}"); + + SourceState sourceState = mock(SourceState.class); + when(sourceState.contains("source.conn.use.proxy.url")).thenReturn(true); + when(sourceState.getProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), "")).thenReturn(""); + MultistageSource source = new 
HttpSource(); + source.getWorkunits(sourceState); + + MultistageExtractor extractor = new MultistageExtractor(state, source.getJobKeys()); + Assert.assertNotNull(source.getJobKeys().getDerivedFields()); + } + + @Test + public void testJobProperties() { + WorkUnitState state = mock(WorkUnitState.class); + when(state.getProp("ms.derived.fields", new JsonArray().toString())).thenReturn("[{\"name\": \"activityDate\", \"formula\": {\"type\": \"epoc\", \"source\": \"fromDateTime\", \"format\": \"yyyy-MM-dd'T'HH:mm:ss'Z'\"}}]"); + when(state.getProp("ms.output.schema", new JsonArray().toString())).thenReturn(""); + + SourceState sourceState = mock(SourceState.class); + + when(state.getProp("ms.activation.property", new JsonObject().toString())).thenReturn("{\"a\":\"x\"}"); + Assert.assertNotNull(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.getProp(state)); + Assert.assertNotNull(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.getValidNonblankWithDefault(state)); + Assert.assertTrue(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.validate(state)); + Assert.assertTrue(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.validateNonblank(state)); + + when(state.getProp("ms.activation.property", new JsonObject().toString())).thenReturn("{\"a\"}"); + Assert.assertFalse(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.validate(state)); + Assert.assertFalse(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.validateNonblank(state)); + Assert.assertNotNull(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.getValidNonblankWithDefault(state)); + + when(state.getProp("ms.activation.property", new JsonObject().toString())).thenReturn("{}"); + Assert.assertTrue(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.validate(state)); + Assert.assertFalse(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.validateNonblank(state)); + Assert.assertNotNull(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.getValidNonblankWithDefault(state)); + + when(state.getProp("ms.activation.property", new JsonObject().toString())).thenReturn(""); + Assert.assertTrue(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.validate(state)); + Assert.assertFalse(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.validateNonblank(state)); + Assert.assertNotNull(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.getValidNonblankWithDefault(state)); + } + + + @Test + public void testWorkUnitWatermark(){ + SourceState state = mock(SourceState.class); + when(state.getProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), "")).thenReturn(""); + MultistageSource source = new MultistageSource(); + List<WorkUnit> workUnits = source.getWorkunits(state); + WorkUnitState workUnitState = new WorkUnitState(workUnits.get(0)); + JsonExtractor extractor = new JsonExtractor(workUnitState, source.getJobKeys()); + + // low watermark by default is 2019-01-01 (1546329600000 is midnight 2019-01-01 Pacific time) + Assert.assertEquals("1546329600000", extractor.getWorkUnitWaterMarks().get("low").getAsString()); + } + + @Test + public void testGetOnePreprocessor() { + WorkUnitState state = mock(WorkUnitState.class); + when(state.getProp("ms.derived.fields", new JsonArray().toString())).thenReturn( + "[]"); + when(state.getProp("ms.output.schema", new JsonArray().toString())).thenReturn(""); + when(state.getProp("ms.activation.property", new JsonObject().toString())).thenReturn( + "{\"a\":\"x\"}"); + when(state.getProp("ms.extract.preprocessor.parameters", new JsonObject().toString())).thenReturn( + "{\"com.linkedin.dil.preprocessor.GpgProcessor\":" + + "{\"keystore_path\" :\"some path\", \"keystore_password\" : \"some password\"}}"); +
when(state.getProp("ms.extract.preprocessors", new String())).thenReturn( + "com.linkedin.dil.preprocessor.GpgProcessor"); + + SourceState sourceState = mock(SourceState.class); + when(sourceState.contains("source.conn.use.proxy.url")).thenReturn(true); + when(sourceState.getProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), "")).thenReturn(""); + MultistageSource source = new HttpSource(); + source.getWorkunits(sourceState); + + MultistageExtractor extractor = new MultistageExtractor(state, source.getJobKeys()); + + List res = extractor.getPreprocessors(state); + Assert.assertEquals(res.size(), 1); + Assert.assertTrue(res.get(0) instanceof GpgDecryptProcessor); + } + + @Test + public void testGetTwoPreprocessors() { + WorkUnitState state = mock(WorkUnitState.class); + when(state.getProp("ms.derived.fields", new JsonArray().toString())).thenReturn( + "[]"); + when(state.getProp("ms.output.schema", new JsonArray().toString())).thenReturn(""); + when(state.getProp("ms.activation.property", new JsonObject().toString())).thenReturn( + "{\"a\":\"x\"}"); + when(state.getProp("ms.extract.preprocessor.parameters", new JsonObject().toString())).thenReturn( + "{\"com.linkedin.dil.preprocessor.GpgProcessor\":" + + "{\"keystore_path\" :\"some path\", \"keystore_password\" : \"some password\"}}"); + when(state.getProp("ms.extract.preprocessors", new String())).thenReturn( + "com.linkedin.dil.preprocessor.GpgProcessor,"+ + "com.linkedin.dil.preprocessor.GunzipProcessor"); + + SourceState sourceState = mock(SourceState.class); + when(sourceState.contains("source.conn.use.proxy.url")).thenReturn(true); + when(sourceState.getProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), "")).thenReturn(""); + MultistageSource source = new HttpSource(); + source.getWorkunits(sourceState); + + MultistageExtractor extractor = new MultistageExtractor(state, source.getJobKeys()); + + List res = extractor.getPreprocessors(state); + Assert.assertEquals(res.size(), 2); + Assert.assertTrue(res.get(0) instanceof GpgDecryptProcessor); + Assert.assertTrue(res.get(1) instanceof GunzipProcessor); + } + + @Test + public void testGetSchema() { + Assert.assertNull(multistageExtractor.getSchema()); + } + + @Test + public void testGetExpectedRecordCount() { + Assert.assertEquals(multistageExtractor.getExpectedRecordCount(), 0); + } + + @Test + public void testGetHighWatermark() { + Assert.assertEquals(multistageExtractor.getHighWatermark(), 0); + } + + @Test + public void testReadRecord() { + Assert.assertNull(multistageExtractor.readRecord(null)); + } + + @Test + public void testClose() { + when(state.getWorkingState()).thenReturn(WorkUnitState.WorkingState.CANCELLED); + multistageExtractor.close(); + } + + @Test + public void testProcessInputStream() { + MultistageSource source = new MultistageSource(); + List wus = source.getWorkunits(new SourceState()); + WorkUnitState state = new WorkUnitState(wus.get(0), new JobState()); + multistageExtractor = new MultistageExtractor(state, source.getJobKeys()); + multistageExtractor.initialize(new ExtractorKeys()); + Assert.assertFalse(multistageExtractor.processInputStream(100L)); + } + + @Test + public void testSetRowFilter() { + JsonSchemaBasedFilter filter = Mockito.mock(JsonSchemaBasedFilter.class); + JsonArray schema = new JsonArray(); + multistageExtractor.rowFilter = filter; + multistageExtractor.setRowFilter(schema); + + multistageExtractor.rowFilter = null; + when(state.getProp(MultistageProperties.MSTAGE_ENABLE_SCHEMA_BASED_FILTERING.getConfig(), 
StringUtils.EMPTY)).thenReturn("false"); + multistageExtractor.setRowFilter(new JsonArray()); + Assert.assertNull(multistageExtractor.rowFilter); + } + + @Test + public void testGetOrInferSchema() { + MultistageSource source = new MultistageSource(); + List<WorkUnit> wus = source.getWorkunits(new SourceState()); + WorkUnitState state = new WorkUnitState(wus.get(0), new JobState()); + multistageExtractor = new MultistageExtractor(state, source.getJobKeys()); + multistageExtractor.initialize(new ExtractorKeys()); + + JsonObject schema = new JsonObject(); + schema.addProperty("testAttribute", "something"); + + JsonArray schemaArray = new JsonArray(); + Map defaultFieldTypes = new HashMap<>(); + + Assert.assertEquals(multistageExtractor.getOrInferSchema(), schemaArray); + + ExtractorKeys extractorKeys = Mockito.mock(ExtractorKeys.class); + JsonArray inferredSchema = new JsonArray(); + JsonObject schemaObj = new JsonObject(); + schemaObj.addProperty("type", "null"); + multistageExtractor.extractorKeys = extractorKeys; + when(extractorKeys.getInferredSchema()).thenReturn(inferredSchema); + when(extractorKeys.getActivationParameters()).thenReturn(schemaObj); + when(extractorKeys.getPayloads()).thenReturn(new JsonArray()); + when(jobKeys.hasSourceSchema()).thenReturn(false); + Assert.assertEquals(multistageExtractor.getOrInferSchema(), schemaArray); + + when(jobKeys.hasSourceSchema()).thenReturn(true); + Assert.assertEquals(multistageExtractor.getOrInferSchema(), schemaArray); + } + + @Test + public void testHoldExecutionUnitPresetStartTime() throws Exception { + multistageExtractor.extractorKeys = extractorKeys; + //current time + 3 s + Long currentSeconds = DateTime.now().plusSeconds(3).getMillis(); + when(extractorKeys.getDelayStartTime()).thenReturn(currentSeconds); + + PowerMockito.mockStatic(Thread.class); + PowerMockito.doNothing().when(Thread.class); + Thread.sleep(100L); + multistageExtractor.holdExecutionUnitPresetStartTime(); + + when(extractorKeys.getDelayStartTime()).thenReturn(DateTime.now().plusSeconds(3).getMillis()); + PowerMockito.doThrow(new InterruptedException()).when(Thread.class); + Thread.sleep(100L); + multistageExtractor.holdExecutionUnitPresetStartTime(); + } + + @Test + public void testsFailWorkUnit() { + state = new WorkUnitState(); + WorkUnitState stateSpy = spy(state); + multistageExtractor.state = stateSpy; + multistageExtractor.failWorkUnit(StringUtils.EMPTY); + verify(stateSpy).setWorkingState(WorkUnitState.WorkingState.FAILED); + multistageExtractor.failWorkUnit("NON_EMPTY_ERROR_STRING"); + } + + @Test + public void testDeriveEpoc() { + String format = "yyyy-MM-dd"; + String strValue = "2020-06-20"; + Assert.assertNotEquals(multistageExtractor.deriveEpoc(format, strValue), StringUtils.EMPTY); + + strValue = "2018-07-14Txsdfs"; + Assert.assertNotEquals(multistageExtractor.deriveEpoc(format, strValue), StringUtils.EMPTY); + + format = "yyyy-MM-dd'T'HH:mm:ssZ"; + strValue = "2018/07/14T14:31:30+0530"; + Assert.assertEquals(multistageExtractor.deriveEpoc(format, strValue), StringUtils.EMPTY); + } + + @Test + public void testsAddDerivedFieldsToAltSchema() { + Map<String, String> items = ImmutableMap.of("type", "some_type", "source", "token.full_token"); + Map<String, Map<String, String>> derivedFields = ImmutableMap.of("formula", items); + JsonArray outputSchema = gson.fromJson("[{\"token.full_token\": {\"type\":\"string\"}}]", JsonArray.class); + when(source.getJobKeys()).thenReturn(jobKeys); + when(jobKeys.getOutputSchema()).thenReturn(outputSchema); + when(jobKeys.getDerivedFields()).thenReturn(derivedFields); +
Assert.assertEquals(multistageExtractor.addDerivedFieldsToAltSchema().toString(), + "[{\"columnName\":\"formula\",\"dataType\":{\"type\":\"string\"}}]"); + } + + @Test + public void testExtractText() throws Exception { + Assert.assertEquals(multistageExtractor.extractText(null), StringUtils.EMPTY); + + String expected = "test_string"; + InputStream input = new ByteArrayInputStream(expected.getBytes()); + when(state.getProp(MultistageProperties.MSTAGE_SOURCE_DATA_CHARACTER_SET.getConfig(), StringUtils.EMPTY)).thenReturn("UTF-8"); + Assert.assertEquals(multistageExtractor.extractText(input), expected); + + PowerMockito.mockStatic(IOUtils.class); + PowerMockito.doThrow(new IOException()).when(IOUtils.class, "toString", input, Charset.forName("UTF-8")); + multistageExtractor.extractText(input); + Assert.assertEquals(multistageExtractor.extractText(null), StringUtils.EMPTY); + } + + @Test + public void testCheckContentType() { + String expectedContentType = "application/json"; + Map messages = new HashMap<>(); + when(workUnitStatus.getMessages()).thenReturn(messages); + Assert.assertTrue(multistageExtractor.checkContentType(workUnitStatus, expectedContentType)); + + messages.put("contentType", expectedContentType); + when(workUnitStatus.getMessages()).thenReturn(messages); + Assert.assertTrue(multistageExtractor.checkContentType(workUnitStatus, expectedContentType)); + + messages.put("contentType", "non-expected-contentType"); + when(workUnitStatus.getMessages()).thenReturn(messages); + Assert.assertFalse(multistageExtractor.checkContentType(workUnitStatus, expectedContentType)); + + when(workUnitStatus.getMessages()).thenReturn(null); + Assert.assertTrue(multistageExtractor.checkContentType(workUnitStatus, expectedContentType)); + HashSet expectedContentTypeSet = new LinkedHashSet<>( + Arrays.asList("text/csv", "application/gzip", "application/json") + ); + messages.clear(); + when(workUnitStatus.getMessages()).thenReturn(messages); + Assert.assertTrue(multistageExtractor.checkContentType(workUnitStatus, expectedContentTypeSet)); + + messages.put("contentType", expectedContentType); + when(workUnitStatus.getMessages()).thenReturn(messages); + Assert.assertTrue(multistageExtractor.checkContentType(workUnitStatus, expectedContentTypeSet)); + + messages.put("contentType", "non-expected-contentType"); + when(workUnitStatus.getMessages()).thenReturn(messages); + Assert.assertFalse(multistageExtractor.checkContentType(workUnitStatus, expectedContentTypeSet)); + + when(workUnitStatus.getMessages()).thenReturn(null); + Assert.assertTrue(multistageExtractor.checkContentType(workUnitStatus, expectedContentTypeSet)); + } + + /** + * test getting session key value when the value is in the headers + */ + @Test + public void testGetSessionKeyValue() { + String headers = "{\"cursor\": \"123\"}"; + Map messages = new HashMap<>(); + messages.put("headers", headers); + when(workUnitStatus.getMessages()).thenReturn(messages); + + JsonObject sessionKeyField = gson.fromJson("{\"name\": \"cursor\"}", JsonObject.class); + when(source.getJobKeys()).thenReturn(jobKeys); + when(jobKeys.getSessionKeyField()).thenReturn(sessionKeyField); + + Assert.assertEquals(multistageExtractor.getSessionKey(workUnitStatus), "123"); + } + + @Test + public void testMinimumSchema() { + WorkUnitState state = new WorkUnitState(); + state.setProp(ConfigurationKeys.EXTRACT_PRIMARY_KEY_FIELDS_KEY, "id"); + state.setProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY, "date"); + MultistageSource source = new MultistageSource<>(); + 
MultistageExtractor extractor = new MultistageExtractor<>(state, source.getJobKeys()); + JsonArray schema = extractor.createMinimumSchema(); + String expected = "[{\"columnName\":\"id\",\"isNullable\":true,\"dataType\":{\"type\":\"string\"}},{\"columnName\":\"date\",\"isNullable\":true,\"dataType\":{\"type\":\"timestamp\"}}]"; + Assert.assertEquals(schema.toString(), expected); + } + + @Test + public void testMinimumSchemaEmpty() { + WorkUnitState state = new WorkUnitState(); + state.setProp(ConfigurationKeys.EXTRACT_PRIMARY_KEY_FIELDS_KEY, ""); + state.setProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY, "date"); + MultistageSource source = new MultistageSource<>(); + MultistageExtractor extractor = new MultistageExtractor<>(state, source.getJobKeys()); + JsonArray schema = extractor.createMinimumSchema(); + String expected = "[{\"columnName\":\"date\",\"isNullable\":true,\"dataType\":{\"type\":\"timestamp\"}}]"; + Assert.assertEquals(schema.toString(), expected); + } + /** + * ReplaceVariablesInParameters() replace placeholders with their real values. This process + * is called substitution. + * + * When the substituted parameter starts with tmp, the parameter is removed from the final. + * + * @throws Exception + */ + @Test + public void testReplaceVariablesInParameters() throws Exception { + WorkUnitState state = new WorkUnitState(); + MultistageSource source = new MultistageSource<>(); + MultistageExtractor extractor = new MultistageExtractor<>(state, source.getJobKeys()); + + JsonObject parameters = gson.fromJson("{\"param1\":\"value1\"}", JsonObject.class); + JsonObject replaced = extractor.replaceVariablesInParameters(parameters); + Assert.assertEquals(replaced, parameters); + + parameters = gson.fromJson("{\"param1\":\"value1\",\"param2\":\"{{param1}}\"}", JsonObject.class); + JsonObject parameters2Expected = gson.fromJson("{\"param1\":\"value1\",\"param2\":\"value1\"}", JsonObject.class); + replaced = extractor.replaceVariablesInParameters(parameters); + Assert.assertEquals(replaced, parameters2Expected); + + parameters = gson.fromJson("{\"tmpParam1\":\"value1\",\"param2\":\"{{tmpParam1}}\"}", JsonObject.class); + parameters2Expected = gson.fromJson("{\"param2\":\"value1\"}", JsonObject.class); + replaced = extractor.replaceVariablesInParameters(parameters); + Assert.assertEquals(replaced, parameters2Expected); + } + + @Test + public void testAppendActivationParameter() throws Exception { + MultistageExtractor extractor = Mockito.mock(MultistageExtractor.class); + ExtractorKeys extractorKeys = Mockito.mock(ExtractorKeys.class); + extractor.extractorKeys = extractorKeys; + + JsonObject obj = gson.fromJson("{\"survey\": \"id1\"}", JsonObject.class); + when(extractorKeys.getActivationParameters()).thenReturn(obj); + + Method method = MultistageExtractor.class.getDeclaredMethod("appendActivationParameter", JsonObject.class); + method.setAccessible(true); + + Assert.assertEquals(method.invoke(extractor, obj), obj); + } + + @Test + public void testGetUpdatedWorkUnitVariableValues() throws Exception { + MultistageExtractor extractor = Mockito.mock(MultistageExtractor.class); + WorkUnitStatus wuStatus = Mockito.mock(WorkUnitStatus.class); + + when(extractor.getWorkUnitStatus()).thenReturn(wuStatus); + when(wuStatus.getPageSize()).thenReturn(100L); + when(wuStatus.getPageNumber()).thenReturn(5L); + when(wuStatus.getPageStart()).thenReturn(1L); + when(wuStatus.getSessionKey()).thenReturn("test_session_key"); + + JsonObject jsonObject = new JsonObject(); + 
jsonObject.addProperty(ParameterTypes.SESSION.toString(), "{\"name\": \"status\"}"); + jsonObject.addProperty(ParameterTypes.PAGESTART.toString(), 1); + jsonObject.addProperty(ParameterTypes.PAGESIZE.toString(), 100); + jsonObject.addProperty(ParameterTypes.PAGENO.toString(), 5); + + Method method = MultistageExtractor.class.getDeclaredMethod("getUpdatedWorkUnitVariableValues", JsonObject.class); + method.setAccessible(true); + + Assert.assertEquals(method.invoke(extractor, jsonObject).toString(), + "{\"session\":\"test_session_key\",\"pagestart\":1,\"pagesize\":100,\"pageno\":5}"); + + when(wuStatus.getPageSize()).thenReturn(-1L); + Assert.assertEquals(method.invoke(extractor, jsonObject).toString(), + "{\"pagesize\":100,\"session\":\"test_session_key\",\"pagestart\":1,\"pageno\":5}"); + } + + @Test + public void testGetInitialWorkUnitVariableValues() throws Exception { + MultistageExtractor extractor = Mockito.mock(MultistageExtractor.class); + Method method = MultistageExtractor.class.getDeclaredMethod("getInitialWorkUnitVariableValues"); + method.setAccessible(true); + + JobKeys jobKeys = Mockito.mock(JobKeys.class); + extractor.jobKeys = jobKeys; + JsonObject waterMarkObj = gson.fromJson("{\"watermark\":{\"low\":-100,\"high\":1564642800}}", JsonObject.class); + when(extractor.getWorkUnitWaterMarks()).thenReturn(waterMarkObj); + when(jobKeys.getPaginationInitValues()).thenReturn(ImmutableMap.of(ParameterTypes.PAGESIZE, 10L)); + Assert.assertEquals(method.invoke(extractor).toString(), + "{\"watermark\":{\"watermark\":{\"low\":-100,\"high\":1564642800}},\"pagesize\":10}"); + } + + +} \ No newline at end of file diff --git a/dil/src/test/java/com/linkedin/dil/factory/ApacheHttpClientFactoryTest.java b/dil/src/test/java/com/linkedin/dil/factory/ApacheHttpClientFactoryTest.java new file mode 100644 index 0000000..3225aa2 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/factory/ApacheHttpClientFactoryTest.java @@ -0,0 +1,39 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
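
The work-unit variable tests above expect the initial page start, page size, page number, and session key to be overwritten with the latest values from the work unit status, and a non-positive page size to leave the initial pagesize entry in place. Below is a rough Gson-based sketch of that merge; it is a simplification of the private method (the real code also handles watermarks and entry ordering), and the property names are taken from the expected JSON strings in the test.

    import java.util.Map;
    import com.google.gson.JsonElement;
    import com.google.gson.JsonObject;

    public class VariableMergeSketch {
      // Start from the initial values, then overwrite with the latest status values;
      // keep the initial pagesize when the status does not report a positive one.
      static JsonObject merge(JsonObject initial, long pageStart, long pageSize, long pageNo, String sessionKey) {
        JsonObject updated = new JsonObject();
        for (Map.Entry<String, JsonElement> entry : initial.entrySet()) {
          updated.add(entry.getKey(), entry.getValue());
        }
        updated.addProperty("session", sessionKey);
        updated.addProperty("pagestart", pageStart);
        if (pageSize > 0) {
          updated.addProperty("pagesize", pageSize);
        }
        updated.addProperty("pageno", pageNo);
        return updated;
      }

      public static void main(String[] args) {
        JsonObject initial = new JsonObject();
        initial.addProperty("pagesize", 100);
        System.out.println(merge(initial, 1L, 100L, 5L, "test_session_key"));
        System.out.println(merge(initial, 1L, -1L, 5L, "test_session_key")); // pagesize stays at 100
      }
    }
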
+ +package com.linkedin.dil.factory; + +import org.apache.gobblin.configuration.State; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClientBuilder; +import org.mockito.Mock; +import org.powermock.api.mockito.PowerMockito; +import org.powermock.core.classloader.annotations.PrepareForTest; +import org.powermock.modules.testng.PowerMockTestCase; +import org.testng.Assert; +import org.testng.annotations.Test; + +import static org.mockito.Mockito.*; + + +@PrepareForTest({HttpClientBuilder.class}) +public class ApacheHttpClientFactoryTest extends PowerMockTestCase { + @Mock + private HttpClientBuilder httpClientBuilder; + + @Mock + private CloseableHttpClient closeableHttpClient; + + /** + * Test whether an Apache HttpClient is produced as expected + */ + @Test + public void testGet() { + ApacheHttpClientFactory factory = new ApacheHttpClientFactory(); + PowerMockito.mockStatic(HttpClientBuilder.class); + PowerMockito.when(HttpClientBuilder.create()).thenReturn(httpClientBuilder); + when(httpClientBuilder.build()).thenReturn(closeableHttpClient); + Assert.assertEquals(factory.get(new State()), closeableHttpClient); + } +} \ No newline at end of file diff --git a/dil/src/test/java/com/linkedin/dil/filter/AvroSchemaBasedFilterTest.java b/dil/src/test/java/com/linkedin/dil/filter/AvroSchemaBasedFilterTest.java new file mode 100644 index 0000000..c39b215 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/filter/AvroSchemaBasedFilterTest.java @@ -0,0 +1,84 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.filter; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import java.util.ArrayList; +import java.util.List; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.exception.RetriableAuthenticationException; +import com.linkedin.dil.keys.AvroExtractorKeys; +import com.linkedin.dil.util.AvroSchemaUtils; +import com.linkedin.dil.util.JsonIntermediateSchema; +import org.apache.gobblin.source.workunit.Extract; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static org.mockito.Mockito.*; + + +public class AvroSchemaBasedFilterTest { + private GenericRecord inputRecord; + private Gson GSON; + private AvroExtractorKeys _avroExtractorKeys; + private WorkUnitState state; + + @BeforeMethod + public void setUp() throws RetriableAuthenticationException { + Schema schema = Schema.createRecord("test", "test", "test", false); + List fieldList = new ArrayList<>(); + fieldList.add(new Schema.Field("id0", Schema.create(Schema.Type.STRING), "id0", null)); + fieldList.add(new Schema.Field("id1", Schema.create(Schema.Type.STRING), "id0", null)); + schema.setFields(fieldList); + inputRecord = new GenericData.Record(schema); + inputRecord.put("id0", "0"); + inputRecord.put("id1", "1"); + + GSON = new Gson(); + _avroExtractorKeys = new AvroExtractorKeys(); + _avroExtractorKeys.setIsValidOutputSchema(true); + _avroExtractorKeys.setAvroOutputSchema(schema); + + state = mock(WorkUnitState.class); + Extract extract = new Extract(Extract.TableType.SNAPSHOT_ONLY, "com.linkedin.test", "test"); + when(state.getExtract()).thenReturn(extract); + } + + @Test + public void testFilter() { + // The 
case where one column is filtered out + JsonArray rawSchemaArray = GSON.fromJson( + "[{\"columnName\":\"id0\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}]", JsonArray.class); + AvroSchemaBasedFilter avroSchemaBasedFilter = + new AvroSchemaBasedFilter(new JsonIntermediateSchema(rawSchemaArray), _avroExtractorKeys, state); + GenericRecord record = avroSchemaBasedFilter.filter(inputRecord); + // id0 remains + Assert.assertEquals(record.get("id0"), "0"); + // id1 is filtered out + Assert.assertFalse(AvroSchemaUtils.getSchemaFieldNames(record.getSchema()).contains("id1")); + Assert.assertNull(record.get("id1")); + + // The case where output schema contains an extra column not in the original record + rawSchemaArray = GSON.fromJson( + "[{\"columnName\":\"id0\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, " + + "{\"columnName\":\"id1\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}, " + + "{\"columnName\":\"id2\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}]", JsonArray.class); + avroSchemaBasedFilter = + new AvroSchemaBasedFilter(new JsonIntermediateSchema(rawSchemaArray), _avroExtractorKeys, state); + record = avroSchemaBasedFilter.filter(inputRecord); + // id0 remains + Assert.assertEquals(record.get("id0"), "0"); + // id1 remains + Assert.assertEquals(record.get("id1"), "1"); + // id2 is padded with null + Assert.assertTrue(AvroSchemaUtils.getSchemaFieldNames(record.getSchema()).contains("id2")); + Assert.assertNull(record.get("id2")); + } +} \ No newline at end of file diff --git a/dil/src/test/java/com/linkedin/dil/filter/JsonSchemaBasedFilterTest.java b/dil/src/test/java/com/linkedin/dil/filter/JsonSchemaBasedFilterTest.java new file mode 100644 index 0000000..96601a2 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/filter/JsonSchemaBasedFilterTest.java @@ -0,0 +1,50 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
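
The Avro schema-based filter test above asserts that fields absent from the output schema are dropped and that schema columns missing from the input record come back as null. The sketch below shows that projection with plain Avro generic records; it is a stand-in for the filter's observable behaviour, not its actual implementation, and it skips the type handling a real filter would need.

    import java.util.Arrays;
    import org.apache.avro.Schema;
    import org.apache.avro.generic.GenericData;
    import org.apache.avro.generic.GenericRecord;

    public class AvroProjectionSketch {
      // Copy only the fields declared in the target schema; declared fields the input
      // does not have simply stay null in the output record.
      static GenericRecord project(GenericRecord input, Schema target) {
        GenericRecord output = new GenericData.Record(target);
        for (Schema.Field field : target.getFields()) {
          if (input.getSchema().getField(field.name()) != null) {
            output.put(field.name(), input.get(field.name()));
          }
        }
        return output;
      }

      public static void main(String[] args) {
        Schema in = Schema.createRecord("in", "input", "test", false);
        in.setFields(Arrays.asList(
            new Schema.Field("id0", Schema.create(Schema.Type.STRING), "id0", null),
            new Schema.Field("id1", Schema.create(Schema.Type.STRING), "id1", null)));
        GenericRecord record = new GenericData.Record(in);
        record.put("id0", "0");
        record.put("id1", "1");

        Schema out = Schema.createRecord("out", "output", "test", false);
        out.setFields(Arrays.asList(
            new Schema.Field("id0", Schema.create(Schema.Type.STRING), "id0", null),
            new Schema.Field("id2", Schema.create(Schema.Type.STRING), "id2", null)));

        System.out.println(project(record, out)); // id1 is dropped, id2 stays null
      }
    }
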
+ +package com.linkedin.dil.filter; + +import com.google.gson.Gson; +import com.google.gson.JsonElement; +import java.lang.reflect.Method; +import com.linkedin.dil.util.JsonElementTypes; +import com.linkedin.dil.util.JsonIntermediateSchema; +import org.mockito.Mockito; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static org.mockito.Mockito.*; + + +@Test +public class JsonSchemaBasedFilterTest { + private Gson gson = new Gson(); + private JsonSchemaBasedFilter JsonSchemaBasedFilter; + + @BeforeMethod + public void Setup(){ + JsonIntermediateSchema schema = Mockito.mock(JsonIntermediateSchema.class); + JsonSchemaBasedFilter = new JsonSchemaBasedFilter(schema); + } + + /** + * Test filter(JsonIntermediateSchema.JisDataType dataType, JsonElement input) + */ + @Test + public void testFilterWithJsonJsonElementParameter() throws Exception { + Method method = JsonSchemaBasedFilter.class.getDeclaredMethod("filter", JsonIntermediateSchema.JisDataType.class, JsonElement.class); + method.setAccessible(true); + + JsonIntermediateSchema.JisDataType jisDataType = Mockito.mock(JsonIntermediateSchema.JisDataType.class); + when(jisDataType.isPrimitive()).thenReturn(false); + + JsonElement jsonElement = gson.fromJson("[]", JsonElement.class); + when(jisDataType.getType()).thenReturn(JsonElementTypes.ARRAY); + when(jisDataType.getItemType()).thenReturn(jisDataType); + Assert.assertEquals(method.invoke(JsonSchemaBasedFilter, jisDataType, jsonElement), jsonElement); + + when(jisDataType.getType()).thenReturn(JsonElementTypes.OBJECT); + Assert.assertEquals(method.invoke(JsonSchemaBasedFilter, jisDataType, jsonElement), null); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/filter/MultistageSchemaBasedFilterTest.java b/dil/src/test/java/com/linkedin/dil/filter/MultistageSchemaBasedFilterTest.java new file mode 100644 index 0000000..fb42bac --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/filter/MultistageSchemaBasedFilterTest.java @@ -0,0 +1,21 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.filter; + +import com.linkedin.dil.util.JsonIntermediateSchema; +import org.mockito.Mockito; +import org.testng.Assert; +import org.testng.annotations.Test; + + +public class MultistageSchemaBasedFilterTest { + + @Test + public void testFilter() { + JsonIntermediateSchema schema = Mockito.mock(JsonIntermediateSchema.class); + MultistageSchemaBasedFilter filter = new MultistageSchemaBasedFilter(schema); + Assert.assertEquals(filter.filter("input"), null); + } +} \ No newline at end of file diff --git a/dil/src/test/java/com/linkedin/dil/helpers/GobblinMultiStageTestHelpers.java b/dil/src/test/java/com/linkedin/dil/helpers/GobblinMultiStageTestHelpers.java new file mode 100644 index 0000000..0ab1a8f --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/helpers/GobblinMultiStageTestHelpers.java @@ -0,0 +1,124 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
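
The JsonSchemaBasedFilter test above drives the private filter(JisDataType, JsonElement) overload through reflection: an ARRAY data type is filtered element by element via its item type, while a non-primitive type the partially mocked schema cannot describe yields null. Below is a much-simplified sketch of that dispatch shape; the enum and method here are hypothetical and only mirror the two cases the test asserts, not the JsonIntermediateSchema API.

    import com.google.gson.Gson;
    import com.google.gson.JsonArray;
    import com.google.gson.JsonElement;

    public class JsonFilterDispatchSketch {
      enum ElementType { PRIMITIVE, ARRAY, OBJECT }

      // Primitives pass through, arrays recurse into their item type,
      // and anything else is dropped (returned as null) in this reduced sketch.
      static JsonElement filter(ElementType type, ElementType itemType, JsonElement input) {
        switch (type) {
          case PRIMITIVE:
            return input;
          case ARRAY: {
            JsonArray filtered = new JsonArray();
            for (JsonElement item : input.getAsJsonArray()) {
              filtered.add(filter(itemType, null, item));
            }
            return filtered;
          }
          default:
            return null;
        }
      }

      public static void main(String[] args) {
        Gson gson = new Gson();
        JsonElement array = gson.fromJson("[1, 2, 3]", JsonElement.class);
        System.out.println(filter(ElementType.ARRAY, ElementType.PRIMITIVE, array)); // [1,2,3]
        System.out.println(filter(ElementType.OBJECT, null, array));                 // null
      }
    }
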
+ +package com.linkedin.dil.helpers; + +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.TimeZone; +import org.apache.commons.io.IOUtils; +import org.apache.gobblin.configuration.ConfigurationKeys; +import org.apache.gobblin.configuration.SourceState; +import org.apache.gobblin.configuration.WorkUnitState; +import org.apache.gobblin.source.extractor.extract.LongWatermark; +import org.apache.gobblin.source.workunit.WorkUnit; +import org.apache.http.HttpResponse; +import org.apache.http.ProtocolVersion; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.entity.BasicHttpEntity; +import org.apache.http.message.BasicStatusLine; + +import static org.mockito.Mockito.*; + + +/* + * Helper class to create mock test objects + */ +public class GobblinMultiStageTestHelpers { + + public GobblinMultiStageTestHelpers() {} + + public static WorkUnitState prepareMockWorkUnitState() { + + WorkUnit unit = mock(WorkUnit.class); + + /* + * mock watermark with default values + */ + LongWatermark low = mock(LongWatermark.class); + when(low.getValue()).thenReturn(-1L); + LongWatermark high = mock(LongWatermark.class); + when(high.getValue()).thenReturn(1L); + + + when(unit.getLowWatermark(any())).thenReturn(low); + when(unit.getExpectedHighWatermark(any())).thenReturn(high); + + WorkUnitState state = mock(WorkUnitState.class); + /* + * mocking properties in state that can be used in the extractor. + * If a property is used and not mocked, NPE will be thrown. + */ + when(state.getWorkunit()).thenReturn(unit); + when(state.getProp("source.conn.username")).thenReturn("dummy_username"); + when(state.getProp("source.conn.password")).thenReturn("dummy_password"); + when(state.getProp("PLAIN_PASSWORD", "")).thenReturn(""); + return state; + } + + public static SourceState prepareSourceStateWithoutWaterMark() { + SourceState state = mock(SourceState.class); + when(state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY)).thenReturn("dumyExtractNamespace"); + when(state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY)).thenReturn("dumyExtractTableNameKey"); + when(state.getProp(ConfigurationKeys.DATASET_URN_KEY)).thenReturn("dumyDataSetUrnKey"); + when(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, "SNAPSHOT_ONLY")).thenReturn("SNAPSHOT_ONLY"); + when(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, "")).thenReturn("SNAPSHOT_ONLY"); + when(state.getProp(ConfigurationKeys.DATASET_URN_KEY)).thenReturn("dumyDataSetUrnKey"); + return state; + } + + public static SourceState prepareSourceStateWithWaterMark() { + SourceState state = prepareSourceStateWithoutWaterMark(); + when(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, "SNAPSHOT_ONLY")).thenReturn("SNAPSHOT_ONLY"); + when(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, "")).thenReturn("SNAPSHOT_ONLY"); + when(state.getProp(ConfigurationKeys.DATASET_URN_KEY)).thenReturn("dumyDataSetUrnKey"); + return state; + } + + /* + * Preparing mocked response for a valid use case. 
+ */ + public static HttpResponse getBaseMockHttpResponse() { + HttpResponse resp = mock(CloseableHttpResponse.class); + BasicStatusLine line = new BasicStatusLine(new ProtocolVersion("http", 1, 1), + 200, "success"); + when(resp.getStatusLine()).thenReturn(line); + BasicHttpEntity entity = new BasicHttpEntity(); + entity.setContent(IOUtils.toInputStream(MockedResponseStrings.mockedStringResponse)); + when(resp.getEntity()).thenReturn(entity); + return resp; + } + + /* + * Preparing mocked response for a invalid use case. + */ + public static HttpResponse getMockHttpResponseInvalidStatus() { + HttpResponse resp = getBaseMockHttpResponse(); + BasicStatusLine line = new BasicStatusLine(new ProtocolVersion("http", 1, 1), + 400, "success"); + when(resp.getStatusLine()).thenReturn(line); + return resp; + } + + /* + * Preparing mocked response for a valid use case with multiple records. + */ + public static HttpResponse getBaseMockHttpResponseMultipleRecords() { + HttpResponse resp = mock(CloseableHttpResponse.class); + BasicStatusLine line = new BasicStatusLine(new ProtocolVersion("http", 1, 1), + 200, "success"); + when(resp.getStatusLine()).thenReturn(line); + BasicHttpEntity entity = new BasicHttpEntity(); + entity.setContent(IOUtils.toInputStream(MockedResponseStrings.mockedStringResponseMultipleRecords)); + when(resp.getEntity()).thenReturn(entity); + return resp; + } + + public static String getDateFromTimeStamp(long timestamp) { + DateFormat df = new SimpleDateFormat("yyyy-MM-dd"); + df.setTimeZone(TimeZone.getTimeZone("GMT")); + return df.format(new Date(timestamp)); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/helpers/MockedResponseStrings.java b/dil/src/test/java/com/linkedin/dil/helpers/MockedResponseStrings.java new file mode 100644 index 0000000..a710c4d --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/helpers/MockedResponseStrings.java @@ -0,0 +1,14 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.helpers; + +public class MockedResponseStrings { + public MockedResponseStrings() { + } + + public static final String mockedStringResponse = + "[{\"id\":1111,\"site_id\":3333,\"uuid\":null,\"name\":\"dummyGHPExitSurvey\",\"active\":false,\"kind\":\"embed\",\"canonical_name\":null,\"created_at\":\"2017-09-1123:20:05+0000\",\"survey_url\":\"https://dummy/surveys/179513\",\"type\":\"survey\"}]"; + public static final String mockedStringResponseMultipleRecords = "[{\"id\":1111,\"site_id\":3333,\"uuid\":null,\"name\":\"dummyGHPExitSurvey\",\"active\":false,\"kind\":\"embed\",\"canonical_name\":null,\"created_at\":\"2017-09-1123:20:05+0000\",\"survey_url\":\"https://dummy/surveys/179513\",\"type\":\"survey\"},{\"id\":2222,\"site_id\":1234,\"uuid\":null,\"name\":\"dummyGHPExitSurvey\",\"active\":false,\"kind\":\"embed\",\"canonical_name\":null,\"created_at\":\"2017-09-1123:20:05+0000\",\"survey_url\":\"https://dummy/surveys/179513\",\"type\":\"survey\"}]"; +} \ No newline at end of file diff --git a/dil/src/test/java/com/linkedin/dil/jdbcclient/DatabaseTest.java b/dil/src/test/java/com/linkedin/dil/jdbcclient/DatabaseTest.java new file mode 100644 index 0000000..77d0b2f --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/jdbcclient/DatabaseTest.java @@ -0,0 +1,56 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. 
+// See LICENSE in the project root for license information. + +package com.linkedin.dil.jdbcclient; + +import mockit.Mock; +import mockit.MockUp; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.util.Database; +import org.apache.gobblin.password.PasswordManager; +import org.mockito.Matchers; +import org.testng.Assert; +import org.testng.annotations.Test; + +import static org.mockito.Mockito.*; + + +@Test +public class DatabaseTest { + @Test + public void testFromUrl() { + Database mySqlDb = Database.fromUrl("jdbc:mysql://localhost:3036/test"); + Assert.assertEquals(mySqlDb, Database.MYSQL); + } + + @Test + public void testGetName() { + Database mySqlDb = Database.fromUrl("jdbc:mysql://localhost:3036/test"); + Assert.assertEquals(mySqlDb.getName(), "MySql"); + } + + @Test + public void testGetDbType() { + Database mySqlDb = Database.fromUrl("jdbc:mysql://localhost:3036/test"); + Assert.assertEquals(mySqlDb.getDbType(), "mysql"); + } + + @Test + public void getDefaultDriver() { + Database mySqlDb = Database.fromUrl("jdbc:mysql://localhost:3036/test"); + Assert.assertEquals(mySqlDb.getDefaultDriver(), "com.mysql.cj.jdbc.Driver"); + } + + private void mockEncryptionUtils(String expectPassword) { + new MockUp() { + @Mock + PasswordManager getInstance(State state) { + PasswordManager pm = mock(PasswordManager.class); + when(pm.readPassword(Matchers.any())).thenReturn(expectPassword); + return pm; + } + }; + } +} + diff --git a/dil/src/test/java/com/linkedin/dil/keys/FileDumpExtractorKeysTest.java b/dil/src/test/java/com/linkedin/dil/keys/FileDumpExtractorKeysTest.java new file mode 100644 index 0000000..973ba37 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/keys/FileDumpExtractorKeysTest.java @@ -0,0 +1,19 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.keys; + +import org.testng.Assert; +import org.testng.annotations.Test; + + +public class FileDumpExtractorKeysTest { + + @Test + public void testIncrCurrentFileNumber() { + FileDumpExtractorKeys key = new FileDumpExtractorKeys(); + key.incrCurrentFileNumber(); + Assert.assertEquals(key.getCurrentFileNumber(), 1); + } +} \ No newline at end of file diff --git a/dil/src/test/java/com/linkedin/dil/keys/JobKeysTest.java b/dil/src/test/java/com/linkedin/dil/keys/JobKeysTest.java new file mode 100644 index 0000000..f408658 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/keys/JobKeysTest.java @@ -0,0 +1,203 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.keys; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import gobblin.configuration.SourceState; +import java.lang.reflect.Method; +import java.util.HashMap; +import java.util.Map; +import org.apache.commons.lang3.StringUtils; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.util.JsonUtils; +import com.linkedin.dil.util.ParameterTypes; +import com.linkedin.dil.util.SchemaBuilder; +import com.linkedin.dil.util.WorkUnitPartitionTypes; +import org.mockito.Mockito; +import org.powermock.core.classloader.annotations.PrepareForTest; +import org.powermock.modules.testng.PowerMockTestCase; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static org.mockito.Mockito.*; + + +@PrepareForTest({JsonUtils.class}) +public class JobKeysTest extends PowerMockTestCase { + private JobKeys jobKeys; + private Gson gson; + + @BeforeMethod + public void setUp() { + jobKeys = new JobKeys(); + gson = new Gson(); + } + + @Test + public void testIsSessionStateEnabled() { + JsonObject sessions = new JsonObject(); + jobKeys.setSessionKeyField(sessions); + Assert.assertFalse(jobKeys.isSessionStateEnabled()); + + sessions.addProperty("non-condition", false); + Assert.assertFalse(jobKeys.isSessionStateEnabled()); + + JsonObject nestedObj = new JsonObject(); + sessions.add("condition", nestedObj); + Assert.assertFalse(jobKeys.isSessionStateEnabled()); + Assert.assertEquals(jobKeys.getSessionStateCondition(), StringUtils.EMPTY); + + nestedObj.addProperty("regexp", "testValue"); + sessions.add("condition", nestedObj); + Assert.assertTrue(jobKeys.isSessionStateEnabled()); + Assert.assertEquals(jobKeys.getSessionStateCondition(), "testValue"); + + JsonObject failConditionNestedObj = new JsonObject(); + sessions.add("failCondition", failConditionNestedObj); + Assert.assertTrue(jobKeys.isSessionStateEnabled()); + Assert.assertEquals(jobKeys.getSessionStateCondition(), "testValue"); + Assert.assertEquals(jobKeys.getSessionStateFailCondition(), StringUtils.EMPTY); + + failConditionNestedObj.addProperty("regexp", "testFailValue"); + sessions.add("failCondition", failConditionNestedObj); + Assert.assertTrue(jobKeys.isSessionStateEnabled()); + Assert.assertEquals(jobKeys.getSessionStateCondition(), "testValue"); + Assert.assertEquals(jobKeys.getSessionStateFailCondition(), "testFailValue"); + } + + @Test + public void testHasSourceSchema() { + JsonArray sourceSchema = SchemaBuilder.fromJsonData( + JsonUtils.createAndAddProperty("testKey", "testValue")).buildAltSchema().getAsJsonArray(); + Assert.assertFalse(jobKeys.hasSourceSchema()); + jobKeys.setSourceSchema(sourceSchema); + Assert.assertTrue(jobKeys.hasSourceSchema()); + } + + @Test + public void testIsPaginationEnabled() { + Assert.assertFalse(jobKeys.isPaginationEnabled()); + + Map paginationFields = new HashMap<>(); + paginationFields.put(ParameterTypes.PAGESIZE, "testValue"); + jobKeys.setPaginationFields(paginationFields); + Assert.assertTrue(jobKeys.isPaginationEnabled()); + + paginationFields = new HashMap<>(); + jobKeys.setPaginationFields(paginationFields); + Map paginationInitValues = new HashMap<>(); + paginationInitValues.put(ParameterTypes.PAGESIZE, 100L); + jobKeys.setPaginationInitValues(paginationInitValues); + Assert.assertTrue(jobKeys.isPaginationEnabled()); + } + + /** + * Test the validate() method + * + * Scenario 1: 
pagination defined, but no total count field, nor session key field + * + * Scenario 2: wrong output schema structure + */ + @Test + public void testValidation() { + // test pagination parameter validation + SourceState state = new SourceState(); + Map paginationInitValues = new HashMap<>(); + paginationInitValues.put(ParameterTypes.PAGESTART, 0L); + paginationInitValues.put(ParameterTypes.PAGESIZE, 100L); + jobKeys.setPaginationInitValues(paginationInitValues); + Assert.assertTrue(jobKeys.validate(state)); + + // test output schema validation with a wrong type + state.setProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), "{}"); + Assert.assertFalse(jobKeys.validate(state)); + + // test output schema validation with an empty array + state.setProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), "[{}]"); + Assert.assertFalse(jobKeys.validate(state)); + + // test output schema validation with an incorrect structure + String schema = "[{\"columnName\":\"test\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}]"; + state.setProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), schema); + jobKeys.initialize(state); + Assert.assertFalse(jobKeys.validate(state)); + + schema = "[{\"columnName\":\"test\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}]"; + state.setProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), schema); + jobKeys.setOutputSchema(jobKeys.parseOutputSchema(state)); + Assert.assertTrue(jobKeys.validate(state)); + + state.setProp(MultistageProperties.MSTAGE_WORK_UNIT_PARTITION.getConfig(), "lovely"); + jobKeys.setWorkUnitPartitionType(null); + Assert.assertFalse(jobKeys.validate(state)); + + state.setProp(MultistageProperties.MSTAGE_WORK_UNIT_PARTITION.getConfig(), "{\"weekly\": [\"2020-01-01\", \"2020-02-1\"]}"); + jobKeys.setWorkUnitPartitionType(WorkUnitPartitionTypes.COMPOSITE); + Assert.assertFalse(jobKeys.validate(state)); + } + + @Test + public void testGetDefaultFieldTypes() throws Exception { + JobKeys jobkeys = new JobKeys(); + Method method = JobKeys.class.getDeclaredMethod("parseDefaultFieldTypes", State.class); + method.setAccessible(true); + + State state = Mockito.mock(State.class); + when(state.getProp(MultistageProperties.MSTAGE_DATA_DEFAULT_TYPE.getConfig(), new JsonObject().toString())).thenReturn("{\"testField\":100}"); + Assert.assertEquals(method.invoke(jobkeys, state).toString(), "{testField=100}"); + } + + @Test + public void testParseSecondaryInputRetry() throws Exception { + JobKeys jobkeys = new JobKeys(); + JsonArray input = gson.fromJson("[{\"retry\": {\"threadpool\": 5}}]", JsonArray.class); + Method method = JobKeys.class.getDeclaredMethod("parseSecondaryInputRetry", JsonArray.class); + method.setAccessible(true); + Map actual = (Map) method.invoke(jobkeys, input); + Assert.assertEquals((long) actual.get("delayInSec"), 300L); + Assert.assertEquals((long) actual.get("retryCount"), 3); + + input = gson.fromJson("[{\"retry\": {\"delayInSec\": 500,\"retryCount\": 5}}]", JsonArray.class); + actual = (Map) method.invoke(jobkeys, input); + Assert.assertEquals((long) actual.get("delayInSec"), 500L); + Assert.assertEquals((long) actual.get("retryCount"), 5); + } + + @Test + public void testGetPaginationInitialValues() throws Exception { + JobKeys jobkeys = new JobKeys(); + Method method = JobKeys.class.getDeclaredMethod("parsePaginationInitialValues", State.class); + method.setAccessible(true); + + State state = Mockito.mock(State.class); + when(state.getProp(MultistageProperties.MSTAGE_PAGINATION.getConfig(), 
new JsonObject().toString())) + .thenReturn("{\"fields\": [\"offset\", \"limit\"], \"initialvalues\": [0, 5000]}"); + method.invoke(jobkeys, state); + Map paginationInitValues = jobkeys.getPaginationInitValues(); + Assert.assertEquals((long) paginationInitValues.get(ParameterTypes.PAGESTART), 0L); + Assert.assertEquals((long) paginationInitValues.get(ParameterTypes.PAGESIZE), 5000L); + } + + @Test + public void testGetPaginationFields() throws Exception { + JobKeys jobkeys = new JobKeys(); + State state = Mockito.mock(State.class); + when(state.getProp(MultistageProperties.MSTAGE_PAGINATION.getConfig(), new JsonObject().toString())) + .thenReturn("{\"fields\": [\"\", \"\"], \"initialvalues\": [0, 5000]}"); + Method method = JobKeys.class.getDeclaredMethod("parsePaginationFields", State.class); + method.setAccessible(true); + method.invoke(jobkeys, state); + Assert.assertEquals(jobkeys.getPaginationInitValues().size(), 0); + + when(state.getProp(MultistageProperties.MSTAGE_PAGINATION.getConfig(), new JsonObject().toString())) + .thenReturn("{\"initialvalues\": [0, 5000]}"); + method.invoke(jobkeys, state); + Assert.assertEquals(jobkeys.getPaginationInitValues().size(), 0); + } +} \ No newline at end of file diff --git a/dil/src/test/java/com/linkedin/dil/preprocessor/GpgDecryptProcessorTest.java b/dil/src/test/java/com/linkedin/dil/preprocessor/GpgDecryptProcessorTest.java new file mode 100644 index 0000000..30be085 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/preprocessor/GpgDecryptProcessorTest.java @@ -0,0 +1,94 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.preprocessor; + +import com.google.gson.JsonObject; +import java.io.IOException; +import java.io.InputStream; +import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.builder.EqualsBuilder; +import org.apache.gobblin.crypto.EncryptionConfigParser; +import org.apache.gobblin.crypto.GPGCodec; +import org.mockito.Mockito; +import org.powermock.reflect.Whitebox; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static org.mockito.Mockito.*; + + +public class GpgDecryptProcessorTest { + + private final static String KEYSTORE_PASSWORD = "test_keystore_password"; + private final static String KEYSTORE_PATH = "test_keystore_path"; + private final static long KEY_NAME = 2342341L; + private final static String CIPHER = "test_cipher"; + private static final String KEY_ACTION = "decrypt"; + private static final String UNSUPPORTED_KEY_ACTION = "unsupported_decrypt"; + private JsonObject parameters; + private GpgDecryptProcessor _gpgDecryptProcessor; + + @BeforeMethod + public void setUp() { + parameters = new JsonObject(); + } + + /** + * Test GpgProcessor Constructor with null parameters + */ + @Test(expectedExceptions = IllegalArgumentException.class) + public void testGpgProcessorConstructorWithEmptyPassword() throws Exception { + parameters.addProperty(EncryptionConfigParser.ENCRYPTION_KEYSTORE_PASSWORD_KEY, StringUtils.EMPTY); + Whitebox.invokeMethod(new GpgDecryptProcessor(parameters), "getGpgCodec"); + } + + /** + * Test getGpgCodec with 3 happy paths + */ + @Test + public void testGetGpgCodec() { + parameters.addProperty(EncryptionConfigParser.ENCRYPTION_KEYSTORE_PASSWORD_KEY, KEYSTORE_PASSWORD); + _gpgDecryptProcessor = new GpgDecryptProcessor(parameters); + 
Assert.assertTrue(EqualsBuilder.reflectionEquals( + _gpgDecryptProcessor.getCodec(), new GPGCodec(KEYSTORE_PASSWORD, null))); + + parameters.addProperty(EncryptionConfigParser.ENCRYPTION_KEYSTORE_PATH_KEY, KEYSTORE_PATH); + parameters.addProperty(EncryptionConfigParser.ENCRYPTION_CIPHER_KEY, CIPHER); + _gpgDecryptProcessor = new GpgDecryptProcessor(parameters); + Assert.assertNotNull(_gpgDecryptProcessor.getCodec()); + + parameters.addProperty(EncryptionConfigParser.ENCRYPTION_KEY_NAME, KEY_NAME); + _gpgDecryptProcessor = new GpgDecryptProcessor(parameters); + Assert.assertNotNull(_gpgDecryptProcessor.getCodec()); + } + + /** + * Test process with supported action + */ + @Test + public void testProcessWithSupportedAction() throws IOException { + parameters.addProperty(EncryptionConfigParser.ENCRYPTION_KEYSTORE_PASSWORD_KEY, KEYSTORE_PASSWORD); + parameters.addProperty("action", KEY_ACTION); + InputStream inputStream = Mockito.mock(InputStream.class); + GPGCodec gPGCodec = Mockito.mock(GPGCodec.class); + _gpgDecryptProcessor = new GpgDecryptProcessor(parameters); + _gpgDecryptProcessor.setCodec(gPGCodec); + when(gPGCodec.decodeInputStream(inputStream)).thenReturn(inputStream); + Assert.assertEquals(_gpgDecryptProcessor.process(inputStream), inputStream); + } + + /** + * Test process with unsupported action + */ + @Test(enabled = false) + public void testProcessWithUnsupportedAction() throws Exception { + parameters.addProperty(EncryptionConfigParser.ENCRYPTION_KEYSTORE_PASSWORD_KEY, KEYSTORE_PASSWORD); + parameters.addProperty("action", UNSUPPORTED_KEY_ACTION); + InputStream inputStream = Mockito.mock(InputStream.class); + _gpgDecryptProcessor = new GpgDecryptProcessor(parameters); + _gpgDecryptProcessor.process(inputStream); + } +} \ No newline at end of file diff --git a/dil/src/test/java/com/linkedin/dil/preprocessor/GpgEncryptProcessorTest.java b/dil/src/test/java/com/linkedin/dil/preprocessor/GpgEncryptProcessorTest.java new file mode 100644 index 0000000..6405412 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/preprocessor/GpgEncryptProcessorTest.java @@ -0,0 +1,43 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.preprocessor; + +import com.google.gson.JsonObject; +import java.io.IOException; +import java.io.PipedInputStream; +import java.io.PipedOutputStream; +import org.testng.Assert; +import org.testng.annotations.Test; + + +@Test +public class GpgEncryptProcessorTest { + @Test + public void testConvertFileName() { + String fileName = "abc.zip"; + + JsonObject parameters = new JsonObject(); + parameters.addProperty("cipher", "AES_256"); + parameters.addProperty("keystore_path","/tmp/public.key"); + + OutputStreamProcessor processor = new GpgEncryptProcessor(parameters); + Assert.assertEquals(processor.convertFileName(fileName), "abc.zip.gpg"); + } + + @Test + public void testEncryption() throws IOException { + JsonObject parameters = new JsonObject(); + parameters.addProperty("cipher", "AES_256"); + parameters.addProperty("keystore_path",this.getClass().getResource("/key/public.key").toString()); + parameters.addProperty("key_name","48A84F2FA6E38870"); + + PipedInputStream is = new PipedInputStream(); + PipedOutputStream os = new PipedOutputStream(is); + + OutputStreamProcessor processor = new GpgEncryptProcessor(parameters); + Assert.assertNotNull(processor.process(os)); + + } +} diff --git a/dil/src/test/java/com/linkedin/dil/preprocessor/GpgPreprocessorTest.java b/dil/src/test/java/com/linkedin/dil/preprocessor/GpgPreprocessorTest.java new file mode 100644 index 0000000..5a6b966 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/preprocessor/GpgPreprocessorTest.java @@ -0,0 +1,120 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.preprocessor; + +import com.google.gson.JsonObject; +import com.opencsv.CSVParserBuilder; +import com.opencsv.CSVReader; +import com.opencsv.CSVReaderBuilder; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import org.apache.gobblin.crypto.GPGCodec; +import org.mockito.internal.util.reflection.Whitebox; +import org.testng.Assert; +import org.testng.annotations.Test; + +import static org.mockito.Mockito.*; + + +@Test +public class GpgPreprocessorTest { + + @Test(expectedExceptions = IllegalArgumentException.class) + void testGpgInitNoParameters() throws IOException { + GpgDecryptProcessor preprocessor = new GpgDecryptProcessor(null); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + void testGpgInitNoPassword() throws IOException { + JsonObject params = new JsonObject(); + params.addProperty("action", "decrypt"); + GpgDecryptProcessor preprocessor = new GpgDecryptProcessor(params); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + void testGpgInitEmptyPassword() throws IOException { + JsonObject params = new JsonObject(); + params.addProperty("action", "decrypt"); + params.addProperty("keystore_password", ""); + GpgDecryptProcessor preprocessor = new GpgDecryptProcessor(params); + } + + @Test() + void testGpgInitNoPassWordWithKey() { + JsonObject params = new JsonObject(); + // Provide key file, but not password + params.addProperty("action", "decrypt"); + params.addProperty("keystore_path", "some path"); + // No error should be thrown + GpgDecryptProcessor preprocessor = new GpgDecryptProcessor(params); + Assert.assertNotNull(preprocessor.getCodec()); + } + + @Test + void testGpgDecryptWithPassword() throws IOException { + InputStream inputStream = getClass().getResourceAsStream("/gpg/test.csv.gpg"); + 
InputStream decodedInputStream = getClass().getResourceAsStream("/gpg/test.csv"); + + GPGCodec mockedCodec = mock(GPGCodec.class); + when(mockedCodec.decodeInputStream(inputStream)).thenReturn(decodedInputStream); + JsonObject params = new JsonObject(); + params.addProperty("keystore_password", "gpgTest"); + params.addProperty("action", "decrypt"); + GpgDecryptProcessor preprocessor = new GpgDecryptProcessor(params); + Whitebox.setInternalState(preprocessor, "codec", mockedCodec); + CSVReader reader = new CSVReaderBuilder(new InputStreamReader(preprocessor.process(inputStream))) + .withCSVParser(new CSVParserBuilder().withSeparator(',').build()) + .withSkipLines(0) + .build(); + + Assert.assertEquals(2, reader.readAll().size()); + } + + @Test + void testGpgDefaultDecrypt() throws IOException { + InputStream inputStream = getClass().getResourceAsStream("/gpg/test.csv.gpg"); + InputStream decodedInputStream = getClass().getResourceAsStream("/gpg/test.csv"); + + GPGCodec mockedCodec = mock(GPGCodec.class); + when(mockedCodec.decodeInputStream(inputStream)).thenReturn(decodedInputStream); + JsonObject params = new JsonObject(); + params.addProperty("keystore_password", "gpgTest"); + GpgDecryptProcessor preprocessor = new GpgDecryptProcessor(params); + Whitebox.setInternalState(preprocessor, "codec", mockedCodec); + CSVReader reader = new CSVReaderBuilder(new InputStreamReader(preprocessor.process(inputStream))) + .withCSVParser(new CSVParserBuilder().withSeparator(',').build()) + .withSkipLines(0) + .build(); + + Assert.assertEquals(2, reader.readAll().size()); + } + + @Test(enabled = false) + void testGpgEncrypt() throws UnsupportedOperationException, IOException { + InputStream inputStream = getClass().getResourceAsStream("/gpg/test.csv.gpg"); + JsonObject params = new JsonObject(); + params.addProperty("keystore_password", "gpgTest"); + params.addProperty("action", "encrypt"); + params.addProperty("cipher", "AES256"); + params.addProperty("keystore_path", "some path"); + params.addProperty("key_name", 0); + + GpgDecryptProcessor preprocessor = new GpgDecryptProcessor(params); + // Should not support + preprocessor.process(inputStream); + } + + @Test + void testGpgFileNameConversion() { + String fileName = "test.gpg"; + String expectedFilename = "test"; + InputStream inputStream = getClass().getResourceAsStream("/gpg/test.csv.gpg"); + JsonObject params = new JsonObject(); + params.addProperty("keystore_password", "gpgTest"); + GpgDecryptProcessor preprocessor = new GpgDecryptProcessor(params); + Assert.assertEquals(expectedFilename, preprocessor.convertFileName(fileName)); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/preprocessor/GunzipPreprocessorTest.java b/dil/src/test/java/com/linkedin/dil/preprocessor/GunzipPreprocessorTest.java new file mode 100644 index 0000000..a52d0d1 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/preprocessor/GunzipPreprocessorTest.java @@ -0,0 +1,37 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.preprocessor; + +import com.opencsv.CSVParserBuilder; +import com.opencsv.CSVReader; +import com.opencsv.CSVReaderBuilder; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import org.testng.Assert; +import org.testng.annotations.Test; + + +@Test +public class GunzipPreprocessorTest { + @Test + void testGunzip() throws IOException { + InputStream inputStream = getClass().getResourceAsStream("/gzip/cc-index.paths.gz"); + GunzipProcessor preprocessor = new GunzipProcessor(null); + CSVReader reader = new CSVReaderBuilder(new InputStreamReader(preprocessor.process(inputStream))) + .withCSVParser(new CSVParserBuilder().withSeparator(',').build()) + .withSkipLines(0) + .build(); + + Assert.assertEquals(302, reader.readAll().size()); + } + + @Test + void testGunzipFileNameConversion() { + String filename = "test.gz"; + GunzipProcessor preprocessor = new GunzipProcessor(null); + Assert.assertEquals("test", preprocessor.convertFileName(filename)); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/preprocessor/GunzipProcessorTest.java b/dil/src/test/java/com/linkedin/dil/preprocessor/GunzipProcessorTest.java new file mode 100644 index 0000000..60d9da9 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/preprocessor/GunzipProcessorTest.java @@ -0,0 +1,42 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.preprocessor; + +import com.google.gson.JsonObject; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + + +public class GunzipProcessorTest { + + private JsonObject parameters; + private String fileName; + + @BeforeMethod + public void setUp(){ + parameters = new JsonObject(); + } + + /** + * Test convertFileName with .gz file + */ + @Test + public void testConvertFileNameWithgz() { + fileName = "testFileName.gz"; + GunzipProcessor processor = new GunzipProcessor(parameters); + Assert.assertEquals(processor.convertFileName(fileName), "testFileName"); + } + + /** + * Test convertFileName with non .gz file + */ + @Test + public void testConvertFileName() { + fileName = "testFileName.nongz"; + GunzipProcessor processor = new GunzipProcessor(parameters); + Assert.assertEquals(processor.convertFileName(fileName), fileName); + } +} \ No newline at end of file diff --git a/dil/src/test/java/com/linkedin/dil/source/HdfsSourceTest.java b/dil/src/test/java/com/linkedin/dil/source/HdfsSourceTest.java new file mode 100644 index 0000000..08446b1 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/source/HdfsSourceTest.java @@ -0,0 +1,31 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.source; + +import gobblin.configuration.SourceState; +import gobblin.configuration.State; +import gobblin.configuration.WorkUnitState; +import org.testng.Assert; +import org.testng.annotations.Test; + + +public class HdfsSourceTest { + @Test + public void testInitialize() { + HdfsSource hdfsSource = new HdfsSource(); + SourceState state = new SourceState(); + Assert.assertTrue(hdfsSource.getWorkunits(state).size() > 0); + } + + @Test + public void testGetExtractor() { + HdfsSource hdfsSource = new HdfsSource(); + SourceState sourceState = new SourceState(); + WorkUnitState state = new WorkUnitState(hdfsSource.getWorkunits(sourceState).get(0), new State()); + state.setProp("ms.extractor.class", "com.linkedin.dil.extractor.CsvExtractor"); + hdfsSource.getExtractor(state); + Assert.assertNotNull(hdfsSource.getExtractor(state)); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/source/HttpSourceTest.java b/dil/src/test/java/com/linkedin/dil/source/HttpSourceTest.java new file mode 100644 index 0000000..a8c1c67 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/source/HttpSourceTest.java @@ -0,0 +1,399 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.source; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang.StringUtils; +import org.apache.gobblin.configuration.ConfigurationKeys; +import org.apache.gobblin.configuration.SourceState; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.extractor.JsonExtractor; +import com.linkedin.dil.helpers.GobblinMultiStageTestHelpers; +import com.linkedin.dil.keys.HttpKeys; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.util.EncryptionUtils; +import com.linkedin.dil.util.ParameterTypes; +import org.apache.gobblin.runtime.embedded.EmbeddedGobblin; +import org.apache.gobblin.source.workunit.WorkUnit; +import org.mockito.Mockito; +import org.powermock.api.mockito.PowerMockito; +import org.powermock.core.classloader.annotations.PrepareForTest; +import org.powermock.modules.testng.PowerMockTestCase; +import org.powermock.reflect.Whitebox; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static com.linkedin.dil.configuration.MultistageProperties.*; +import static com.linkedin.dil.source.HttpSource.*; +import static org.mockito.Mockito.*; + + +@Slf4j +@PrepareForTest({EncryptionUtils.class}) +public class HttpSourceTest extends PowerMockTestCase { + + private Gson gson; + private WorkUnitState state; + private HttpSource source; + private JobKeys jobKeys; + private SourceState sourceState; + private String token; + private JsonObject pagination; + private JsonObject sessionKeyField; + private String totalCountField; + private JsonArray parameters; + private JsonArray encryptionFields; + private String dataField; + private Long callInterval; + private Long waitTimeoutSeconds; + private Boolean enableCleansing; + private Boolean workUnitPartialPartition; + private JsonArray watermark; + private JsonArray secondaryInput; + private String httpClientFactory; + private JsonObject 
httpRequestHeaders;
+  private String sourceUri;
+  private String httpRequestMethod;
+  private String extractorClass;
+  private JsonObject authentication;
+  private JsonObject httpStatus;
+  private JsonObject httpStatusReasons;
+
+  @BeforeMethod
+  public void setUp() {
+    gson = new Gson();
+    state = Mockito.mock(WorkUnitState.class);
+    jobKeys = Mockito.mock(JobKeys.class);
+    sourceState = Mockito.mock(SourceState.class);
+    source = new HttpSource();
+  }
+
+  @Test(enabled = false)
+  public void testAuthentication() {
+    HttpSource source = new HttpSource();
+
+    SourceState state = mock(SourceState.class);
+    when(state.getProp("ms.watermark", "")).thenReturn("[{\"name\": \"system\",\"type\": \"datetime\", \"range\": {\"from\": \"2017-01-01\", \"to\": \"-\"}}]");
+    when(state.getProp("extract.table.type", "SNAPSHOT_ONLY")).thenReturn("SNAPSHOT_ONLY");
+    when(state.getProp("extract.namespace", "")).thenReturn("test");
+    when(state.getProp("extract.table.name", "")).thenReturn("table1");
+    when(state.getProp("source.conn.username", "")).thenReturn("X7CWBD5V4T6DR77WY23YSHACH55K2OXA");
+    when(state.getProp("source.conn.password", "")).thenReturn("");
+    when(state.getProp("ms.source.uri", "")).thenReturn("https://host/v2/users");
+    when(state.getProp("ms.authentication", new JsonObject().toString())).thenReturn("{\"method\":\"basic\",\"encryption\":\"base64\", \"header\": \"Authorization\"}");
+    when(state.getProp("ms.http.request.headers", new JsonObject().toString())).thenReturn("{\"Content-Type\": \"application/json\"}");
+    when(state.getProp("ms.http.request.method", "")).thenReturn("GET");
+    when(state.getProp("ms.session.key.field", new JsonObject().toString())).thenReturn("{\"name\": \"records.cursor\"}");
+    when(state.getProp("ms.parameters", new JsonArray().toString())).thenReturn("[{\"name\":\"cursor\",\"type\":\"session\"}]");
+    when(state.getProp("ms.data.field", "")).thenReturn("users");
+    when(state.getProp("ms.total.count.field", "")).thenReturn("records.totalRecords");
+    when(state.getProp("ms.work.unit.partition", "")).thenReturn("");
+    when(state.getProp("ms.pagination", new JsonObject().toString())).thenReturn("{}");
+
+    List<WorkUnit> workUnits = source.getWorkunits(state);
+
+    Assert.assertFalse(source.getJobKeys().isPaginationEnabled());
+    Assert.assertNotNull(source.getJobKeys());
+    Assert.assertNotNull(source.getHttpSourceKeys());
+    Assert.assertNotNull(source.getJobKeys().getSourceParameters());
+    Assert.assertTrue(workUnits.size() == 1);
+    Assert.assertEquals(source.getHttpSourceKeys().getHttpRequestHeaders().toString(), "{\"Content-Type\":\"application/json\"}");
+
+    WorkUnitState unitState = new WorkUnitState(workUnits.get(0));
+
+    JsonExtractor extractor = new JsonExtractor(unitState, source.getHttpSourceKeys());
+
+    JsonObject record = extractor.readRecord(new JsonObject());
+
+    // should return 14 columns
+    Assert.assertEquals(14, record.entrySet().size());
+    Assert.assertTrue(extractor.getWorkUnitStatus().getTotalCount() > 0);
+    Assert.assertTrue(extractor.getWorkUnitStatus().getSessionKey().length() > 0);
+  }
+
+  /*
+   * basic test with no watermark created.
+   */
+  @Test(enabled=false)
+  public void getWorkUnitsTestEmpty() {
+    HttpSource source = new HttpSource();
+    List<WorkUnit> workUnits = source.getWorkunits(GobblinMultiStageTestHelpers.prepareSourceStateWithoutWaterMark());
+    Assert.assertTrue(workUnits.size() == 1);
+    Assert.assertEquals(workUnits.get(0).getLowWatermark().getAsJsonObject().get("value").toString(), "-1");
+    Assert.assertEquals(workUnits.get(0).getExpectedHighWatermark().getAsJsonObject().get("value").toString(), "-1");
+  }
+
+  /*
+   * basic test with watermark.
+   */
+  @Test(enabled=false)
+  public void getWorkUnitsTest() {
+    HttpSource source = new HttpSource();
+    List<WorkUnit> workUnits = source.getWorkunits(GobblinMultiStageTestHelpers.prepareSourceStateWithWaterMark());
+    Assert.assertTrue(workUnits.size() == 1);
+
+    // time stamps below correspond to the dates given in the watermark fields in the test data.
+    Assert.assertEquals(GobblinMultiStageTestHelpers
+        .getDateFromTimeStamp(
+            Long.parseLong(workUnits.get(0).getLowWatermark().getAsJsonObject().get("value").toString())),
+        "2019-08-01");
+    Assert.assertEquals(GobblinMultiStageTestHelpers
+        .getDateFromTimeStamp(
+            Long.parseLong(workUnits.get(0).getExpectedHighWatermark().getAsJsonObject().get("value").toString())),
+        "2019-08-02");
+  }
+
+  /*
+   * precondition check failure test.
+   */
+  @Test(enabled=false)
+  public void preConditionCheckFail() {
+    boolean isIllegalState = false;
+    try {
+      HttpSource source = new HttpSource();
+      SourceState state = GobblinMultiStageTestHelpers.prepareSourceStateWithWaterMark();
+      when(state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY)).thenReturn(null);
+      List<WorkUnit> workUnits = source.getWorkunits(state);
+    } catch (Exception e) {
+      isIllegalState = e.getClass().getCanonicalName()
+          .contains("IllegalStateException");
+    }
+    Assert.assertTrue(isIllegalState);
+  }
+
+  @Test
+  public void testGetAuthenticationHeader() {
+    SourceState state = new SourceState();
+    HttpSource httpSource = new HttpSource();
+    state.setProp("source.conn.username", "1");
+    state.setProp("source.conn.password", "2");
+
+    state.setProp("ms.authentication", "{\"method\":\"basic\",\"encryption\":\"base64\", \"header\": \"Authorization\"}");
+    httpSource.initialize(state);
+    Assert.assertEquals(httpSource.getHttpSourceKeys().getHttpRequestHeadersWithAuthentication().toString(), "{Authorization=Basic MToy}");
+
+    state.setProp("ms.authentication", "{\"method\":\"bearer\",\"encryption\":\"base64\", \"header\": \"Authorization\"}");
+    httpSource.initialize(state);
+    Assert.assertEquals(httpSource.getHttpSourceKeys().getHttpRequestHeadersWithAuthentication().toString(), "{Authorization=Bearer MToy}");
+
+    state.setProp("ms.authentication", "{\"method\":\"bearer\",\"encryption\":\"base64\", \"header\": \"Authorization\", \"token\": \"xyz\"}");
+    httpSource.initialize(state);
+    Assert.assertEquals(httpSource.getHttpSourceKeys().getHttpRequestHeadersWithAuthentication().toString(), "{Authorization=Bearer eHl6}");
+  }
+
+  /**
+   * Test getAuthenticationHeader
+   */
+  @Test
+  public void testGetAuthenticationHeader2() {
+    PowerMockito.mockStatic(EncryptionUtils.class);
+
+    HttpKeys httpSourceKeys = mock(HttpKeys.class);
+    source.setHttpSourceKeys(httpSourceKeys);
+
+    JsonObject authObj = gson.fromJson("{\"method\":\"some-method\",\"encryption\":\"base32\",\"header\":\"Authorization\"}", JsonObject.class);
+    when(httpSourceKeys.getAuthentication()).thenReturn(authObj);
+    Assert.assertEquals(source.getAuthenticationHeader(state), new HashMap<>());
+
+    authObj =
gson.fromJson("{\"method\":\"oauth\",\"encryption\":\"base32\",\"header\":\"Authorization\",\"token\":\"sdf23someresfsdwrw24234\"}", JsonObject.class); + when(httpSourceKeys.getAuthentication()).thenReturn(authObj); + String token = "someDecryptedToken"; + when(EncryptionUtils.decryptGobblin(any(), any())).thenReturn(token); + Assert.assertEquals(source.getAuthenticationHeader(state).get("Authorization"), OAUTH_TOKEN_PREFIX + TOKEN_PREFIX_SEPARATOR + token); + + authObj = gson.fromJson("{\"method\":\"custom\",\"encryption\":\"base32\",\"header\":\"Authorization\",\"token\":\"sdf23someresfsdwrw24234\"}", JsonObject.class); + when(httpSourceKeys.getAuthentication()).thenReturn(authObj); + Assert.assertEquals(source.getAuthenticationHeader(state).get("Authorization"), token); + } + + /** + * This test, by simulation, verifies that when the http error message is contained in a normal response, + * we will be able to retrieve that if the content type is different from the expected + * + * The test queries non-existent S3 endpoint, which returns 404 error as expected, but we are simulating 404 + * as success by overriding status codes. + * + * @throws Exception + */ + @Test (enabled=false) + void testHttpErrorInNormalResponse() throws Exception { + EmbeddedGobblin job = new EmbeddedGobblin("test"); + Assert.assertTrue(job.jobFile(getClass().getResource("/pull/http-error.pull").getPath()).run().isSuccessful()); + } + + /** + * Test getExtractor + */ + @Test + public void testGetExtractor() { + initializeHelper(); + PowerMockito.mockStatic(EncryptionUtils.class); + when(EncryptionUtils.decryptGobblin(token, state)).thenReturn(token); + source.getExtractor(state); + jobKeys = source.getJobKeys(); + Map paginationFields = new HashMap<>(); + Map paginationInitValues = new HashMap<>(); + JsonArray fields = pagination.get("fields").getAsJsonArray(); + for (int i = 0; i < fields.size(); i++) { + switch (fields.get(i).getAsString()) { + case "page_start": + paginationFields.put(ParameterTypes.PAGESTART, "page_start"); + break; + case "page_size": + paginationFields.put(ParameterTypes.PAGESIZE, "page_size"); + break; + case "page_number": + paginationFields.put(ParameterTypes.PAGENO, "page_number"); + break; + } + } + + JsonArray initialvalues = pagination.get("initialvalues").getAsJsonArray(); + for (int i = 0; i < initialvalues.size(); i++) { + switch (i) { + case 0: + paginationInitValues.put(ParameterTypes.PAGESTART, initialvalues.get(0).getAsLong()); + break; + case 1: + paginationInitValues.put(ParameterTypes.PAGESIZE, initialvalues.get(1).getAsLong()); + break; + case 2: + paginationInitValues.put(ParameterTypes.PAGENO, initialvalues.get(2).getAsLong()); + break; + } + } + + Assert.assertEquals(jobKeys.getPaginationFields(), paginationFields); + Assert.assertEquals(jobKeys.getPaginationInitValues(), paginationInitValues); + Assert.assertEquals(jobKeys.getSessionKeyField(), sessionKeyField); + Assert.assertEquals(jobKeys.getTotalCountField(), totalCountField); + Assert.assertEquals(jobKeys.getSourceParameters(), parameters); + Assert.assertEquals(jobKeys.getEncryptionField(), encryptionFields); + Assert.assertEquals(jobKeys.getDataField(), dataField); + Assert.assertEquals(jobKeys.getCallInterval(), callInterval.longValue()); + Assert.assertEquals(jobKeys.getSessionTimeout(), waitTimeoutSeconds.longValue() * 1000); + Assert.assertEquals(jobKeys.getWatermarkDefinition(), watermark); + Assert.assertEquals(jobKeys.getSecondaryInputs(), secondaryInput); + 
Assert.assertEquals(source.getHttpSourceKeys().getAuthentication(), authentication);
+    Assert.assertEquals(source.getHttpSourceKeys().getSourceUri(), sourceUri);
+    Assert.assertEquals(source.getHttpSourceKeys().getHttpRequestMethod(), httpRequestMethod);
+
+    Map<String, List<Integer>> httpStatuses = new HashMap<>();
+    for (Map.Entry<String, JsonElement> entry : httpStatus.entrySet()) {
+      String key = entry.getKey();
+      List<Integer> codes = new ArrayList<>();
+      for (int i = 0; i < entry.getValue().getAsJsonArray().size(); i++) {
+        codes.add(entry.getValue().getAsJsonArray().get(i).getAsInt());
+      }
+      httpStatuses.put(key, codes);
+    }
+    Assert.assertEquals(source.getHttpSourceKeys().getHttpStatuses(), httpStatuses);
+
+    Map<String, List<String>> StatusesReasons = new HashMap<>();
+    for (Map.Entry<String, JsonElement> entry : httpStatusReasons.entrySet()) {
+      String key = entry.getKey();
+      List<String> reasons = new ArrayList<>();
+      for (int i = 0; i < entry.getValue().getAsJsonArray().size(); i++) {
+        reasons.add(entry.getValue().getAsJsonArray().get(i).getAsString());
+      }
+      StatusesReasons.put(key, reasons);
+    }
+    Assert.assertEquals(source.getHttpSourceKeys().getHttpStatusReasons(), StatusesReasons);
+  }
+
+  /**
+   * Test getHttpStatuses
+   */
+  @Test
+  public void testGetHttpStatuses() throws Exception {
+    String statuses = "{\"success\":{\"someKey\":\"someValue\"},\"warning\":null}";
+    when(state.getProp(MSTAGE_HTTP_STATUSES.getConfig(), new JsonObject().toString())).thenReturn(statuses);
+    Assert.assertEquals(Whitebox.invokeMethod(source, "getHttpStatuses", state), new HashMap<>());
+  }
+
+  /**
+   * Test getHttpStatusReasons
+   */
+  @Test
+  public void testGetHttpStatusReasons() throws Exception {
+    String reasons = "{\"success\":{\"someReason\":\"someValue\"},\"warning\":null}";
+    when(state.getProp(MSTAGE_HTTP_STATUS_REASONS.getConfig(), new JsonObject().toString())).thenReturn(reasons);
+    Assert.assertEquals(Whitebox.invokeMethod(source, "getHttpStatusReasons", state), new HashMap<>());
+  }
+
+  private void initializeHelper() {
+    JsonObject allKeys = gson.fromJson(new InputStreamReader(this.getClass().getResourceAsStream("/json/sample-data-for-source.json")), JsonObject.class);
+    pagination = allKeys.get(MSTAGE_PAGINATION.getConfig()).getAsJsonObject();
+    when(state.getProp(MSTAGE_PAGINATION.getConfig(), new JsonObject().toString())).thenReturn(pagination.toString());
+
+    sessionKeyField = allKeys.get(MSTAGE_SESSION_KEY_FIELD.getConfig()).getAsJsonObject();
+    when(state.getProp(MSTAGE_SESSION_KEY_FIELD.getConfig(), new JsonObject().toString())).thenReturn(sessionKeyField.toString());
+
+    totalCountField = allKeys.get(MSTAGE_TOTAL_COUNT_FIELD.getConfig()).getAsString();
+    when(state.getProp(MSTAGE_TOTAL_COUNT_FIELD.getConfig(), StringUtils.EMPTY)).thenReturn(totalCountField);
+
+    parameters = allKeys.get(MSTAGE_PARAMETERS.getConfig()).getAsJsonArray();
+    when(state.getProp(MSTAGE_PARAMETERS.getConfig(), new JsonArray().toString())).thenReturn(parameters.toString());
+
+    encryptionFields = allKeys.get(MSTAGE_ENCRYPTION_FIELDS.getConfig()).getAsJsonArray();
+    when(state.getProp(MSTAGE_ENCRYPTION_FIELDS.getConfig(), new JsonArray().toString())).thenReturn(encryptionFields.toString());
+
+    dataField = allKeys.get(MSTAGE_DATA_FIELD.getConfig()).getAsString();
+    when(state.getProp(MSTAGE_DATA_FIELD.getConfig(), StringUtils.EMPTY)).thenReturn(dataField);
+
+    callInterval = allKeys.get(MSTAGE_CALL_INTERVAL.getConfig()).getAsLong();
+    when(state.getPropAsLong(MSTAGE_CALL_INTERVAL.getConfig(), 0L)).thenReturn(callInterval);
+
+    waitTimeoutSeconds =
allKeys.get(MSTAGE_WAIT_TIMEOUT_SECONDS.getConfig()).getAsLong(); + when(state.getPropAsLong(MSTAGE_WAIT_TIMEOUT_SECONDS.getConfig(), 0L)).thenReturn(waitTimeoutSeconds); + + enableCleansing = allKeys.get(MSTAGE_ENABLE_CLEANSING.getConfig()).getAsBoolean(); + when(state.getPropAsBoolean(MSTAGE_ENABLE_CLEANSING.getConfig())).thenReturn(enableCleansing); + + workUnitPartialPartition = allKeys.get(MSTAGE_WORK_UNIT_PARTIAL_PARTITION.getConfig()).getAsBoolean(); + when(state.getPropAsBoolean(MSTAGE_WORK_UNIT_PARTIAL_PARTITION.getConfig())).thenReturn(workUnitPartialPartition); + + watermark = allKeys.get(MSTAGE_WATERMARK.getConfig()).getAsJsonArray(); + when(state.getProp(MSTAGE_WATERMARK.getConfig(), new JsonArray().toString())).thenReturn(watermark.toString()); + + secondaryInput = allKeys.get(MSTAGE_SECONDARY_INPUT.getConfig()).getAsJsonArray(); + when(state.getProp(MSTAGE_SECONDARY_INPUT.getConfig(), new JsonArray().toString())).thenReturn(secondaryInput.toString()); + + httpClientFactory = allKeys.get(MSTAGE_HTTP_CLIENT_FACTORY.getConfig()).getAsString(); + when(state.getProp(MSTAGE_HTTP_CLIENT_FACTORY.getConfig(), StringUtils.EMPTY)).thenReturn(httpClientFactory); + + httpRequestHeaders = allKeys.get(MSTAGE_HTTP_REQUEST_HEADERS.getConfig()).getAsJsonObject(); + when(state.getProp(MSTAGE_HTTP_REQUEST_HEADERS.getConfig(), new JsonObject().toString())).thenReturn(httpRequestHeaders.toString()); + + sourceUri = allKeys.get(MSTAGE_SOURCE_URI.getConfig()).getAsString(); + when(state.getProp(MSTAGE_SOURCE_URI.getConfig(), StringUtils.EMPTY)).thenReturn(sourceUri); + + httpRequestMethod = allKeys.get(MSTAGE_HTTP_REQUEST_METHOD.getConfig()).getAsString(); + when(state.getProp(MSTAGE_HTTP_REQUEST_METHOD.getConfig(), StringUtils.EMPTY)).thenReturn(httpRequestMethod); + + extractorClass = allKeys.get(MSTAGE_EXTRACTOR_CLASS.getConfig()).getAsString(); + when(state.getProp(MSTAGE_EXTRACTOR_CLASS.getConfig(), StringUtils.EMPTY)).thenReturn(extractorClass); + + authentication = allKeys.get(MSTAGE_AUTHENTICATION.getConfig()).getAsJsonObject(); + token = authentication.get("token").getAsString(); + when(state.getProp(MSTAGE_AUTHENTICATION.getConfig(), new JsonObject().toString())).thenReturn(authentication.toString()); + + httpStatus = allKeys.get(MSTAGE_HTTP_STATUSES.getConfig()).getAsJsonObject(); + when(state.getProp(MSTAGE_HTTP_STATUSES.getConfig(), new JsonObject().toString())).thenReturn(httpStatus.toString()); + + httpStatusReasons = allKeys.get(MSTAGE_HTTP_STATUS_REASONS.getConfig()).getAsJsonObject(); + when(state.getProp(MSTAGE_HTTP_STATUS_REASONS.getConfig(), new JsonObject().toString())).thenReturn(httpStatusReasons.toString()); + } +} \ No newline at end of file diff --git a/dil/src/test/java/com/linkedin/dil/source/JdbcSourceTest.java b/dil/src/test/java/com/linkedin/dil/source/JdbcSourceTest.java new file mode 100644 index 0000000..8e7e1d3 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/source/JdbcSourceTest.java @@ -0,0 +1,19 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+
+package com.linkedin.dil.source;
+
+import gobblin.configuration.SourceState;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+
+public class JdbcSourceTest {
+  @Test
+  public void testInitialize() {
+    JdbcSource jdbcSource = new JdbcSource();
+    SourceState state = new SourceState();
+    Assert.assertNotNull(jdbcSource.getWorkunits(state));
+  }
+}
diff --git a/dil/src/test/java/com/linkedin/dil/source/MultistageSource2Test.java b/dil/src/test/java/com/linkedin/dil/source/MultistageSource2Test.java
new file mode 100644
index 0000000..bd66102
--- /dev/null
+++ b/dil/src/test/java/com/linkedin/dil/source/MultistageSource2Test.java
@@ -0,0 +1,99 @@
+// Copyright 2021 LinkedIn Corporation. All rights reserved.
+// Licensed under the BSD-2 Clause license.
+// See LICENSE in the project root for license information.
+
+package com.linkedin.dil.source;
+
+import com.google.gson.Gson;
+import com.google.gson.JsonArray;
+import com.google.gson.JsonObject;
+import java.io.InputStreamReader;
+import java.util.List;
+import org.apache.commons.lang.StringUtils;
+import org.apache.gobblin.configuration.SourceState;
+import com.linkedin.dil.util.VariableUtils;
+import org.apache.gobblin.source.workunit.WorkUnit;
+import org.powermock.core.classloader.annotations.PrepareForTest;
+import org.powermock.modules.testng.PowerMockTestCase;
+import org.testng.Assert;
+import org.testng.annotations.BeforeClass;
+import org.testng.annotations.Test;
+
+import static com.linkedin.dil.configuration.MultistageProperties.*;
+import static org.mockito.Mockito.*;
+
+
+@PrepareForTest({VariableUtils.class, MultistageSource.class})
+public class MultistageSource2Test extends PowerMockTestCase {
+
+  private MultistageSource source;
+  private SourceState state;
+  private Gson gson;
+
+  @BeforeClass
+  public void setUp() {
+    source = new MultistageSource();
+    state = mock(SourceState.class);
+    gson = new Gson();
+  }
+
+  @Test
+  public void testGetWorkunits() {
+    initializeHelper(state);
+
+    List<WorkUnit> wuList = source.getWorkunits(state);
+    Assert.assertEquals(wuList.size(), 1);
+    WorkUnit workUnit = wuList.get(0);
+    Assert.assertEquals(workUnit.getSpecProperties().getProperty(MSTAGE_WATERMARK_GROUPS.getConfig()), "[\"watermark.system\",\"watermark.unit\"]");
+  }
+
+  @Test
+  public void testInitialize() {
+    initializeHelper(state);
+
+    when(state.getProp(MSTAGE_ENABLE_CLEANSING.getConfig(), StringUtils.EMPTY)).thenReturn("true");
+    when(state.getProp(MSTAGE_SECONDARY_INPUT.getConfig(), new JsonArray().toString()))
+        .thenReturn("[{\"fields\":[\"uuid\"],\"category\":\"authentication\",\"authentication\":{}}]");
+    source.initialize(state);
+
+    when(state.getProp(MSTAGE_ENABLE_CLEANSING.getConfig(), StringUtils.EMPTY)).thenReturn("");
+    when(state.getProp(MSTAGE_SECONDARY_INPUT.getConfig(), new JsonArray().toString()))
+        .thenReturn("[{\"path\":\"${job.dir}/${extract.namespace}/getResults\",\"fields\":[\"access_token\"],\"category\":\"authentication\",\"retry\":{}}]");
+    source.initialize(state);
+
+    when(state.getProp(MSTAGE_ENABLE_CLEANSING.getConfig(), StringUtils.EMPTY)).thenReturn("false");
+    source.initialize(state);
+  }
+
+  private void initializeHelper(SourceState state) {
+    JsonObject allKeys = gson.fromJson(new InputStreamReader(this.getClass().getResourceAsStream("/json/sample-data-for-source.json")), JsonObject.class);
+
+    when(state.getProp(MSTAGE_PAGINATION.getConfig(), new JsonObject().toString())).thenReturn(allKeys.get(MSTAGE_PAGINATION.getConfig()).getAsJsonObject().toString());
+
when(state.getProp(MSTAGE_SESSION_KEY_FIELD.getConfig(), new JsonObject().toString())).thenReturn(allKeys.get(MSTAGE_SESSION_KEY_FIELD.getConfig()).getAsJsonObject().toString()); + when(state.getProp(MSTAGE_TOTAL_COUNT_FIELD.getConfig(), StringUtils.EMPTY)).thenReturn(allKeys.get(MSTAGE_TOTAL_COUNT_FIELD.getConfig()).getAsString()); + when(state.getProp(MSTAGE_PARAMETERS.getConfig(), new JsonArray().toString())).thenReturn(allKeys.get(MSTAGE_PARAMETERS.getConfig()).getAsJsonArray().toString()); + when(state.getProp(MSTAGE_ENCRYPTION_FIELDS.getConfig(), new JsonArray().toString())).thenReturn(allKeys.get(MSTAGE_ENCRYPTION_FIELDS.getConfig()).getAsJsonArray().toString()); + when(state.getProp(MSTAGE_DATA_FIELD.getConfig(), StringUtils.EMPTY)).thenReturn(allKeys.get(MSTAGE_DATA_FIELD.getConfig()).getAsString()); + when(state.getPropAsLong(MSTAGE_CALL_INTERVAL.getConfig(), 0L)).thenReturn(allKeys.get(MSTAGE_CALL_INTERVAL.getConfig()).getAsLong()); + when(state.getPropAsLong(MSTAGE_WAIT_TIMEOUT_SECONDS.getConfig(), 0L)).thenReturn(allKeys.get(MSTAGE_WAIT_TIMEOUT_SECONDS.getConfig()).getAsLong()); + when(state.getPropAsBoolean(MSTAGE_ENABLE_CLEANSING.getConfig())).thenReturn(allKeys.get(MSTAGE_ENABLE_CLEANSING.getConfig()).getAsBoolean()); + when(state.getPropAsBoolean(MSTAGE_WORK_UNIT_PARTIAL_PARTITION.getConfig())).thenReturn(allKeys.get(MSTAGE_WORK_UNIT_PARTIAL_PARTITION.getConfig()).getAsBoolean()); + when(state.getProp(MSTAGE_WATERMARK.getConfig(), new JsonArray().toString())).thenReturn(allKeys.get(MSTAGE_WATERMARK.getConfig()).getAsJsonArray().toString()); + when(state.getProp(MSTAGE_SECONDARY_INPUT.getConfig(), new JsonArray().toString())).thenReturn(allKeys.get(MSTAGE_SECONDARY_INPUT.getConfig()).getAsJsonArray().toString()); + when(state.getProp(MSTAGE_HTTP_CLIENT_FACTORY.getConfig(), StringUtils.EMPTY)).thenReturn(allKeys.get(MSTAGE_HTTP_CLIENT_FACTORY.getConfig()).getAsString()); + when(state.getProp(MSTAGE_HTTP_REQUEST_HEADERS.getConfig(), new JsonObject().toString())).thenReturn(allKeys.get(MSTAGE_HTTP_REQUEST_HEADERS.getConfig()).getAsJsonObject().toString()); + when(state.getProp(MSTAGE_SOURCE_URI.getConfig(), StringUtils.EMPTY)).thenReturn(allKeys.get(MSTAGE_SOURCE_URI.getConfig()).getAsString()); + when(state.getProp(MSTAGE_HTTP_REQUEST_METHOD.getConfig(), StringUtils.EMPTY)).thenReturn(allKeys.get(MSTAGE_HTTP_REQUEST_METHOD.getConfig()).getAsString()); + when(state.getProp(MSTAGE_EXTRACTOR_CLASS.getConfig(), StringUtils.EMPTY)).thenReturn(allKeys.get(MSTAGE_EXTRACTOR_CLASS.getConfig()).getAsString()); + when(state.getProp(MSTAGE_AUTHENTICATION.getConfig(), new JsonObject().toString())).thenReturn(allKeys.get(MSTAGE_AUTHENTICATION.getConfig()).getAsJsonObject().toString()); + when(state.getProp(MSTAGE_HTTP_STATUSES.getConfig(), new JsonObject().toString())).thenReturn(allKeys.get(MSTAGE_HTTP_STATUSES.getConfig()).getAsJsonObject().toString()); + when(state.getProp(MSTAGE_HTTP_STATUS_REASONS.getConfig(), new JsonObject().toString())).thenReturn(allKeys.get(MSTAGE_HTTP_STATUS_REASONS.getConfig()).getAsJsonObject().toString()); + + when(state.getProp(MSTAGE_SOURCE_S3_PARAMETERS.getConfig(), new JsonObject().toString())).thenReturn("{\"region\" : \"us-east-1\", \"connection_timeout\" : 10}"); + when(state.getProp(MSTAGE_SOURCE_FILES_PATTERN.getConfig(), StringUtils.EMPTY)).thenReturn(StringUtils.EMPTY); + when(state.getPropAsInt(MSTAGE_S3_LIST_MAX_KEYS.getConfig())).thenReturn(100); + when(state.getProp(SOURCE_CONN_USERNAME.getConfig(), 
StringUtils.EMPTY)).thenReturn(StringUtils.EMPTY); + when(state.getProp(SOURCE_CONN_PASSWORD.getConfig(), StringUtils.EMPTY)).thenReturn(StringUtils.EMPTY); + when(state.getProp(MSTAGE_EXTRACTOR_TARGET_FILE_NAME.getConfig(), StringUtils.EMPTY)).thenReturn(StringUtils.EMPTY); + when(state.getProp(MSTAGE_OUTPUT_SCHEMA.getConfig(), StringUtils.EMPTY)).thenReturn(""); + } +} \ No newline at end of file diff --git a/dil/src/test/java/com/linkedin/dil/source/MultistageSourceTest.java b/dil/src/test/java/com/linkedin/dil/source/MultistageSourceTest.java new file mode 100644 index 0000000..5e2a623 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/source/MultistageSourceTest.java @@ -0,0 +1,688 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.source; + +import com.google.common.collect.ImmutableList; +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import java.lang.reflect.Method; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.gobblin.configuration.SourceState; +import org.apache.gobblin.configuration.State; +import org.apache.gobblin.configuration.WorkUnitState; +import com.linkedin.dil.configuration.MultistageProperties; +import com.linkedin.dil.factory.reader.SchemaReader; +import com.linkedin.dil.keys.JobKeys; +import com.linkedin.dil.util.EndecoUtils; +import com.linkedin.dil.util.WatermarkDefinition; +import com.linkedin.dil.util.WorkUnitPartitionTypes; +import org.apache.gobblin.source.extractor.WatermarkInterval; +import org.apache.gobblin.source.extractor.extract.LongWatermark; +import org.apache.gobblin.source.workunit.Extract; +import org.apache.gobblin.source.workunit.WorkUnit; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; +import org.mockito.Matchers; +import org.mockito.Mockito; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static org.mockito.Mockito.*; + + +public class MultistageSourceTest { + private final static DateTimeFormatter JODA_DATE_TIME_FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ssZ"); + private Gson gson; + private MultistageSource source; + + @BeforeMethod + public void setUp() { + gson = new Gson(); + source = new MultistageSource(); + } + + @Test + public void testWorkUnitPartitionDef(){ + SourceState state = mock(SourceState.class); + when(state.getProp("ms.work.unit.partition", "")).thenReturn("daily"); + when(state.getProp("ms.pagination", new JsonObject().toString())).thenReturn("{}"); + when(state.getProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), "")).thenReturn(""); + + MultistageSource source = new MultistageSource(); + source.getWorkunits(state); + + String expected = "daily"; + Assert.assertEquals(expected, MultistageProperties.MSTAGE_WORK_UNIT_PARTITION.getProp(state)); + } + + @Test + public void testWorkUnitPacingDef(){ + SourceState state = mock(SourceState.class); + when(state.getPropAsInt("ms.work.unit.pacing.seconds", 0)).thenReturn(10); + when(state.getProp("ms.pagination", new JsonObject().toString())).thenReturn("{}"); + 
when(state.getProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), "")).thenReturn(""); + MultistageSource source = new MultistageSource(); + source.getWorkunits(state); + Assert.assertEquals(((Integer) MultistageProperties.MSTAGE_WORK_UNIT_PACING_SECONDS.getProp(state)).intValue(), 10); + } + + @Test + public void testWorkUnitPacingConversion(){ + SourceState state = mock(SourceState.class); + when(state.getPropAsInt("ms.work.unit.pacing.seconds", 0)).thenReturn(10); + when(state.getProp("ms.pagination", new JsonObject().toString())).thenReturn("{\"fields\": [\"start\"]}"); + when(state.getProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), "")).thenReturn(""); + MultistageSource source = new MultistageSource(); + source.getWorkunits(state); + Assert.assertEquals(MultistageProperties.MSTAGE_WORK_UNIT_PACING_SECONDS.getMillis(state).longValue(), 10000L); + } + + @Test + public void testGetWorkUnitsTooManyPartitions() { + SourceState state = new SourceState(); + state.setProp("ms.watermark", + "[{\"name\": \"system\",\"type\": \"datetime\", \"range\": {\"from\": \"2000-01-01\", \"to\": \"-\"}}]"); + state.setProp("extract.table.type", "SNAPSHOT_ONLY"); + state.setProp("extract.namespace", "test"); + state.setProp("extract.table.name", "table1"); + state.setProp("ms.work.unit.partition", "hourly"); + state.setProp("ms.pagination", "{}"); + MultistageSource source = new MultistageSource(); + List wuList = source.getWorkunits(state); + // Expected max partition allowed maps to MultistageSource.MAX_DATETIME_PARTITION + Assert.assertEquals(wuList.size(), 3 * 30 * 24); + } + + @Test + public void testGetWorkUnitsDefault(){ + SourceState state = new SourceState(); + state.setProp("ms.watermark", "[{\"name\": \"system\",\"type\": \"datetime\", \"range\": {\"from\": \"2017-01-01\", \"to\": \"-\"}}]"); + state.setProp("extract.table.type", "SNAPSHOT_ONLY"); + state.setProp("extract.namespace", "test"); + state.setProp("extract.table.name", "table1"); + state.setProp("ms.work.unit.partition", ""); + state.setProp("ms.pagination", "{}"); + MultistageSource source = new MultistageSource(); + source.getWorkunits(state); + + //Assert.assertEquals(source.getMyProperty(MultistageProperties.WORK_UNIT_PARTITION), "weekly"); + Extract extract = source.createExtractObject(true); + WorkUnit workUnit = WorkUnit.create(extract, + new WatermarkInterval(new LongWatermark(1483257600000L), new LongWatermark(1572660000000L))); + workUnit.setProp("ms.watermark.groups", "[\"watermark.datetime\",\"watermark.unit\"]"); + workUnit.setProp("watermark.datetime", "(1483257600000,1572660000000)"); + workUnit.setProp("watermark.unit", "NONE"); + WorkUnit workUnit1 = (WorkUnit) source.getWorkunits(state).get(0); + Assert.assertEquals(workUnit1.getLowWatermark().toString(), workUnit.getLowWatermark().toString()); + Assert.assertEquals(workUnit1.getProp(MultistageProperties.DATASET_URN_KEY.toString()), "[watermark.system.1483257600000, watermark.unit.{}]"); + Assert.assertEquals(workUnit1.getProp(MultistageProperties.MSTAGE_WATERMARK_GROUPS.toString()), "[\"watermark.system\",\"watermark.unit\"]"); + } + + @Test + public void testParallismMaxSetting() { + SourceState state = mock(SourceState.class); + when(state.getPropAsInt("ms.work.unit.parallelism.max",0)).thenReturn(0); + when(state.getProp("ms.pagination", new JsonObject().toString())).thenReturn(""); + + Assert.assertFalse(MultistageProperties.MSTAGE_WORK_UNIT_PARALLELISM_MAX.validateNonblank(state)); + + 
when(state.getPropAsInt("ms.work.unit.parallelism.max",0)).thenReturn(10); + Assert.assertTrue(MultistageProperties.MSTAGE_WORK_UNIT_PARALLELISM_MAX.validateNonblank(state)); + } + + @Test + public void testDerivedFields() { + SourceState sourceState = mock(SourceState.class); + when(sourceState.getProp("extract.table.type", "SNAPSHOT_ONLY")).thenReturn("SNAPSHOT_ONLY"); + when(sourceState.getProp("extract.namespace", "")).thenReturn("test"); + when(sourceState.getProp("extract.table.name", "")).thenReturn("table1"); + when(sourceState.getProp("ms.derived.fields", new JsonArray().toString())).thenReturn("[{\"name\": \"activityDate\", \"formula\": {\"type\": \"epoc\", \"source\": \"fromDateTime\", \"format\": \"yyyy-MM-dd'T'HH:mm:ss'Z'\"}}]"); + when(sourceState.getProp("ms.output.schema", new JsonArray().toString())).thenReturn(""); + when(sourceState.getProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), "")).thenReturn(""); + MultistageSource source = new MultistageSource(); + source.getWorkunits(sourceState); + + Assert.assertEquals(source.getJobKeys().getDerivedFields().keySet().toString(), "[activityDate]"); + } + + @Test + public void testOutputSchema(){ + SourceState state = mock(SourceState.class); + when(state.getProp("ms.output.schema", new JsonArray().toString())).thenReturn(""); + when(state.getProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), "")).thenReturn(""); + MultistageSource source = new MultistageSource(); + source.getWorkunits(state); + Assert.assertEquals(0, source.getJobKeys().getOutputSchema().size()); + + // wrong format should be ignored + when(state.getProp("ms.output.schema", new JsonArray().toString())).thenReturn("{\"name\": \"responseTime\"}"); + source.getWorkunits(state); + Assert.assertEquals(0, source.getJobKeys().getOutputSchema().size()); + + // wrong format should be ignored + when(state.getProp("ms.output.schema", new JsonArray().toString())).thenReturn("[{\"name\": \"responseTime\"}]"); + source.getWorkunits(state); + Assert.assertEquals(1, source.getJobKeys().getOutputSchema().size()); + Assert.assertEquals(1, source.getJobKeys().getOutputSchema().size()); + } + + @Test + public void testSourceParameters(){ + SourceState sourceState = mock(SourceState.class); + when(sourceState.getProp(MultistageProperties.MSTAGE_OUTPUT_SCHEMA.getConfig(), "")).thenReturn(""); + MultistageSource source = new MultistageSource(); + source.getWorkunits(sourceState); + Assert.assertNotNull(source.getJobKeys().getSourceParameters()); + + when(sourceState.getProp("ms.parameters", new JsonArray().toString())).thenReturn("[{\"name\":\"cursor\",\"type\":\"session\"}]"); + source.getWorkunits(sourceState); + Assert.assertNotNull(source.getJobKeys().getSourceParameters()); + } + + @Test + public void testHadoopFsEncoding() { + String plain = "[watermark.system.1483257600000, watermark.activation.{\"s3key\":\"cc-index/collections/CC-MAIN-2019-43/indexes/cdx-00000.gz\"}]"; + String expected = "[watermark.system.1483257600000, watermark.activation.{\"s3key\":\"cc-index%2Fcollections%2FCC-MAIN-2019-43%2Findexes%2Fcdx-00000.gz\"}]"; + String encoded = EndecoUtils.getHadoopFsEncoded(plain); + Assert.assertEquals(encoded, expected); + } + + @Test + public void testUrlEncoding() { + String plain = "{a b}"; + String expected = "%7Ba+b%7D"; + String encoded = EndecoUtils.getEncodedUtf8(plain); + Assert.assertEquals(encoded, expected); + } + + @Test + public void testUnitWatermark(){ + SourceState state = new SourceState(); + state.setProp("ms.watermark", "[{\"name\": 
\"system\",\"type\": \"datetime\", \"range\": {\"from\": \"2020-01-01\", \"to\": \"2020-01-31\"}}, {\"name\": \"units\",\"type\": \"unit\", \"units\": \"id1,id2,id3\"}]"); + state.setProp("extract.table.type", "SNAPSHOT_ONLY"); + state.setProp("extract.namespace", "test"); + state.setProp("extract.table.name", "table1"); + state.setProp("ms.work.unit.partition", ""); + state.setProp("ms.pagination", "{}"); + MultistageSource source = new MultistageSource(); + Assert.assertEquals(source.getWorkunits(state).size(), 3); + } + + @Test + public void testIsSecondaryAuthenticationEnabledWithInvalidSecondaryInput() { + JobKeys jobKeys = Mockito.mock(JobKeys.class); + source.jobKeys = jobKeys; + JsonArray secondaryInput = gson.fromJson("[\"test_field\"]", JsonArray.class); + when(jobKeys.getSecondaryInputs()).thenReturn(secondaryInput); + Assert.assertFalse(source.jobKeys.getIsSecondaryAuthenticationEnabled()); + } + + @Test + public void testReadSecondaryAuthentication() { + JsonArray secondaryInput = gson.fromJson("[{\"fields\": [\"access_token\"], \"category\": \"authentication\"}]", JsonArray.class); + JobKeys jobKeys = Mockito.mock(JobKeys.class); + State state = Mockito.mock(State.class); + when(jobKeys.getSecondaryInputs()).thenReturn(secondaryInput); + source.jobKeys = jobKeys; + Assert.assertEquals(source.readSecondaryAuthentication(state, 1L).toString(), "{}"); + } + + @Test + public void testGetUpdatedWorkUnitActivation() { + WorkUnit workUnit = Mockito.mock(WorkUnit.class); + JsonObject authentication = gson.fromJson("{\"method\": \"basic\", \"encryption\": \"base64\", \"header\": \"Authorization\"}", JsonObject.class); + when(workUnit.getProp(MultistageProperties.MSTAGE_ACTIVATION_PROPERTY.toString(), StringUtils.EMPTY)).thenReturn(StringUtils.EMPTY); + Assert.assertEquals(source.getUpdatedWorkUnitActivation(workUnit, authentication), authentication.toString()); + } + + /** + * Test getExtractor when exception is thrown + */ + @Test(expectedExceptions = RuntimeException.class) + public void testGetExtractorWithException() { + WorkUnitState state = Mockito.mock(WorkUnitState.class); + source.getExtractor(state); + } + + /** + * Test generateWorkUnits when there are more than one DATETIME datetime type watermarks + * Expected: RuntimeException + */ + @Test(expectedExceptions = RuntimeException.class) + public void testGenerateWorkUnitsWithException1() { + testGenerateWorkUnitsHelper(WatermarkDefinition.WatermarkTypes.DATETIME); + } + + /** + * Test generateWorkUnits when there are more than one UNIT type watermarks + * Expected: RuntimeException + */ + @Test(expectedExceptions = RuntimeException.class) + public void testGenerateWorkUnitsWithException2() { + testGenerateWorkUnitsHelper(WatermarkDefinition.WatermarkTypes.UNIT); + } + + private void testGenerateWorkUnitsHelper(WatermarkDefinition.WatermarkTypes watermarkTypes) { + SourceState sourceState = Mockito.mock(SourceState.class); + source.sourceState = sourceState; + + WatermarkDefinition watermarkDefinition1 = Mockito.mock(WatermarkDefinition.class); + WatermarkDefinition watermarkDefinition2 = Mockito.mock(WatermarkDefinition.class); + when(watermarkDefinition1.getType()).thenReturn(watermarkTypes); + when(watermarkDefinition2.getType()).thenReturn(watermarkTypes); + List definitions = ImmutableList.of(watermarkDefinition1, watermarkDefinition2); + + Map previousHighWatermarks = new HashMap<>(); + source.generateWorkUnits(definitions, previousHighWatermarks); + } + + @Test + public void testCheckFullExtractState() throws 
Exception { + State state = Mockito.mock(State.class); + Map map = Mockito.mock(Map.class); + Method method = MultistageSource.class.getDeclaredMethod("checkFullExtractState", State.class, Map.class); + method.setAccessible(true); + when(state.getProp("extract.table.type", StringUtils.EMPTY)).thenReturn("APPEND_ONLY"); + when(map.isEmpty()).thenReturn(true); + Assert.assertTrue((Boolean) method.invoke(source, state, map)); + + when(map.isEmpty()).thenReturn(false); + Assert.assertFalse((Boolean) method.invoke(source, state, map)); + + when(state.getProp("ms.enable.dynamic.full.load", StringUtils.EMPTY)).thenReturn("true"); + Assert.assertFalse((Boolean) method.invoke(source, state, map)); + } + + @Test + public void testGetPreviousHighWatermarks() throws Exception { + SourceState sourceState = Mockito.mock(SourceState.class); + WorkUnitState workUnitState = Mockito.mock(WorkUnitState.class); + source.sourceState = sourceState; + + Map> previousWorkUnitStatesByDatasetUrns = new HashMap<>(); + previousWorkUnitStatesByDatasetUrns.put("ColumnName.Number", ImmutableList.of(workUnitState)); + when(workUnitState.getActualHighWatermark(LongWatermark.class)).thenReturn(new LongWatermark(1000L)); + when(sourceState.getPreviousWorkUnitStatesByDatasetUrns()).thenReturn(previousWorkUnitStatesByDatasetUrns); + + Method method = MultistageSource.class.getDeclaredMethod("getPreviousHighWatermarks"); + method.setAccessible(true); + Map actual = (Map) method.invoke(source); + Assert.assertEquals(actual.size(), 1); + Assert.assertEquals((long) actual.get("ColumnName.Number"), 1000L); + } + + /** + * Test normal cases + */ + @Test + public void testGetWorkUnitPartitionTypes() { + SourceState state = new SourceState(); + source = new MultistageSource(); + + state.setProp("ms.work.unit.partition", ""); + source.initialize(state); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.NONE); + + state.setProp("ms.work.unit.partition", "none"); + source.initialize(state); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.NONE); + + state.setProp("ms.work.unit.partition", "weekly"); + source.initialize(state); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.WEEKLY); + + state.setProp("ms.work.unit.partition", "monthly"); + source.initialize(state); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.MONTHLY); + + state.setProp("ms.work.unit.partition", "daily"); + source.initialize(state); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.DAILY); + + state.setProp("ms.work.unit.partition", "hourly"); + source.initialize(state); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.HOURLY); + + state.setProp("ms.work.unit.partition", "{\"none\": [\"2020-01-01\", \"2020-02-18\"]}"); + state.setProp("ms.work.unit.partial.partition", false); + source.initialize(state); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.COMPOSITE); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType().getRanges( + DateTime.parse("2020-01-01"), + DateTime.parse("2020-02-18"), + source.jobKeys.getIsPartialPartition()).size(), 1); + + state.setProp("ms.work.unit.partition", "{\"monthly\": [\"2020-01-01\", \"2020-02-18\"]}"); + state.setProp("ms.work.unit.partial.partition", false); + source.initialize(state); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), 
WorkUnitPartitionTypes.COMPOSITE); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType().getRanges( + DateTime.parse("2020-01-01"), + DateTime.parse("2020-02-18"), + source.jobKeys.getIsPartialPartition()).size(), 1); + + state.setProp("ms.work.unit.partition", "{\"monthly\": [\"2020-01-01\", \"2020-02-18\"]}"); + state.setProp("ms.work.unit.partial.partition", true); + source.initialize(state); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.COMPOSITE); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType().getRanges( + DateTime.parse("2020-01-01"), + DateTime.parse("2020-02-18"), + source.jobKeys.getIsPartialPartition()).size(), 2); + + state.setProp("ms.work.unit.partition", "{\"weekly\": [\"2020-01-01\", \"2020-02-01\"]}"); + state.setProp("ms.work.unit.partial.partition", true); + source.initialize(state); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.COMPOSITE); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType().getRanges( + DateTime.parse("2020-01-01"), + DateTime.parse("2020-02-01"), + source.jobKeys.getIsPartialPartition()).size(), 5); + + // this should gives out 3 ranges: 1/1 - 2/1, 2/1 - 2/2, 2/2 - 2/3 + state.setProp("ms.work.unit.partition", "{\"monthly\": [\"2020-01-01T00:00:00-00:00\", \"2020-02-01T00:00:00-00:00\"], \"daily\": [\"2020-02-01T00:00:00-00:00\", \"2020-02-03T00:00:00-00:00\"]}"); + state.setProp("ms.work.unit.partial.partition", false); + source.initialize(state); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.COMPOSITE); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType().getRanges( + DateTime.parse("2020-01-01T00:00:00-00:00").withZone(DateTimeZone.UTC), + DateTime.parse("2020-02-03T00:00:00-00:00").withZone(DateTimeZone.UTC), + source.jobKeys.getIsPartialPartition()).size(), 3); + + + // this should gives out 3 ranges: 1/1 - 2/1, 2/1 - 2/2, 2/2 - 2/3 + state.setProp("ms.work.unit.partition", "{\"none\": [\"2010-01-01T00:00:00-00:00\", \"2020-02-01T00:00:00-00:00\"], \"daily\": [\"2020-02-01T00:00:00-00:00\", \"2020-02-03T00:00:00-00:00\"]}"); + state.setProp("ms.work.unit.partial.partition", false); + source.initialize(state); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.COMPOSITE); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType().getRanges( + DateTime.parse("2010-01-01T00:00:00-00:00").withZone(DateTimeZone.UTC), + DateTime.parse("2020-02-03T00:00:00-00:00").withZone(DateTimeZone.UTC), + source.jobKeys.getIsPartialPartition()).size(), 3); + + state.setProp("ms.work.unit.partition", "{\"monthly\": [\"2020-01-01\", \"-\"]}"); + state.setProp("ms.work.unit.partial.partition", true); + source.initialize(state); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.COMPOSITE); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType().getRanges( + DateTime.now().monthOfYear().roundFloorCopy(), + DateTime.now().monthOfYear().roundCeilingCopy(), + source.jobKeys.getIsPartialPartition()).size(), 1); + } + + @Test + public void testGetWorkUnitRangesForYearlyWithPartialPartitioning() { + SourceState state = new SourceState(); + source = new MultistageSource(); + List> actualRanges; + List> expectedRanges; + state.setProp("ms.work.unit.partial.partition", true); + + // evenly partitioned, daily + // Expected: 2017, 2018, 2019, Each day of 2020 + state.setProp("ms.work.unit.partition", "{\"yearly\": 
[\"2017-01-01T00:00:00-00:00\", \"2020-01-01T00:00:00-00:00\"], " + + "\"daily\": [\"2020-01-01T00:00:00-00:00\", \"2020-02-03T00:00:00-00:00\"]}"); + source.initialize(state); + actualRanges = source.jobKeys.getWorkUnitPartitionType().getRanges( + DateTime.parse("2017-01-01T00:00:00-00:00").withZone(DateTimeZone.UTC), + DateTime.parse("2020-02-03T00:00:00-00:00").withZone(DateTimeZone.UTC), + source.jobKeys.getIsPartialPartition()); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.COMPOSITE); + Assert.assertEquals(actualRanges.size(), 36); + + // evenly partitioned, weekly + // Expected: 2017, 2018, 2019, first 5 weeks of 2020 + state.setProp("ms.work.unit.partition", "{\"yearly\": [\"2017-01-01T00:00:00-00:00\", \"2020-01-01T00:00:00-00:00\"], " + + "\"weekly\": [\"2020-01-01T00:00:00-00:00\", \"2020-02-03T00:00:00-00:00\"]}"); + source.initialize(state); + actualRanges = source.jobKeys.getWorkUnitPartitionType().getRanges( + DateTime.parse("2017-01-01T00:00:00-00:00").withZone(DateTimeZone.UTC), + DateTime.parse("2020-02-03T00:00:00-00:00").withZone(DateTimeZone.UTC), + source.jobKeys.getIsPartialPartition()); + expectedRanges = Arrays.asList(ImmutablePair.of(1483228800000l, 1514764800000l), + ImmutablePair.of(1514764800000l, 1546300800000l), + ImmutablePair.of(1546300800000l, 1577836800000l), + ImmutablePair.of(1577836800000l, 1578441600000l), + ImmutablePair.of(1578441600000l, 1579046400000l), + ImmutablePair.of(1579046400000l, 1579651200000l), + ImmutablePair.of(1579651200000l, 1580256000000l), + ImmutablePair.of(1580256000000l, 1580688000000l)); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.COMPOSITE); + Assert.assertEquals(actualRanges.size(), 8); + Assert.assertEquals(actualRanges, expectedRanges); + + // not evenly partitioned, daily + // Expected: 2017, 2018, 2019, Jan 2020, Each day of Feb 2020 + state.setProp("ms.work.unit.partition", "{\"yearly\": [\"2017-01-01T00:00:00-00:00\", \"2020-02-01T00:00:00-00:00\"], " + + "\"daily\": [\"2020-02-01T00:00:00-00:00\", \"2020-02-03T00:00:00-00:00\"]}"); + source.initialize(state); + actualRanges = source.jobKeys.getWorkUnitPartitionType().getRanges( + DateTime.parse("2017-01-01T00:00:00-00:00").withZone(DateTimeZone.UTC), + DateTime.parse("2020-02-03T00:00:00-00:00").withZone(DateTimeZone.UTC), + source.jobKeys.getIsPartialPartition()); + expectedRanges = Arrays.asList(ImmutablePair.of(1483228800000l, 1514764800000l), + ImmutablePair.of(1514764800000l, 1546300800000l), + ImmutablePair.of(1546300800000l, 1577836800000l), + ImmutablePair.of(1577836800000l, 1580515200000l), + ImmutablePair.of(1580515200000l, 1580601600000l), + ImmutablePair.of(1580601600000l, 1580688000000l)); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.COMPOSITE); + Assert.assertEquals(actualRanges.size(), 6); + Assert.assertEquals(actualRanges, expectedRanges); + + // not evenly partitioned, weekly + // config is provided out of order + // Expected: 2017, 2018, 2019, Jan 2020, First week of Feb 2020, second week of feb 2020 + state.setProp("ms.work.unit.partition", "{\"weekly\": [\"2020-02-01T00:00:00-00:00\", \"2020-02-11T00:00:00-00:00\"], " + + "\"yearly\": [\"2017-01-01T00:00:00-00:00\", \"2020-02-01T00:00:00-00:00\"]}"); + source.initialize(state); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.COMPOSITE); + actualRanges = source.jobKeys.getWorkUnitPartitionType().getRanges( + 
DateTime.parse("2017-01-01T00:00:00-00:00").withZone(DateTimeZone.UTC), + DateTime.parse("2020-02-11T00:00:00-00:00").withZone(DateTimeZone.UTC), + source.jobKeys.getIsPartialPartition()); + expectedRanges = Arrays.asList(ImmutablePair.of(1483228800000l, 1514764800000l), + ImmutablePair.of(1514764800000l, 1546300800000l), + ImmutablePair.of(1546300800000l, 1577836800000l), + ImmutablePair.of(1577836800000l, 1580515200000l), + ImmutablePair.of(1580515200000l, 1581120000000l), + ImmutablePair.of(1581120000000l, 1581379200000l)); + Assert.assertEquals(actualRanges.size(), 6); + Assert.assertEquals(actualRanges, expectedRanges); + } + + @Test + public void testGetWorkUnitRangesForYearlyWithoutPartialPartitioning() { + SourceState state = new SourceState(); + source = new MultistageSource(); + List> actualRanges; + List> expectedRanges; + state.setProp("ms.work.unit.partial.partition", false); + + // evenly partitioned, daily + // Expected: 2017, 2018, 2019, Each day of 2020 + state.setProp("ms.work.unit.partition", "{\"yearly\": [\"2017-01-01T00:00:00-00:00\", \"2020-01-01T00:00:00-00:00\"], " + + "\"daily\": [\"2020-01-01T00:00:00-00:00\", \"2020-02-03T00:00:00-00:00\"]}"); + source.initialize(state); + actualRanges = source.jobKeys.getWorkUnitPartitionType().getRanges( + DateTime.parse("2017-01-01T00:00:00-00:00").withZone(DateTimeZone.UTC), + DateTime.parse("2020-02-03T00:00:00-00:00").withZone(DateTimeZone.UTC), + source.jobKeys.getIsPartialPartition()); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.COMPOSITE); + Assert.assertEquals(actualRanges.size(), 36); + + // evenly partitioned, weekly + // Expected: 2017, 2018, 2019, first 4 weeks of 2020 + state.setProp("ms.work.unit.partition", "{\"yearly\": [\"2017-01-01T00:00:00-00:00\", \"2020-01-01T00:00:00-00:00\"], " + + "\"weekly\": [\"2020-01-01T00:00:00-00:00\", \"2020-02-03T00:00:00-00:00\"]}"); + source.initialize(state); + actualRanges = source.jobKeys.getWorkUnitPartitionType().getRanges( + DateTime.parse("2017-01-01T00:00:00-00:00").withZone(DateTimeZone.UTC), + DateTime.parse("2020-02-03T00:00:00-00:00").withZone(DateTimeZone.UTC), + source.jobKeys.getIsPartialPartition()); + expectedRanges = Arrays.asList(ImmutablePair.of(1483228800000l, 1514764800000l), + ImmutablePair.of(1514764800000l, 1546300800000l), + ImmutablePair.of(1546300800000l, 1577836800000l), + ImmutablePair.of(1577836800000l, 1578441600000l), + ImmutablePair.of(1578441600000l, 1579046400000l), + ImmutablePair.of(1579046400000l, 1579651200000l), + ImmutablePair.of(1579651200000l, 1580256000000l)); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.COMPOSITE); + Assert.assertEquals(actualRanges.size(), 7); + Assert.assertEquals(actualRanges, expectedRanges); + + // not evenly partitioned, daily + // Expected: 2017, 2018, 2019, Each day of Feb 2020 + state.setProp("ms.work.unit.partition", "{\"yearly\": [\"2017-01-01T00:00:00-00:00\", \"2020-02-01T00:00:00-00:00\"], " + + "\"daily\": [\"2020-02-01T00:00:00-00:00\", \"2020-02-03T00:00:00-00:00\"]}"); + source.initialize(state); + actualRanges = source.jobKeys.getWorkUnitPartitionType().getRanges( + DateTime.parse("2017-01-01T00:00:00-00:00").withZone(DateTimeZone.UTC), + DateTime.parse("2020-02-03T00:00:00-00:00").withZone(DateTimeZone.UTC), + source.jobKeys.getIsPartialPartition()); + expectedRanges = Arrays.asList(ImmutablePair.of(1483228800000l, 1514764800000l), + ImmutablePair.of(1514764800000l, 1546300800000l), + 
ImmutablePair.of(1546300800000l, 1577836800000l), + ImmutablePair.of(1580515200000l, 1580601600000l), + ImmutablePair.of(1580601600000l, 1580688000000l)); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.COMPOSITE); + Assert.assertEquals(actualRanges.size(), 5); + Assert.assertEquals(actualRanges, expectedRanges); + + // not evenly partitioned, weekly + // Expected: 2017, 2018, 2019, First week of Feb 2020 + state.setProp("ms.work.unit.partition", "{\"yearly\": [\"2017-01-01T00:00:00-00:00\", \"2020-02-01T00:00:00-00:00\"], " + + "\"weekly\": [\"2020-02-01T00:00:00-00:00\", \"2020-02-11T00:00:00-00:00\"]}"); + source.initialize(state); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.COMPOSITE); + actualRanges = source.jobKeys.getWorkUnitPartitionType().getRanges( + DateTime.parse("2017-01-01T00:00:00-00:00").withZone(DateTimeZone.UTC), + DateTime.parse("2020-02-1T00:00:00-00:00").withZone(DateTimeZone.UTC), + source.jobKeys.getIsPartialPartition()); + expectedRanges = Arrays.asList(ImmutablePair.of(1483228800000l, 1514764800000l), + ImmutablePair.of(1514764800000l, 1546300800000l), + ImmutablePair.of(1546300800000l, 1577836800000l), + ImmutablePair.of(1580515200000l, 1581120000000l)); + Assert.assertEquals(actualRanges.size(), 4); + Assert.assertEquals(actualRanges, expectedRanges); + } + + @Test + public void testGetWorkUnitRangesForYearlyWithOneSubRange() { + SourceState state = new SourceState(); + source = new MultistageSource(); + List> actualRanges; + List> expectedRanges; + + // Yearly without partial + // Expected: 3 - 2017, 2018, 2019 + state.setProp("ms.work.unit.partition", "{\"yearly\": [\"2017-01-01\", \"2020-02-18\"]}"); + state.setProp("ms.work.unit.partial.partition", false); + source.initialize(state); + actualRanges = source.jobKeys.getWorkUnitPartitionType().getRanges( + DateTime.parse("2017-01-01"), + DateTime.parse("2020-02-18"), + source.jobKeys.getIsPartialPartition()); + expectedRanges = Arrays.asList(ImmutablePair.of(1483257600000l, 1514793600000l), + ImmutablePair.of(1514793600000l, 1546329600000l), + ImmutablePair.of(1546329600000l, 1577865600000l)); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.COMPOSITE); + Assert.assertEquals(actualRanges.size(), 3); + Assert.assertEquals(actualRanges, expectedRanges); + + // Yearly with partial + // Expected: 3 - 2017, 2018, 2019, partial 2018 + state.setProp("ms.work.unit.partition", "{\"yearly\": [\"2017-01-01\", \"2020-02-18\"]}"); + state.setProp("ms.work.unit.partial.partition", true); + source.initialize(state); + actualRanges = source.jobKeys.getWorkUnitPartitionType().getRanges( + DateTime.parse("2017-01-01"), + DateTime.parse("2020-02-18"), + source.jobKeys.getIsPartialPartition()); + expectedRanges = Arrays.asList(ImmutablePair.of(1483257600000l, 1514793600000l), + ImmutablePair.of(1514793600000l, 1546329600000l), + ImmutablePair.of(1546329600000l, 1577865600000l), + ImmutablePair.of(1577865600000l, 1582012800000l)); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.COMPOSITE); + Assert.assertEquals(actualRanges.size(), 4); + Assert.assertEquals(actualRanges, expectedRanges); + } + + /** + * test incorrect Json format + */ + @Test + public void testGetWorkUnitPartitionTypesWithExceptions1() { + SourceState state = new SourceState(); + MultistageSource source = new MultistageSource(); + + state.setProp("ms.work.unit.partition", "{\"monthly\": 
\"2020-01-01\"]}"); + state.setProp("ms.work.unit.partial.partition", true); + source.initialize(state); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), null); + } + + /** + * test nonconforming property format + */ + @Test + public void testGetWorkUnitPartitionTypesWithExceptions2() { + SourceState state = new SourceState(); + MultistageSource source = new MultistageSource(); + + // in this case, the partition range is ignored, and there is no partitioning + state.setProp("ms.work.unit.partition", "{\"monthly\": [\"2020-01-01\"]}"); + state.setProp("ms.work.unit.partial.partition", false); + source.initialize(state); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.COMPOSITE); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType().getRanges( + DateTime.now().monthOfYear().roundFloorCopy(), + DateTime.now().monthOfYear().roundCeilingCopy(), + source.jobKeys.getIsPartialPartition()).size(), 0); + + // supposedly we wanted 5 weekly partitions, but the range end date time format is incorrect + // therefore it will not generate the number of partitions as wanted + state.setProp("ms.work.unit.partition", "{\"weekly\": [\"2020-01-01\", \"2020-02-1\"]}"); + state.setProp("ms.work.unit.partial.partition", true); + source.initialize(state); + Assert.assertEquals(source.jobKeys.getWorkUnitPartitionType(), WorkUnitPartitionTypes.COMPOSITE); + Assert.assertNotEquals(source.jobKeys.getWorkUnitPartitionType().getRanges( + DateTime.parse("2001-01-01"), + DateTime.parse("2090-01-01"), + source.jobKeys.getIsPartialPartition()).size(), 5); + } + + @Test + public void testPassingSchema2WorkUnits() { + SourceState state = new SourceState(); + String urn = "urn:li:dataset:(urn:li:dataPlatform:hive,rightnow.incidents,PROD)"; + JsonArray sampleSchema = gson.fromJson( + "[{\"columnName\":\"column1\",\"isNullable\":true,\"dataType\":{\"type\":\"timestamp\"}}]", + JsonArray.class); + + state.setProp(MultistageProperties.MSTAGE_SOURCE_SCHEMA_URN.toString(), urn); + MultistageSource source = new MultistageSource<>(); + + SchemaReader mockFactory = mock(SchemaReader.class); + when(mockFactory.read(Matchers.any(), Matchers.any())).thenReturn(sampleSchema); + + source.setSourceState(state); + source.jobKeys.setSchemaReader(mockFactory); + source.jobKeys.initialize(state); + Assert.assertTrue(source.jobKeys.hasOutputSchema()); + Assert.assertNotNull(source.generateWorkUnits(new ArrayList<>(), new HashMap<>())); + } + +} diff --git a/dil/src/test/java/com/linkedin/dil/source/S3SourceV2Test.java b/dil/src/test/java/com/linkedin/dil/source/S3SourceV2Test.java new file mode 100644 index 0000000..3215dd1 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/source/S3SourceV2Test.java @@ -0,0 +1,39 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.source; + +import com.google.gson.JsonObject; +import java.io.UnsupportedEncodingException; +import org.apache.gobblin.configuration.WorkUnitState; +import org.testng.Assert; +import org.testng.annotations.Test; +import software.amazon.awssdk.regions.Region; + +@Test +public class S3SourceV2Test { + /** + * This test depends on a publicly available common crawl file. That's why it is disabled by default. 
+ * + * @throws UnsupportedEncodingException + */ + @Test (enabled = false) + public void testInitialization() throws UnsupportedEncodingException { + S3SourceV2 source = new S3SourceV2(); + WorkUnitState state = new WorkUnitState(); + state.setProp("ms.source.uri", "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-43/cc-index.paths.gz"); + state.setProp("ms.extractor.class", "com.linkedin.dil.extractor.CsvExtractor"); + JsonObject params = new JsonObject(); + params.addProperty("region", "us-east-1"); + params.addProperty("connection_timeout", 30); + state.setProp("ms.source.s3.parameters", params); + source.getExtractor(state); + Assert.assertEquals(source.getS3SourceV2Keys().getBucket(), "commoncrawl"); + Assert.assertEquals(source.getS3SourceV2Keys().getRegion().id(), Region.US_EAST_1.id()); + Assert.assertEquals(source.getS3SourceV2Keys().getEndpoint(), "https://s3.amazonaws.com"); + Assert.assertEquals(source.getS3SourceV2Keys().getPrefix(), "crawl-data/CC-MAIN-2019-43/cc-index.paths.gz"); + Assert.assertEquals(source.getS3SourceV2Keys().getMaxKeys(), new Integer(1000)); + Assert.assertEquals(source.getS3SourceV2Keys().getConnectionTimeout(), new Integer(30)); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/util/AvroSchemaUtilsTest.java b/dil/src/test/java/com/linkedin/dil/util/AvroSchemaUtilsTest.java new file mode 100644 index 0000000..a3ff158 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/util/AvroSchemaUtilsTest.java @@ -0,0 +1,68 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import java.util.List; +import lombok.SneakyThrows; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.gobblin.configuration.WorkUnitState; +import org.apache.gobblin.converter.avro.UnsupportedDateTypeException; +import org.apache.gobblin.source.workunit.Extract; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static org.mockito.Mockito.*; + + +public class AvroSchemaUtilsTest { + WorkUnitState state; + String schemaString = "[{\"columnName\":\"asIs\",\"isNullable\":\"false\",\"dataType\":{\"type\":\"string\"}}," + + "{\"columnName\":\"normalized\",\"isNullable\":\"false\",\"dataType\":{\"type\":\"string\"}}]"; + JsonArray schemaArray = new Gson().fromJson(schemaString, JsonArray.class); + Schema schema; + + @SneakyThrows + @BeforeMethod + public void beforeMethod() { + state = mock(WorkUnitState.class); + Extract extract = new Extract(Extract.TableType.SNAPSHOT_ONLY, "com.linkedin.test", "test"); + when(state.getExtract()).thenReturn(extract); + schema = AvroSchemaUtils.fromJsonSchema(schemaArray, state); + } + + @Test + public void testFromJsonSchema() throws UnsupportedDateTypeException { + List fields = schema.getFields(); + Assert.assertEquals(fields.size(), 2); + Assert.assertEquals(fields.get(0).name(), "asIs"); + Assert.assertEquals(fields.get(1).name(), "normalized"); + } + + @Test + public void testGetSchemaFieldNames() throws UnsupportedDateTypeException { + List fieldNames = AvroSchemaUtils.getSchemaFieldNames(schema); + Assert.assertEquals(fieldNames.size(), 2); + Assert.assertEquals(fieldNames.get(0), "asIs"); + Assert.assertEquals(fieldNames.get(1), "normalized"); + } + + @Test + public void testDeepCopySchemaField() { + 
Schema.Field originalField = schema.getField("asIs"); + Schema.Field copiedField = AvroSchemaUtils.deepCopySchemaField(originalField); + Assert.assertEquals(originalField, copiedField); + } + + @Test + public void testCreateEOF() { + GenericRecord row = AvroSchemaUtils.createEOF(state); + Assert.assertEquals(row.getSchema().getFields().size(), 1); + Assert.assertEquals(row.get("EOF"), "EOF"); + } +} \ No newline at end of file diff --git a/dil/src/test/java/com/linkedin/dil/util/DateTimeUtilsTest.java b/dil/src/test/java/com/linkedin/dil/util/DateTimeUtilsTest.java new file mode 100644 index 0000000..38021b3 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/util/DateTimeUtilsTest.java @@ -0,0 +1,104 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import org.joda.time.DateTimeZone; +import org.testng.Assert; +import org.testng.annotations.Test; + + +@Test +public class DateTimeUtilsTest { + @Test + public void testParser() { + Assert.assertEquals(DateTimeUtils.parse("2020-01-01").toString("yyyy-MM-dd"), "2020-01-01"); + Assert.assertEquals(DateTimeUtils.parse("2020-01-01 11:11:11").toString("yyyy-MM-dd HH:mm:ss"), + "2020-01-01 11:11:11"); + Assert.assertEquals(DateTimeUtils.parse("2020-01-01T11:11:11").toString("yyyy-MM-dd'T'HH:mm:ss"), + "2020-01-01T11:11:11"); + Assert.assertEquals(DateTimeUtils.parse("2020-01-01T11:11:11.9").toString("yyyy-MM-dd'T'HH:mm:ss.SSS"), + "2020-01-01T11:11:11.900"); + Assert.assertEquals(DateTimeUtils.parse("2020-01-01T11:11:11.99").toString("yyyy-MM-dd'T'HH:mm:ss.SSS"), + "2020-01-01T11:11:11.990"); + Assert.assertEquals(DateTimeUtils.parse("2020-01-01T11:11:11.999").toString("yyyy-MM-dd'T'HH:mm:ss.SSS"), + "2020-01-01T11:11:11.999"); + // test microseconds truncation as Joda supports only milliseconds + Assert.assertEquals(DateTimeUtils.parse("2020-01-01T11:11:11.9999").toString("yyyy-MM-dd'T'HH:mm:ss.SSS"), + "2020-01-01T11:11:11.999"); + Assert.assertEquals(DateTimeUtils.parse("2020-01-01T11:11:11.99999").toString("yyyy-MM-dd'T'HH:mm:ss.SSS"), + "2020-01-01T11:11:11.999"); + Assert.assertEquals(DateTimeUtils.parse("2020-01-01T11:11:11.999999").toString("yyyy-MM-dd'T'HH:mm:ss.SSS"), + "2020-01-01T11:11:11.999"); + + Assert.assertEquals(DateTimeUtils.parse("2020-01-01 10:00:00-07:00") + .withZone(DateTimeZone.UTC) + .toString("yyyy-MM-dd'T'HH:mm:ss.SSSZ"), "2020-01-01T17:00:00.000+0000"); + Assert.assertEquals(DateTimeUtils.parse("2020-01-01 10:00:00.000-0700") + .withZone(DateTimeZone.UTC) + .toString("yyyy-MM-dd'T'HH:mm:ss.SSSZ"), "2020-01-01T17:00:00.000+0000"); + Assert.assertEquals(DateTimeUtils.parse("2020-01-01 10:00:00.000-07:00") + .withZone(DateTimeZone.UTC) + .toString("yyyy-MM-dd'T'HH:mm:ss.SSSZ"), "2020-01-01T17:00:00.000+0000"); + + Assert.assertEquals(DateTimeUtils.parse("2020-01-01T10:00:00-0700") + .withZone(DateTimeZone.UTC) + .toString("yyyy-MM-dd'T'HH:mm:ss.SSSZ"), "2020-01-01T17:00:00.000+0000"); + Assert.assertEquals(DateTimeUtils.parse("2020-01-01T10:00:00.000-0700") + .withZone(DateTimeZone.UTC) + .toString("yyyy-MM-dd'T'HH:mm:ss.SSSZ"), "2020-01-01T17:00:00.000+0000"); + Assert.assertEquals(DateTimeUtils.parse("2020-01-01T10:00:00.000-07:00") + .withZone(DateTimeZone.UTC) + .toString("yyyy-MM-dd'T'HH:mm:ss.SSSZ"), "2020-01-01T17:00:00.000+0000"); + + Assert.assertEquals(DateTimeUtils.parse("2020-01-01 10:00:00PST").getZone().getID(), "America/Los_Angeles"); + 
Assert.assertEquals( + DateTimeUtils.parse("2020-01-01 10:00:00PST").withZone(DateTimeZone.UTC).toString("yyyy-MM-dd'T'HH:mm:ss.SSSZ"), + "2020-01-01T18:00:00.000+0000"); + + // time will be truncated in this case because Joda doesn't process long form timezone name well + Assert.assertEquals(DateTimeUtils.parse("2020-01-01 10:00:00America/Los_Angeles") + .withZone(DateTimeZone.UTC) + .toString("yyyy-MM-dd'T'HH:mm:ss.SSSZ"), "2020-01-01T08:00:00.000+0000"); + + // time will be truncated in this case because of unrecognizable format + Assert.assertEquals( + DateTimeUtils.parse("2020-01-01 10:00").withZone(DateTimeZone.UTC).toString("yyyy-MM-dd'T'HH:mm:ss.SSSZ"), + "2020-01-01T08:00:00.000+0000"); + } + + /** + * Timezone: "America/Los_Angeles will be used if the timezone parameter is empty + */ + @Test + public void parse_emptyTimezone_defaultTimezone() { + Assert.assertEquals( + DateTimeUtils.parse("2020-01-01 10:00", "").withZone(DateTimeZone.UTC).toString("yyyy-MM-dd'T'HH:mm:ss.SSSZ"), + "2020-01-01T08:00:00.000+0000"); + + Assert.assertEquals(DateTimeUtils.parse("2020-01-01 10:00:30", "") + .withZone(DateTimeZone.UTC) + .toString("yyyy-MM-dd'T'HH:mm:ss.SSSZ"), "2020-01-01T18:00:30.000+0000"); + + Assert.assertEquals(DateTimeUtils.parse("2020-01-01 10:00:00-07:00", "") + .withZone(DateTimeZone.UTC) + .toString("yyyy-MM-dd'T'HH:mm:ss.SSSZ"), "2020-01-01T17:00:00.000+0000"); + } + + /** + * Exception is expected when dtString is null + */ + @Test(expectedExceptions = NullPointerException.class) + public void parse_nulldtString_illegalArgumentException() { + DateTimeUtils.parse(null, ""); + } + + /** + * Exception is expected when timezone is null + */ + @Test(expectedExceptions = NullPointerException.class) + public void parse_nullTimezone_illegalArgumentException() { + DateTimeUtils.parse("", null); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/util/EncryptionUtilsTest.java b/dil/src/test/java/com/linkedin/dil/util/EncryptionUtilsTest.java new file mode 100644 index 0000000..ef9b479 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/util/EncryptionUtilsTest.java @@ -0,0 +1,66 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.util; + +import com.google.gson.JsonObject; +import gobblin.configuration.SourceState; +import org.apache.gobblin.codec.StreamCodec; +import com.linkedin.dil.configuration.MultistageProperties; +import org.apache.gobblin.password.PasswordManager; +import org.mockito.Mock; +import org.powermock.api.mockito.PowerMockito; +import org.powermock.core.classloader.annotations.PrepareForTest; +import org.powermock.modules.testng.PowerMockTestCase; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static org.mockito.Mockito.*; + + +@PrepareForTest({PasswordManager.class}) +public class EncryptionUtilsTest extends PowerMockTestCase { + private final static String PLAIN_PASSWORD = "password"; + private final static String ENC_PASSWORD = "ENC(M6nV+j0lhqZ36RgvuF5TQMyNvBtXmkPl)"; + private SourceState state; + @Mock + private PasswordManager passwordManager; + + @BeforeMethod + public void setUp() { + String masterKeyLoc = this.getClass().getResource("/key/master.key").toString(); + state = new SourceState(); + state.setProp(MultistageProperties.ENCRYPT_KEY_LOC.toString(), masterKeyLoc); + PowerMockito.mockStatic(PasswordManager.class); + PowerMockito.when(PasswordManager.getInstance(state)).thenReturn(passwordManager); + } + + @Test + void testDecryption() { + when(passwordManager.readPassword(ENC_PASSWORD)).thenReturn(PLAIN_PASSWORD); + Assert.assertEquals(EncryptionUtils.decryptGobblin(ENC_PASSWORD, state), PLAIN_PASSWORD); + Assert.assertEquals(EncryptionUtils.decryptGobblin(PLAIN_PASSWORD, state), PLAIN_PASSWORD); + } + + @Test + void testEncryption() { + when(passwordManager.encryptPassword(PLAIN_PASSWORD)).thenReturn(ENC_PASSWORD); + when(passwordManager.readPassword(ENC_PASSWORD)).thenReturn(PLAIN_PASSWORD); + Assert.assertEquals(EncryptionUtils.decryptGobblin(EncryptionUtils.encryptGobblin(PLAIN_PASSWORD, state), state), + PLAIN_PASSWORD); + + when(passwordManager.encryptPassword(ENC_PASSWORD)).thenReturn(ENC_PASSWORD); + Assert.assertEquals(EncryptionUtils.decryptGobblin(EncryptionUtils.encryptGobblin(ENC_PASSWORD, state), state), + PLAIN_PASSWORD); + } + + @Test + void testGetGpgCodec() { + JsonObject parameters = new JsonObject(); + parameters.addProperty("cipher", "AES_256"); + parameters.addProperty("keystore_path","/tmp/public.key"); + Assert.assertTrue(EncryptionUtils.getGpgCodec(parameters) instanceof StreamCodec); + } +} \ No newline at end of file diff --git a/dil/src/test/java/com/linkedin/dil/util/HttpRequestMethodTest.java b/dil/src/test/java/com/linkedin/dil/util/HttpRequestMethodTest.java new file mode 100644 index 0000000..bbe33e3 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/util/HttpRequestMethodTest.java @@ -0,0 +1,174 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.util; + +import com.google.common.base.Strings; +import com.google.common.collect.ImmutableMap; +import com.google.gson.Gson; +import com.google.gson.JsonObject; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; +import okhttp3.HttpUrl; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.http.client.methods.HttpDelete; +import org.apache.http.client.methods.HttpPost; +import org.apache.http.client.methods.HttpPut; +import org.apache.http.client.methods.HttpUriRequest; +import org.powermock.api.mockito.PowerMockito; +import org.powermock.core.classloader.annotations.PrepareForTest; +import org.powermock.modules.testng.PowerMockTestCase; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static org.powermock.api.mockito.PowerMockito.*; + + +/** + * Unit test for {@link HttpRequestMethod} + * @author chrli + * + */ + +@Test +@PrepareForTest({HttpUrl.class, VariableUtils.class}) +public class HttpRequestMethodTest extends PowerMockTestCase { + + final static String FROM_DATETIME = "2017-01-02T00:00:00-0800"; + final static String TO_DATETIME = "2019-10-25T15:00:00-0700"; + final static String HTTP_POST_FIX = "HTTP/1.1"; + final static String VERSION_2 = "v2"; + final static String CONTENT_TYPE = "Content-Type"; + final static String CONTENT_TYPE_VALUE = "application/x-www-form-urlencoded"; + final static String BASE_URI = "https://domain/%s/calls"; + private static Gson gson = new Gson(); + private Map headers; + private String expected; + private JsonObject parameters; + + static JsonObject generateParameterString(String fromDateTime, String toDateTime, String version) { + String parameterString = String.format("{\"fromDateTime\":\"%s\",\"toDateTime\":\"%s\"}", fromDateTime, toDateTime); + if (!Strings.isNullOrEmpty(version)) { + parameterString = + String.format("{\"fromDateTime\":\"%s\",\"toDateTime\":\"%s\",\"version\":\"%s\"}", fromDateTime, toDateTime, version); + } + return gson.fromJson(parameterString, JsonObject.class); + } + + @BeforeMethod + public void setUp() { + headers = new HashMap<>(); + } + + /** + * Test HttpGet method with parameters + * @throws UnsupportedEncodingException + */ + @Test + public void testGetHttpGetRequest() throws UnsupportedEncodingException { + expected = String.format( + "%s %s %s", + "GET", + String.format("%s?fromDateTime=%s&toDateTime=%s", String.format(BASE_URI, VERSION_2), + URLEncoder.encode(FROM_DATETIME, StandardCharsets.UTF_8.toString()), + URLEncoder.encode(TO_DATETIME, StandardCharsets.UTF_8.toString())), + HTTP_POST_FIX); + parameters = generateParameterString(FROM_DATETIME, TO_DATETIME, VERSION_2); + HttpUriRequest getRequest = HttpRequestMethod.GET.getHttpRequest(String.format(BASE_URI, "{{version}}"), parameters, headers); + Assert.assertEquals(getRequest.toString(), expected); + + addContentType(); + getRequest = HttpRequestMethod.GET.getHttpRequest(String.format(BASE_URI, "{{version}}"), parameters, headers); + Assert.assertEquals(getRequest.toString(), expected); + } + + /** + * Test HttpDelete method + * @throws UnsupportedEncodingException + */ + @Test + public void testGetHttpDeleteRequest() throws IOException { + String expected = String.format("%s %s %s", "DELETE", String.format(BASE_URI, VERSION_2), HTTP_POST_FIX); + parameters = 
generateParameterString(FROM_DATETIME, TO_DATETIME, ""); + + HttpDelete deleteRequest = (HttpDelete) HttpRequestMethod.DELETE.getHttpRequest(String.format(BASE_URI, VERSION_2), parameters, headers); + Assert.assertEquals(deleteRequest.toString(), expected); + + addContentType(); + deleteRequest = (HttpDelete) HttpRequestMethod.DELETE.getHttpRequest(String.format(BASE_URI, VERSION_2), parameters, headers); + Assert.assertEquals(deleteRequest.toString(), expected); + } + + /** + * Test HttpPost method with parameters + * @throws IOException + */ + @Test + public void testGetHttpPostRequest() throws IOException { + expected = String.format("%s %s %s", "POST", String.format(BASE_URI, VERSION_2), HTTP_POST_FIX); + parameters = generateParameterString(FROM_DATETIME, TO_DATETIME, ""); + HttpPost postRequest = (HttpPost) HttpRequestMethod.POST.getHttpRequest(String.format(BASE_URI, VERSION_2), parameters, headers); + Assert.assertEquals(expected, postRequest.toString()); + Assert.assertEquals(parameters.toString(), IOUtils.toString(postRequest.getEntity().getContent(), StandardCharsets.UTF_8)); + + addContentType(); + postRequest = (HttpPost) HttpRequestMethod.POST.getHttpRequest(String.format(BASE_URI, VERSION_2), parameters, headers); + Assert.assertEquals(postRequest.toString(), expected); + } + + /** + * Test HttpPut method with parameters + * @throws IOException + */ + @Test + public void testGetHttpPutRequest() throws IOException { + expected = String.format("%s %s %s", "PUT", String.format(BASE_URI, VERSION_2), HTTP_POST_FIX); + parameters = generateParameterString(FROM_DATETIME, TO_DATETIME, ""); + + HttpPut putRequest = (HttpPut) HttpRequestMethod.PUT.getHttpRequest(String.format(BASE_URI, VERSION_2), parameters, headers); + Assert.assertEquals(expected, putRequest.toString()); + Assert.assertEquals(parameters.toString(), IOUtils.toString(putRequest.getEntity().getContent(), StandardCharsets.UTF_8)); + + addContentType(); + putRequest = (HttpPut) HttpRequestMethod.PUT.getHttpRequest(String.format(BASE_URI, VERSION_2), parameters, headers); + Assert.assertEquals(expected, putRequest.toString()); + } + + /** + * Test getHttpRequest + */ + @Test + public void testGetHttpRequest() throws IOException { + PowerMockito.mockStatic(VariableUtils.class); + String uri = String.format(BASE_URI, VERSION_2); + Map headers = ImmutableMap.of(CONTENT_TYPE, CONTENT_TYPE_VALUE); + String expected = String.format("%s %s %s", "POST", uri, HTTP_POST_FIX); + JsonObject parameters = HttpRequestMethodTest.generateParameterString(FROM_DATETIME, TO_DATETIME, ""); + when(VariableUtils.replaceWithTracking(uri, parameters, true)).thenReturn(new ImmutablePair<>(uri, parameters)); + when(VariableUtils.replaceWithTracking(headers.get(CONTENT_TYPE), parameters)).thenReturn(new ImmutablePair<>(CONTENT_TYPE, parameters)); + Assert.assertEquals(HttpRequestMethod.POST.getHttpRequest(String.format(BASE_URI, VERSION_2), parameters, headers).toString(), expected); + } + + /** + * Test appendParameters with null uri + * Expected: null + */ + @Test + public void testAppendParametersWithNullUri() { + PowerMockito.mockStatic(HttpUrl.class); + when(HttpUrl.parse(null)).thenReturn(null); + Assert.assertEquals(HttpRequestMethod.PUT.appendParameters(null, null), null); + } + + private void addContentType() { + headers.clear(); + headers.put(CONTENT_TYPE, CONTENT_TYPE_VALUE); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/util/InputStreamUtilsTest.java b/dil/src/test/java/com/linkedin/dil/util/InputStreamUtilsTest.java new file 
mode 100644 index 0000000..6db1d2c --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/util/InputStreamUtilsTest.java @@ -0,0 +1,26 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import org.testng.Assert; +import org.testng.annotations.Test; + + +@Test +public class InputStreamUtilsTest { + @Test + public void testConvertListToInputStream() throws IOException { + InputStream stream = InputStreamUtils.convertListToInputStream(Arrays.asList("a", "b", "c")); + String data = InputStreamUtils.extractText(stream); + Assert.assertEquals(data, "a\nb\nc"); + + Assert.assertNull(InputStreamUtils.convertListToInputStream(null)); + Assert.assertNull(InputStreamUtils.convertListToInputStream(new ArrayList<>())); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/util/JdbcUtilsTest.java b/dil/src/test/java/com/linkedin/dil/util/JdbcUtilsTest.java new file mode 100644 index 0000000..35bfe46 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/util/JdbcUtilsTest.java @@ -0,0 +1,214 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import com.sun.rowset.JdbcRowSetImpl; +import java.sql.SQLException; +import java.sql.Types; +import javax.sql.rowset.RowSetMetaDataImpl; +import javax.sql.rowset.serial.SerialBlob; +import javax.sql.rowset.serial.SerialClob; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.lang.StringUtils; +import org.mockito.Mock; +import org.powermock.api.mockito.PowerMockito; +import org.powermock.core.classloader.annotations.PrepareForTest; +import org.powermock.modules.testng.PowerMockTestCase; +import org.testng.Assert; +import org.testng.annotations.Test; + +import static org.mockito.Mockito.*; + + +/** + * Test following functions of JdbcUtils + * 1. Column Type conversion from java.sql.Types to JsonElementTypes + * 2. 
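Column value extraction as string (parseColumnAsString)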
+ */ + +@PrepareForTest({Base64.class}) +public class JdbcUtilsTest extends PowerMockTestCase { + + @Mock + private SerialBlob blob; + @Mock + private SerialClob clob; + @Mock + private JdbcRowSetImpl rowSet; + @Mock + private RowSetMetaDataImpl rowSetMetaData; + + /** + * Test none nullable column type conversion per following rules + * Types.BOOLEAN, JsonElementTypes.BOOLEAN + * Types.DATE, JsonElementTypes.TIMESTAMP + * Types.TIMESTAMP, JsonElementTypes.TIMESTAMP + * Types.TIMESTAMP_WITH_TIMEZONE, JsonElementTypes.TIMESTAMP + * Types.TIME, JsonElementTypes.TIME + * Types.TIME_WITH_TIMEZONE, JsonElementTypes.TIME + * Types.TINYINT, JsonElementTypes.INT + * Types.SMALLINT, JsonElementTypes.INT + * Types.INTEGER, JsonElementTypes.INT + * Types.BIGINT, JsonElementTypes.LONG + * Types.DECIMAL, JsonElementTypes.DOUBLE + * Types.DOUBLE, JsonElementTypes.DOUBLE + * Types.FLOAT, JsonElementTypes.DOUBLE + * Types.REAL, JsonElementTypes.DOUBLE + * Types.NUMERIC, JsonElementTypes.DOUBLE + * Types.STRUCT, JsonElementTypes.RECORD + * Types.ARRAY, JsonElementTypes.ARRAY + * Everything else, JsonElementTypes.STRING + */ + @Test + public void testParseColumnTypeNotNull() { + Assert.assertEquals(JdbcUtils.parseColumnType(Types.BOOLEAN), JsonElementTypes.BOOLEAN); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.DATE), JsonElementTypes.TIMESTAMP); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.TIMESTAMP), JsonElementTypes.TIMESTAMP); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.TIMESTAMP_WITH_TIMEZONE), JsonElementTypes.TIMESTAMP); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.TIME_WITH_TIMEZONE), JsonElementTypes.TIME); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.TINYINT), JsonElementTypes.INT); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.SMALLINT), JsonElementTypes.INT); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.INTEGER), JsonElementTypes.INT); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.BIGINT), JsonElementTypes.LONG); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.DECIMAL), JsonElementTypes.DOUBLE); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.DOUBLE), JsonElementTypes.DOUBLE); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.FLOAT), JsonElementTypes.DOUBLE); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.NUMERIC), JsonElementTypes.DOUBLE); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.STRUCT), JsonElementTypes.RECORD); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.ARRAY), JsonElementTypes.ARRAY); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.CHAR), JsonElementTypes.STRING); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.NCHAR), JsonElementTypes.STRING); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.VARCHAR), JsonElementTypes.STRING); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.NVARCHAR), JsonElementTypes.STRING); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.LONGVARCHAR), JsonElementTypes.STRING); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.LONGNVARCHAR), JsonElementTypes.STRING); + } + + /** + * Test nullable column type conversion per following rules + * Types.BOOLEAN, JsonElementTypes.NULLABLEBOOLEAN + * Types.DATE, JsonElementTypes.NULLABLETIMESTAMP + * Types.TIMESTAMP, JsonElementTypes.NULLABLETIMESTAMP + * Types.TIMESTAMP_WITH_TIMEZONE, JsonElementTypes.NULLABLETIMESTAMP + * Types.TIME, JsonElementTypes.NULLABLETIME + * Types.TIME_WITH_TIMEZONE, JsonElementTypes.NULLABLETIME + * Types.TINYINT, JsonElementTypes.NULLABLEINT + * 
Types.SMALLINT, JsonElementTypes.NULLABLEINT + * Types.INTEGER, JsonElementTypes.NULLABLEINT + * Types.BIGINT, JsonElementTypes.NULLABLELONG + * Types.DECIMAL, JsonElementTypes.NULLABLEDOUBLE + * Types.DOUBLE, JsonElementTypes.NULLABLEDOUBLE + * Types.FLOAT, JsonElementTypes.NULLABLEDOUBLE + * Types.REAL, JsonElementTypes.NULLABLEDOUBLE + * Types.NUMERIC, JsonElementTypes.NULLABLEDOUBLE + * Types.STRUCT, JsonElementTypes.NULLABLERECORD + * Types.ARRAY, JsonElementTypes.NULLABLEARRAY + * Everything else, JsonElementTypes.NULLABLESTRING + */ + @Test + public void testParseColumnTypeNullable() { + Assert.assertEquals(JdbcUtils.parseColumnType(Types.BOOLEAN, true), JsonElementTypes.NULLABLEBOOLEAN); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.DATE, true), JsonElementTypes.NULLABLETIMESTAMP); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.TIMESTAMP, true), JsonElementTypes.NULLABLETIMESTAMP); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.TIMESTAMP_WITH_TIMEZONE, true), JsonElementTypes.NULLABLETIMESTAMP); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.TIME_WITH_TIMEZONE, true), JsonElementTypes.NULLABLETIME); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.TINYINT, true), JsonElementTypes.NULLABLEINT); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.SMALLINT, true), JsonElementTypes.NULLABLEINT); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.INTEGER, true), JsonElementTypes.NULLABLEINT); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.BIGINT, true), JsonElementTypes.NULLABLELONG); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.DECIMAL, true), JsonElementTypes.NULLABLEDOUBLE); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.DOUBLE, true), JsonElementTypes.NULLABLEDOUBLE); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.FLOAT, true), JsonElementTypes.NULLABLEDOUBLE); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.NUMERIC, true), JsonElementTypes.NULLABLEDOUBLE); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.STRUCT, true), JsonElementTypes.NULLABLERECORD); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.ARRAY, true), JsonElementTypes.NULLABLEARRAY); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.CHAR, true), JsonElementTypes.NULLABLESTRING); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.NCHAR, true), JsonElementTypes.NULLABLESTRING); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.VARCHAR, true), JsonElementTypes.NULLABLESTRING); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.NVARCHAR, true), JsonElementTypes.NULLABLESTRING); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.LONGVARCHAR, true), JsonElementTypes.NULLABLESTRING); + Assert.assertEquals(JdbcUtils.parseColumnType(Types.LONGNVARCHAR, true), JsonElementTypes.NULLABLESTRING); + } + + @Test + public void testParseColumnAsString() throws SQLException { + + int row = 6; + when(rowSetMetaData.getColumnType(row)).thenReturn(Types.BINARY); + when(rowSet.getBlob(row)).thenReturn(null); + Assert.assertEquals(JdbcUtils.parseColumnAsString(rowSet, rowSetMetaData, row), StringUtils.EMPTY); + + when(rowSetMetaData.getColumnType(row)).thenReturn(Types.CLOB); + Assert.assertEquals(JdbcUtils.parseColumnAsString(rowSet, rowSetMetaData, row), StringUtils.EMPTY); + + when(rowSetMetaData.getColumnType(row)).thenReturn(Types.BIT); + when(rowSet.getBoolean(row)).thenReturn(false); + when(rowSet.wasNull()).thenReturn(false); + Assert.assertEquals(JdbcUtils.parseColumnAsString(rowSet, rowSetMetaData, row), Boolean.toString(false)); + 
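+    // Note: the same BIT column is stubbed again below, this time with wasNull() returning true;
+    // in that case parseColumnAsString is expected to yield null rather than "false".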
+ when(rowSetMetaData.getColumnType(row)).thenReturn(Types.BIT); + when(rowSet.getBoolean(row)).thenReturn(false); + when(rowSet.wasNull()).thenReturn(true); + Assert.assertEquals(JdbcUtils.parseColumnAsString(rowSet, rowSetMetaData, row), null); + + when(rowSetMetaData.getColumnType(row)).thenReturn(Types.BIT); + when(rowSet.getBoolean(row)).thenReturn(true); + when(rowSet.wasNull()).thenReturn(false); + Assert.assertEquals(JdbcUtils.parseColumnAsString(rowSet, rowSetMetaData, row), Boolean.toString(true)); + + when(rowSetMetaData.getColumnType(row)).thenReturn(Types.BOOLEAN); + when(rowSet.getBoolean(row)).thenReturn(true); + Assert.assertEquals(JdbcUtils.parseColumnAsString(rowSet, rowSetMetaData, row), Boolean.toString(true)); + + when(rowSetMetaData.getColumnType(row)).thenReturn(Types.DATE); + when(rowSet.getString(row)).thenReturn(null); + Assert.assertEquals(JdbcUtils.parseColumnAsString(rowSet, rowSetMetaData, row), null); + } + + @Test + public void testIsBlob() { + Assert.assertEquals(JdbcUtils.isBlob(Types.LONGVARBINARY), true); + Assert.assertEquals(JdbcUtils.isBlob(Types.BINARY), true); + Assert.assertEquals(JdbcUtils.isBlob(Types.ARRAY), false); + } + + @Test + public void testIsClob() { + Assert.assertEquals(JdbcUtils.isClob(Types.CLOB), true); + Assert.assertEquals(JdbcUtils.isClob(Types.BLOB), false); + } + + @Test + public void testReadBlobAsString() throws SQLException { + Assert.assertEquals(JdbcUtils.readBlobAsString(null), StringUtils.EMPTY); + + when(blob.length()).thenReturn(1000L); + when(blob.getBytes(1L, (int) 1000L)).thenReturn(null); + Assert.assertEquals(JdbcUtils.readBlobAsString(blob), StringUtils.EMPTY); + + byte[] ba = "testbytes".getBytes(); + when(blob.getBytes(1L, (int) 1000L)).thenReturn(ba); + PowerMockito.mockStatic(Base64.class); + String expectedBase64String = "TestBase64String"; + when(Base64.encodeBase64String(ba)).thenReturn(expectedBase64String); + Assert.assertEquals(JdbcUtils.readBlobAsString(blob), expectedBase64String); + } + + @Test + public void testReadClobAsString() throws SQLException { + Assert.assertEquals(JdbcUtils.readClobAsString(null), StringUtils.EMPTY); + + when(clob.length()).thenReturn(1000L); + String testingClobString = "testingClobString"; + when(clob.getSubString(1, (int) 1000L)).thenReturn(testingClobString); + + Assert.assertEquals(JdbcUtils.readClobAsString(clob), testingClobString); + } + + @Test + public void testConvertBitToBoolean() { + Assert.assertEquals(JdbcUtils.convertBitToBoolean(), true); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/util/JsonElementTypesTest.java b/dil/src/test/java/com/linkedin/dil/util/JsonElementTypesTest.java new file mode 100644 index 0000000..d302a84 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/util/JsonElementTypesTest.java @@ -0,0 +1,254 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.util; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonNull; +import org.testng.Assert; +import org.testng.annotations.Test; + +import static com.linkedin.dil.util.JsonElementTypes.*; + + +/** + * Test following methods of JsonElementTypes + * + * test 1: map element type to an avro alternative type + * test 2: primitive flag + * test 3: array flag, object flag, and null flag + * test 4: reverse nullability + * test 5: infer type from values + * test 6: infer type from a union + */ +@Test +public class JsonElementTypesTest { + /** + * Test element type to avro alternative type mapping + */ + @Test + public void testAltName() { + Assert.assertEquals(JsonElementTypes.ARRAY.getAltName(), "array"); + Assert.assertEquals(JsonElementTypes.BOOLEAN.getAltName(), "boolean"); + Assert.assertEquals(JsonElementTypes.DATE.getAltName(), "date"); + Assert.assertEquals(JsonElementTypes.DOUBLE.getAltName(), "double"); + Assert.assertEquals(JsonElementTypes.ENUM.getAltName(), "enum"); + Assert.assertEquals(JsonElementTypes.FLOAT.getAltName(), "double"); + Assert.assertEquals(JsonElementTypes.INT.getAltName(), "int"); + Assert.assertEquals(JsonElementTypes.INTEGER.getAltName(), "int"); + Assert.assertEquals(JsonElementTypes.INT64.getAltName(), "long"); + Assert.assertEquals(JsonElementTypes.LONG.getAltName(), "long"); + Assert.assertEquals(JsonElementTypes.NULL.getAltName(), "null"); + Assert.assertEquals(JsonElementTypes.NUMBER.getAltName(), "double"); + Assert.assertEquals(JsonElementTypes.OBJECT.getAltName(), "record"); + Assert.assertEquals(JsonElementTypes.PRIMITIVE.getAltName(), "primitive"); + Assert.assertEquals(JsonElementTypes.RECORD.getAltName(), "record"); + Assert.assertEquals(JsonElementTypes.STRING.getAltName(), "string"); + Assert.assertEquals(JsonElementTypes.TIME.getAltName(), "time"); + Assert.assertEquals(JsonElementTypes.TIMESTAMP.getAltName(), "timestamp"); + Assert.assertEquals(JsonElementTypes.UNION.getAltName(), "union"); + Assert.assertEquals(JsonElementTypes.UNKNOWN.getAltName(), "unknown"); + Assert.assertEquals(JsonElementTypes.NULLABLEARRAY.getAltName(), "array"); + Assert.assertEquals(JsonElementTypes.NULLABLEBOOLEAN.getAltName(), "boolean"); + Assert.assertEquals(JsonElementTypes.NULLABLEDOUBLE.getAltName(), "double"); + Assert.assertEquals(JsonElementTypes.NULLABLEINT.getAltName(), "int"); + Assert.assertEquals(JsonElementTypes.NULLABLELONG.getAltName(), "long"); + Assert.assertEquals(JsonElementTypes.NULLABLEOBJECT.getAltName(), "record"); + Assert.assertEquals(JsonElementTypes.NULLABLERECORD.getAltName(), "record"); + Assert.assertEquals(NULLABLESTRING.getAltName(), "string"); + Assert.assertEquals(JsonElementTypes.NULLABLETIME.getAltName(), "time"); + Assert.assertEquals(JsonElementTypes.NULLABLETIMESTAMP.getAltName(), "timestamp"); + } + + /** + * Test primitive flag of each element + */ + @Test + public void testPrimitiveFlag() { + Assert.assertFalse(JsonElementTypes.ARRAY.isPrimitive()); + Assert.assertTrue(JsonElementTypes.BOOLEAN.isPrimitive()); + Assert.assertTrue(JsonElementTypes.DATE.isPrimitive()); + Assert.assertTrue(JsonElementTypes.DOUBLE.isPrimitive()); + Assert.assertTrue(JsonElementTypes.ENUM.isPrimitive()); + Assert.assertTrue(JsonElementTypes.FLOAT.isPrimitive()); + Assert.assertTrue(JsonElementTypes.INT.isPrimitive()); + Assert.assertTrue(JsonElementTypes.INTEGER.isPrimitive()); + Assert.assertTrue(JsonElementTypes.INT64.isPrimitive()); + 
Assert.assertTrue(JsonElementTypes.LONG.isPrimitive()); + Assert.assertTrue(JsonElementTypes.NULL.isPrimitive()); + Assert.assertTrue(JsonElementTypes.NUMBER.isPrimitive()); + Assert.assertFalse(JsonElementTypes.OBJECT.isPrimitive()); + Assert.assertTrue(JsonElementTypes.PRIMITIVE.isPrimitive()); + Assert.assertFalse(JsonElementTypes.RECORD.isPrimitive()); + Assert.assertTrue(JsonElementTypes.STRING.isPrimitive()); + Assert.assertTrue(JsonElementTypes.TIME.isPrimitive()); + Assert.assertTrue(JsonElementTypes.TIMESTAMP.isPrimitive()); + Assert.assertFalse(JsonElementTypes.UNION.isPrimitive()); + Assert.assertTrue(JsonElementTypes.UNKNOWN.isPrimitive()); + Assert.assertFalse(JsonElementTypes.NULLABLEARRAY.isPrimitive()); + Assert.assertTrue(JsonElementTypes.NULLABLEBOOLEAN.isPrimitive()); + Assert.assertTrue(JsonElementTypes.NULLABLEDOUBLE.isPrimitive()); + Assert.assertTrue(JsonElementTypes.NULLABLEINT.isPrimitive()); + Assert.assertTrue(JsonElementTypes.NULLABLELONG.isPrimitive()); + Assert.assertFalse(JsonElementTypes.NULLABLEOBJECT.isPrimitive()); + Assert.assertFalse(JsonElementTypes.NULLABLERECORD.isPrimitive()); + Assert.assertTrue(NULLABLESTRING.isPrimitive()); + Assert.assertTrue(JsonElementTypes.NULLABLETIME.isPrimitive()); + Assert.assertTrue(JsonElementTypes.NULLABLETIMESTAMP.isPrimitive()); + } + + /** + * check array flag + * Input : JsonElementTypes.ARRAY or NULLABLEARRAY + * Output: true + * + * Input : anything else + */ + public void testArrayFlag() { + Assert.assertTrue(JsonElementTypes.ARRAY.isArray()); + Assert.assertTrue(JsonElementTypes.NULLABLEARRAY.isArray()); + + Assert.assertFalse(JsonElementTypes.BOOLEAN.isArray()); + Assert.assertFalse(JsonElementTypes.INT.isArray()); + Assert.assertFalse(JsonElementTypes.OBJECT.isArray()); + Assert.assertFalse(JsonElementTypes.STRING.isArray()); + Assert.assertFalse(JsonElementTypes.NULLABLEBOOLEAN.isArray()); + Assert.assertFalse(JsonElementTypes.NULLABLEINT.isArray()); + Assert.assertFalse(NULLABLESTRING.isArray()); + } + + /** + * check object flag + * Input : JsonElementTypes.OBJECT or NULLABLEOBJECT + * Output: true + * + * Input : anything else + */ + public void testObjectFlag() { + Assert.assertTrue(JsonElementTypes.OBJECT.isObject()); + Assert.assertTrue(JsonElementTypes.NULLABLEOBJECT.isObject()); + + Assert.assertFalse(JsonElementTypes.ARRAY.isObject()); + Assert.assertFalse(JsonElementTypes.BOOLEAN.isObject()); + Assert.assertFalse(JsonElementTypes.INT.isObject()); + Assert.assertFalse(JsonElementTypes.STRING.isObject()); + Assert.assertFalse(JsonElementTypes.NULLABLEARRAY.isObject()); + Assert.assertFalse(JsonElementTypes.NULLABLEBOOLEAN.isObject()); + Assert.assertFalse(JsonElementTypes.NULLABLEINT.isObject()); + Assert.assertFalse(NULLABLESTRING.isObject()); + } + + /** + * check NULL flag + * Input : JsonElementTypes.NULL + * Output: true + * + * Input : anything else + * Output: false + */ + public void testNullFlag() { + Assert.assertTrue(JsonElementTypes.NULL.isNull()); + + Assert.assertFalse(JsonElementTypes.ARRAY.isNull()); + Assert.assertFalse(JsonElementTypes.BOOLEAN.isNull()); + Assert.assertFalse(JsonElementTypes.INT.isNull()); + Assert.assertFalse(JsonElementTypes.OBJECT.isNull()); + Assert.assertFalse(JsonElementTypes.STRING.isNull()); + Assert.assertFalse(JsonElementTypes.NULLABLEARRAY.isNull()); + Assert.assertFalse(JsonElementTypes.NULLABLEBOOLEAN.isNull()); + Assert.assertFalse(JsonElementTypes.NULLABLEINT.isNull()); + Assert.assertFalse(JsonElementTypes.NULLABLEOBJECT.isNull()); + 
Assert.assertFalse(NULLABLESTRING.isNull()); + } + + /** + * test reverse nullability + */ + public void testReverseNullability() { + Assert.assertEquals(JsonElementTypes.ARRAY.reverseNullability(), JsonElementTypes.NULLABLEARRAY); + Assert.assertEquals(JsonElementTypes.BOOLEAN.reverseNullability(), JsonElementTypes.NULLABLEBOOLEAN); + Assert.assertEquals(JsonElementTypes.INT.reverseNullability(), JsonElementTypes.NULLABLEINT); + Assert.assertEquals(JsonElementTypes.OBJECT.reverseNullability(), JsonElementTypes.NULLABLEOBJECT); + Assert.assertEquals(JsonElementTypes.RECORD.reverseNullability(), JsonElementTypes.NULLABLERECORD); + Assert.assertEquals(JsonElementTypes.STRING.reverseNullability(), NULLABLESTRING); + Assert.assertEquals(JsonElementTypes.NULLABLEARRAY.reverseNullability(), JsonElementTypes.ARRAY); + Assert.assertEquals(JsonElementTypes.NULLABLEBOOLEAN.reverseNullability(), JsonElementTypes.BOOLEAN); + Assert.assertEquals(JsonElementTypes.NULLABLEINT.reverseNullability(), JsonElementTypes.INT); + Assert.assertEquals(JsonElementTypes.NULLABLEOBJECT.reverseNullability(), JsonElementTypes.OBJECT); + Assert.assertEquals(JsonElementTypes.NULLABLERECORD.reverseNullability(), JsonElementTypes.RECORD); + Assert.assertEquals(NULLABLESTRING.reverseNullability(), JsonElementTypes.STRING); + } + + /** + * Test getTypeFromMultiple() + */ + public void testGetTypeFromMultiple() { + JsonArray test1 = new Gson().fromJson("[]", JsonArray.class); + Assert.assertEquals(JsonElementTypes.getTypeFromMultiple(test1), JsonElementTypes.NULL); + + JsonArray test2 = new Gson().fromJson("[10, 100]", JsonArray.class); + Assert.assertEquals(JsonElementTypes.getTypeFromMultiple(test2), JsonElementTypes.INT); + + JsonArray test3 = new Gson().fromJson("[10, 100, null]", JsonArray.class); + Assert.assertEquals(JsonElementTypes.getTypeFromMultiple(test3), JsonElementTypes.NULLABLEINT); + + JsonArray test4 = new Gson().fromJson("[{\"value\": 10}, {\"value\": 100}]", JsonArray.class); + Assert.assertEquals(JsonElementTypes.getTypeFromMultiple(test4), JsonElementTypes.OBJECT); + + JsonArray test5 = new Gson().fromJson("[null, {\"value\": 10}, {\"value\": 100}]", JsonArray.class); + Assert.assertEquals(JsonElementTypes.getTypeFromMultiple(test5), JsonElementTypes.NULLABLEOBJECT); + + JsonArray test6 = new Gson().fromJson("[\"test.string\"]", JsonArray.class); + Assert.assertEquals(JsonElementTypes.getTypeFromMultiple(test6), STRING); + + JsonArray test7 = new Gson().fromJson("[null,\"test.string\"]", JsonArray.class); + Assert.assertEquals(JsonElementTypes.getTypeFromMultiple(test7), NULLABLESTRING); + + JsonArray test8 = new Gson().fromJson("[true]", JsonArray.class); + Assert.assertEquals(JsonElementTypes.getTypeFromMultiple(test8), BOOLEAN); + + JsonArray test9 = new Gson().fromJson("[true,null,false]", JsonArray.class); + Assert.assertEquals(JsonElementTypes.getTypeFromMultiple(test9), NULLABLEBOOLEAN); + + JsonArray jsonArray = new JsonArray(); + jsonArray.add(JsonNull.INSTANCE); + jsonArray.add(-7223372036854775808L); + Assert.assertEquals(JsonElementTypes.getTypeFromMultiple(jsonArray), NULLABLELONG); + + JsonArray test11 = new Gson().fromJson("[null,123.23]", JsonArray.class); + Assert.assertEquals(JsonElementTypes.getTypeFromMultiple(test11), NULLABLEDOUBLE); + + JsonArray test12 = new Gson().fromJson("[123.23]", JsonArray.class); + Assert.assertEquals(JsonElementTypes.getTypeFromMultiple(test12), DOUBLE); + + JsonArray test13 = new Gson().fromJson("[null, 10]", JsonArray.class); + 
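+    // A null element mixed with an integer should infer the nullable variant, NULLABLEINT (asserted below).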
Assert.assertEquals(JsonElementTypes.getTypeFromMultiple(test13), JsonElementTypes.NULLABLEINT); + } + + /** + * Test forType() converting a UNION string to JsonElementType + * Input : "boolean" + * Output: JsonElementTypes.BOOLEAN + * + * Input : "string" + * Output: JsonElementTypes.STRING + * + * Input : ["string", "null"] + * Output: JsonElementTypes.NULLABLESTRING + */ + public void testForType() { + String test1 = "boolean"; + Assert.assertEquals(JsonElementTypes.forType(test1), JsonElementTypes.BOOLEAN); + + String test2 = "string"; + Assert.assertEquals(JsonElementTypes.forType(test2), JsonElementTypes.STRING); + + String test3 = "[\"string\", \"null\"]"; + Assert.assertEquals(JsonElementTypes.forType(test3), NULLABLESTRING); + + String test4 = "[\"string\", \"primitive\", \"integer\"]"; + Assert.assertEquals(JsonElementTypes.forType(test4), UNION); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/util/JsonIntermediateSchemaTest.java b/dil/src/test/java/com/linkedin/dil/util/JsonIntermediateSchemaTest.java new file mode 100644 index 0000000..9db7466 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/util/JsonIntermediateSchemaTest.java @@ -0,0 +1,111 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + + +@Test +public class JsonIntermediateSchemaTest { + private Gson gson = new Gson(); + + @Mock + private JsonIntermediateSchema jsonIntermediateSchema; + + @BeforeClass + public void setUp() { + MockitoAnnotations.initMocks(this); + } + + @Test + public void test_jisColumn_first_constructor() { + String jsonElementTypeString = "[{\"City\":\"Seattle\"}]"; + JsonIntermediateSchema.JisColumn jisColumn = + jsonIntermediateSchema.new JisColumn("name", true, jsonElementTypeString); + Assert.assertEquals("name", jisColumn.getColumnName()); + Assert.assertTrue(jisColumn.getIsNullable()); + Assert.assertEquals(JsonElementTypes.UNION, jisColumn.getDataType().type); + } + + @Test(expectedExceptions = RuntimeException.class) + public void test_jisColumn_second_constructor_failed() { + JsonObject obj = new JsonObject(); + obj.addProperty("name", "tester"); + jsonIntermediateSchema.new JisColumn(obj); + } + + @Test + public void test_jisColumn_second_constructor_succeeded() { + JsonObject dataTypeObj = new JsonObject(); + dataTypeObj.addProperty("name", "tester"); + dataTypeObj.addProperty("type", "[[\"name\"]]"); + JsonArray jsonArray = new JsonArray(); + jsonArray.add("symbolA"); + jsonArray.add("symbolB"); + dataTypeObj.add("symbols", jsonArray); + + JsonObject obj = new JsonObject(); + obj.addProperty("columnName", "testColumn"); + obj.addProperty("isNullable", false); + obj.add("dataType", dataTypeObj); + JsonIntermediateSchema.JisColumn jisColumn = jsonIntermediateSchema.new JisColumn(obj); + Assert.assertEquals("testColumn", jisColumn.getColumnName()); + Assert.assertFalse(jisColumn.isNullable); + Assert.assertEquals(JsonElementTypes.UNION, jisColumn.getDataType().getType()); + + obj = new JsonObject(); + dataTypeObj.addProperty("type", "ENUM"); + obj.add("dataType", dataTypeObj); + jisColumn = jsonIntermediateSchema.new 
JisColumn(obj); + Assert.assertEquals("unknown", jisColumn.getColumnName()); + Assert.assertTrue(jisColumn.isNullable); + Assert.assertEquals(JsonElementTypes.ENUM, jisColumn.getDataType().getType()); + Assert.assertEquals(jsonArray, jisColumn.getDataType().getSymbols()); + Assert.assertEquals("tester", jisColumn.getDataType().getName()); + } + + @Test + public void test_JsonIntermediateSchema_constructor_succeeded() { + JsonObject dataTypeObj = new JsonObject(); + dataTypeObj.addProperty("name", "tester"); + dataTypeObj.addProperty("type", "[[\"name\"]]"); + JsonArray jsonArray = new JsonArray(); + JsonObject obj = new JsonObject(); + obj.addProperty("columnName", "testColumn"); + obj.addProperty("isNullable", false); + obj.add("dataType", dataTypeObj); + jsonArray.add(obj); + + JsonIntermediateSchema jsonIntermediateSchema = new JsonIntermediateSchema(jsonArray); + Assert.assertEquals(jsonIntermediateSchema.schemaName, "root"); + JsonIntermediateSchema.JisColumn jisColumn = jsonIntermediateSchema.getColumns().get("testColumn"); + Assert.assertEquals("testColumn", jisColumn.getColumnName()); + Assert.assertFalse(jisColumn.getIsNullable()); + Assert.assertEquals(JsonElementTypes.UNION, jisColumn.getDataType().getType()); + Assert.assertEquals("tester", jisColumn.getDataType().name); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void test_JsonIntermediateSchema_constructor_non_object_failed() { + JsonArray jsonArray = new JsonArray(); + jsonArray.add("tester"); + new JsonIntermediateSchema(jsonArray); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void test_JsonIntermediateSchema_constructor_non_failed() { + JsonArray jsonArray = new JsonArray(); + jsonArray.add((JsonElement) null); + new JsonIntermediateSchema(jsonArray); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/util/JsonParameterTest.java b/dil/src/test/java/com/linkedin/dil/util/JsonParameterTest.java new file mode 100644 index 0000000..30c8b74 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/util/JsonParameterTest.java @@ -0,0 +1,184 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.util; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; +import gobblin.configuration.SourceState; +import java.io.InputStreamReader; +import java.lang.reflect.Method; +import org.apache.gobblin.configuration.State; +import com.linkedin.dil.configuration.MultistageProperties; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + + +/** + * Unit test for {@link JsonParameter} + * @author chrli + * + */ +@Test(groups = {"org.apache.gobblin.util"}) +public class JsonParameterTest { + private Gson gson; + private State state; + + @BeforeClass + public void setup() { + gson = new Gson(); + state = new State(); + } + + @Test + public void testJsonArrayParameters() { + + String results = "{\"jsonarray_parameter\":[{\"jsonkey\":\"jsonvalue\"},{\"jsonkey\":\"jsonvalue\"}]}"; + + JsonArray jsonArray = gson.fromJson(new InputStreamReader(this.getClass().getResourceAsStream("/util/parameter-jsonarray.json")), JsonArray.class); + Assert.assertEquals(results, JsonParameter.getParametersAsJsonString(jsonArray.toString(), new JsonObject(), new State())); + } + + @Test + public void testParameterEncryption() { + String expected = "{\"test-parameter\":\"password\"}"; + String encrypted = "ENC(M6nV+j0lhqZ36RgvuF5TQMyNvBtXmkPl)"; + String masterKeyLoc = this.getClass().getResource("/key/master.key").toString(); + SourceState state = new SourceState(); + state.setProp(MultistageProperties.ENCRYPT_KEY_LOC.toString(), masterKeyLoc); + JsonArray jsonArray = gson.fromJson(new InputStreamReader(this.getClass().getResourceAsStream("/json/parameter-encryption.json")), JsonArray.class); + Assert.assertEquals(expected, JsonParameter.getParametersAsJsonString(jsonArray.toString(), new JsonObject(), state)); + } + + @Test + public void testJsonParameterConstructor() { + JsonObject object = new JsonObject(); + JsonParameter jsonParameter = new JsonParameter(null, object, state); + Assert.assertEquals(jsonParameter.getParametersAsJson(), object); + Assert.assertEquals(jsonParameter.getParametersAsJsonString(), "{}"); + + String parameterString = "[{\"type\":\"JSONOBJECT\",\"value\":\"testValue\",\"name\":\"testName\",\"value\":{\"valueType\":\"testValue\"}}]"; + jsonParameter = new JsonParameter(parameterString, object, state); + Assert.assertEquals(jsonParameter.getParametersAsJsonString(), "{\"testName\":{\"valueType\":\"testValue\"}}"); + + parameterString = "[{\"type\":\"WATERMARK\",\"name\":\"system\",\"value\":\"low\",\"format\":\"datetime\"}]"; + jsonParameter = new JsonParameter(parameterString, object, state); + Assert.assertEquals(jsonParameter.getParametersAsJson(), object); + + parameterString = "[{\"type\":\"WATERMARK\",\"name\":\"system\",\"value\":\"low\",\"format\":\"datetime\",\"timezone\":\"America/Los_Angeles\"}]"; + jsonParameter = new JsonParameter(parameterString, gson.fromJson("{\"watermark\":{\"low\":-100,\"high\":1564642800}}", JsonObject.class), state); + Assert.assertEquals(jsonParameter.getParametersAsJson(), object); + + jsonParameter = new JsonParameter(parameterString, gson.fromJson("{\"mark\":{\"low\":-100,\"high\":1564642800}}", JsonObject.class), state); + Assert.assertEquals(jsonParameter.getParametersAsJson(), object); + + parameterString = "[{\"type\":\"SESSION\",\"name\":\"system\"}]"; + jsonParameter = new JsonParameter(parameterString, gson.fromJson("{\"session\": \"testSession\"}", JsonObject.class), state); + 
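+    // With a SESSION parameter named "system", the "session" value from the supplied variables
+    // should surface as {"system":"testSession"}, which the next assertion verifies.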
Assert.assertEquals(jsonParameter.getParametersAsJsonString(), "{\"system\":\"testSession\"}"); + + jsonParameter = new JsonParameter(parameterString, gson.fromJson("{\"no_session\":{\"name\": \"records.cursor\"}}", JsonObject.class), state); + Assert.assertEquals(jsonParameter.getParametersAsJson(), object); + + parameterString = "[{\"type\":\"PAGESTART\",\"name\":\"page\"}]"; + jsonParameter = new JsonParameter(parameterString, gson.fromJson("{\"pagestart\": 10}", JsonObject.class), state); + Assert.assertEquals(jsonParameter.getParametersAsJsonString(), "{\"page\":10}"); + + jsonParameter = new JsonParameter(parameterString, gson.fromJson("{\"no_pagestart\":{\"name\": \"records.cursor\"}}", JsonObject.class), state); + Assert.assertEquals(jsonParameter.getParametersAsJson(), object); + + parameterString = "[{\"type\":\"PAGENO\",\"name\":\"num\"}]"; + jsonParameter = new JsonParameter(parameterString, gson.fromJson("{\"pageno\": 9}", JsonObject.class), state); + Assert.assertEquals(jsonParameter.getParametersAsJsonString(), "{\"num\":9}"); + + jsonParameter = new JsonParameter(parameterString, gson.fromJson("{\"no_pageno\":{\"name\": \"records.cursor\"}}", JsonObject.class), state); + Assert.assertEquals(jsonParameter.getParametersAsJson(), object); + + parameterString = "[{\"type\":\"PAGESIZE\",\"name\":\"num\"}]"; + jsonParameter = new JsonParameter(parameterString, gson.fromJson("{\"pagesize\": {\"name\": \"records.cursor\"}}", JsonObject.class), state); + Assert.assertEquals(jsonParameter.getParametersAsJson(), object); + + jsonParameter = new JsonParameter(parameterString, gson.fromJson("{\"pagesize\":\"\"}", JsonObject.class), state); + Assert.assertEquals(jsonParameter.getParametersAsJson(), object); + } + + /** + * Test JsonParameter constructor with invalid parameter + * Expected: IllegalArgumentException + */ + @Test(expectedExceptions = IllegalArgumentException.class) + public void testJsonParameterConstructorWithInvalidInput() { + String parameterString = "[{\"type\":\"RANDOM\",\"name\":\"num\"}]"; + JsonObject values = gson.fromJson("{\"pagesize\": {\"name\": \"records.cursor\"}}", JsonObject.class); + JsonParameter jsonParameter = new JsonParameter(parameterString, values, state); + jsonParameter.getParametersAsJson(); + } + + /** + * Test valueCheck when bRequirePrimitive is false, or bAllowBlank is true + * @throws Exception + */ + @Test + public void testValueCheck() throws Exception { + String parameterString = "[{\"type\":\"pagesize\",\"name\":\"num\"}]"; + JsonParameter jsonParameter = new JsonParameter(parameterString, gson.fromJson("{\"pagesize\":10}", JsonObject.class), new State()); + JsonObject values = gson.fromJson("{\"test\": \"testValue\"}", JsonObject.class); + Method method = JsonParameter.class.getDeclaredMethod("valueCheck", JsonObject.class, String.class, boolean.class, boolean.class); + method.setAccessible(true); + Assert.assertTrue((Boolean) method.invoke(jsonParameter, values, "test", false, true)); + } + + /** + * Test LIST type parameter with choices based on extract mode + * + * Scenario 1: Full load mode, value is an array with 2 elements + * Input 1: full load mode, ms.paraemters = [{"name": "column", "value": ["createdDate", "updatedDate"]}] + * Output 1: {"column":"createdDate"} + * + * Scenario 2: Incremental load mode, values is an array with 2 elements + * Input 2: incremental load mode, ms.paraemters = [{"name": "column", "value": ["createdDate", "updatedDate"]}] + * Output 2: {"column":"createdDate"} + * + * Scenario 3: Incremental load 
mode, value is a primitive + * Input 3: incremental load mode, ms.paraemters = [{"name": "column", "value": "createdDate"}] + * Output 3: {"column":"createdDate"} + * + * Scenario 4: Incremental load mode, value is an array with 1 element + * Input 4: incremental load mode, ms.paraemters = [{"name": "column", "value": ["createdDate"]}] + * Output 4: {"column":"createdDate"} + * + * Scenario 5: Incremental load mode, value is an array with 0 element + * Input 5: incremental load mode, ms.paraemters = [{"name": "column", "value": []}] + * Output 5: {"column":""} + */ + @Test + public void testListParameterByExtractMode() { + JsonArray msParameters = gson.fromJson("[{\"name\": \"column\", \"value\": [\"createdDate\", \"updatedDate\"]}]", JsonArray.class); + JsonArray msParameters2 = gson.fromJson("[{\"name\": \"column\", \"value\": \"createdDate\"}]", JsonArray.class); + JsonArray msParameters3 = gson.fromJson("[{\"name\": \"column\", \"value\": [\"createdDate\"]}]", JsonArray.class); + JsonArray msParameters4 = gson.fromJson("[{\"name\": \"column\", \"value\": []}]", JsonArray.class); + SourceState state = new SourceState(); + + String expected = "{\"column\":\"createdDate\"}"; + state.setProp(MultistageProperties.EXTRACT_IS_FULL.toString(), true); + Assert.assertEquals(expected, JsonParameter.getParametersAsJsonString(msParameters.toString(), new JsonObject(), state)); + + expected = "{\"column\":\"updatedDate\"}"; + state.setProp(MultistageProperties.EXTRACT_IS_FULL.toString(), false); + Assert.assertEquals(expected, JsonParameter.getParametersAsJsonString(msParameters.toString(), new JsonObject(), state)); + + expected = "{\"column\":\"createdDate\"}"; + state.setProp(MultistageProperties.EXTRACT_IS_FULL.toString(), false); + Assert.assertEquals(expected, JsonParameter.getParametersAsJsonString(msParameters2.toString(), new JsonObject(), state)); + + expected = "{\"column\":\"createdDate\"}"; + state.setProp(MultistageProperties.EXTRACT_IS_FULL.toString(), false); + Assert.assertEquals(expected, JsonParameter.getParametersAsJsonString(msParameters3.toString(), new JsonObject(), state)); + + expected = "{\"column\":\"\"}"; + state.setProp(MultistageProperties.EXTRACT_IS_FULL.toString(), false); + Assert.assertEquals(expected, JsonParameter.getParametersAsJsonString(msParameters4.toString(), new JsonObject(), state)); + } +} \ No newline at end of file diff --git a/dil/src/test/java/com/linkedin/dil/util/JsonUtilsTest.java b/dil/src/test/java/com/linkedin/dil/util/JsonUtilsTest.java new file mode 100644 index 0000000..f325d7f --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/util/JsonUtilsTest.java @@ -0,0 +1,62 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.util; + +import com.google.gson.JsonObject; +import org.testng.Assert; +import org.testng.annotations.Test; + + +@Test +public class JsonUtilsTest { + @Test + public void testDeepCopy() { + JsonObject source = new JsonObject(); + source.addProperty("name", "value"); + + JsonObject replica = JsonUtils.deepCopy(source).getAsJsonObject(); + JsonObject same = source; + + source.remove("name"); + source.addProperty("name", "newValue"); + + Assert.assertEquals(source.get("name").getAsString(), same.get("name").getAsString()); + Assert.assertNotEquals(source.get("name").getAsString(), replica.get("name").getAsString()); + } + + @Test + public void testContains() { + JsonObject a = new JsonObject(); + JsonObject b = new JsonObject(); + + a.addProperty("name1", "value1"); + a.addProperty("name2", "value2"); + b.addProperty("name1", "value1"); + + Assert.assertTrue(JsonUtils.contains(a, b)); + Assert.assertTrue(JsonUtils.contains("{\"name1\": \"value1\", \"name2\": \"value2\"}", b)); + + b.addProperty("name2", "value2x"); + Assert.assertFalse(JsonUtils.contains(a, b)); + Assert.assertFalse(JsonUtils.contains("{\"name1\": \"value1\", \"name2\": \"value2\"}", b)); + + b.addProperty("name3", "value3"); + Assert.assertFalse(JsonUtils.contains(a, b)); + } + + @Test + public void testReplace() { + JsonObject a = new JsonObject(); + JsonObject b = new JsonObject(); + + a.addProperty("name1", "value1"); + b.addProperty("name1", "newValue1"); + + Assert.assertEquals(JsonUtils.replace(a, b).toString(), "{\"name1\":\"newValue1\"}"); + + a.addProperty("name2", "value1"); + Assert.assertEquals(JsonUtils.replace(a, b).toString(), "{\"name1\":\"newValue1\",\"name2\":\"value1\"}"); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/util/SchemaBuilderTest.java b/dil/src/test/java/com/linkedin/dil/util/SchemaBuilderTest.java new file mode 100644 index 0000000..8fb5197 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/util/SchemaBuilderTest.java @@ -0,0 +1,48 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.util; + +import java.util.HashMap; +import org.testng.Assert; +import org.testng.annotations.Test; + + +@Test +public class SchemaBuilderTest { + @Test + public void testReverseJsonSchema() { + String originSchema = "{\"id\":{\"type\":\"string\"}}"; + SchemaBuilder builder = SchemaBuilder.fromJsonSchema(originSchema); + Assert.assertEquals(builder.buildJsonSchema().toString(), originSchema); + + originSchema = "{\"id\":{\"type\":[\"string\",\"null\"]}}"; + builder = SchemaBuilder.fromJsonSchema(originSchema); + Assert.assertEquals(builder.buildJsonSchema().toString(), originSchema); + + originSchema = "{\"methods\":{\"type\":\"array\",\"items\":{\"type\":\"string\"}}}"; + builder = SchemaBuilder.fromJsonSchema(originSchema); + Assert.assertEquals(builder.buildJsonSchema().toString(), originSchema); + } + + @Test + public void testAltSchema() { + String originSchema = "{\"id\":{\"type\":\"string\"}}"; + SchemaBuilder builder = SchemaBuilder.fromJsonSchema(originSchema); + Assert.assertEquals(builder.buildAltSchema().toString(), + "[{\"columnName\":\"id\",\"isNullable\":false,\"dataType\":{\"type\":\"string\"}}]"); + Assert.assertEquals(builder.buildAltSchema(new HashMap<>(), false, null, null, true).toString(), + "[{\"columnName\":\"id\",\"isNullable\":true,\"dataType\":{\"type\":\"string\"}}]"); + + originSchema = "{\"id\":{\"type\":[\"string\",\"null\"]}}"; + builder = SchemaBuilder.fromJsonSchema(originSchema); + Assert.assertEquals(builder.buildAltSchema().toString(), + "[{\"columnName\":\"id\",\"isNullable\":true,\"dataType\":{\"type\":\"string\"}}]"); + + originSchema = "{\"methods\":{\"type\":\"array\",\"items\":{\"type\":\"string\"}}}"; + builder = SchemaBuilder.fromJsonSchema(originSchema); + Assert.assertEquals(builder.buildAltSchema().toString(), + "[{\"columnName\":\"methods\",\"isNullable\":false,\"dataType\":{\"type\":\"array\",\"name\":\"methods\",\"items\":\"string\"}}]"); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/util/SchemaUtilsTest.java b/dil/src/test/java/com/linkedin/dil/util/SchemaUtilsTest.java new file mode 100644 index 0000000..9d78293 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/util/SchemaUtilsTest.java @@ -0,0 +1,37 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +package com.linkedin.dil.util; + +import java.util.Arrays; +import java.util.List; +import org.testng.Assert; +import org.testng.annotations.Test; + + +public class SchemaUtilsTest { + + @Test + public void testIsValidOutputSchema() { + // valid schema + List schemaColumns = Arrays.asList("a", "b"); + List sourceColumns = Arrays.asList("a", "B", "C"); + Assert.assertTrue(SchemaUtils.isValidOutputSchema(schemaColumns, sourceColumns)); + + // valid schema + schemaColumns = Arrays.asList("a", "c"); + sourceColumns = Arrays.asList("a", "B", "C"); + Assert.assertTrue(SchemaUtils.isValidOutputSchema(schemaColumns, sourceColumns)); + + // some columns in the schema is nowhere to be found in the source + schemaColumns = Arrays.asList("a", "e"); + sourceColumns = Arrays.asList("a", "B", "C"); + Assert.assertFalse(SchemaUtils.isValidOutputSchema(schemaColumns, sourceColumns)); + + // order mismatch + schemaColumns = Arrays.asList("c", "a", "b"); + sourceColumns = Arrays.asList("a", "B", "C"); + Assert.assertFalse(SchemaUtils.isValidOutputSchema(schemaColumns, sourceColumns)); + } +} \ No newline at end of file diff --git a/dil/src/test/java/com/linkedin/dil/util/VariableUtilsTest.java b/dil/src/test/java/com/linkedin/dil/util/VariableUtilsTest.java new file mode 100644 index 0000000..30d4f7b --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/util/VariableUtilsTest.java @@ -0,0 +1,70 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import com.google.gson.Gson; +import com.google.gson.JsonObject; +import java.io.UnsupportedEncodingException; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + + +@Test +public class VariableUtilsTest { + private final static String TEMPLATE = "{\"%s\":\"%s\"}"; + private final static String TEST_DATE_STRING = "2019-11-01 12:00:00"; + private final static String START_DATE_NAME = "startDate"; + private final static String END_DATE_NAME = "endDate"; + private Gson gson; + private JsonObject parameters; + + @BeforeClass + public void setUp() { + gson = new Gson(); + } + + @Test + void testReplaceWithTracking() throws UnsupportedEncodingException { + String template = "\\'{{Activity.CreatedAt}}\\' >= \\'{{startDate}}\\' and \\'{{Activity.CreatedAt}}\\' < \\'{{endDate}}\\'"; + JsonObject parameters = new JsonObject(); + parameters.addProperty("startDate", "2019-11-01 12:00:00"); + parameters.addProperty("endDate", "2019-11-02 12:00:00"); + String expected = "\\'{{Activity.CreatedAt}}\\' >= \\'2019-11-01 12:00:00\\' and \\'{{Activity.CreatedAt}}\\' < \\'2019-11-02 12:00:00\\'"; + Assert.assertEquals(VariableUtils.replaceWithTracking(template, parameters, false).getKey(), expected); + Assert.assertEquals(VariableUtils.replaceWithTracking(template, parameters).getKey(), expected); + + expected = "\\'{{Activity.CreatedAt}}\\' >= \\'2019-11-01+12%3A00%3A00\\' and \\'{{Activity.CreatedAt}}\\' < \\'2019-11-02+12%3A00%3A00\\'"; + Assert.assertEquals(VariableUtils.replaceWithTracking(template, parameters, true).getKey(), expected); + } + + /** + * Test: parameters contains value for placeholders in template + * Expected: placeholder replaced + * @throws UnsupportedEncodingException + */ + @Test + public void testReplaceWithTrackingII() throws UnsupportedEncodingException { + parameters = new JsonObject(); + 
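+    // Provide a value only for startDate so that the {{startDate}} placeholder in the
+    // template is substituted; the assertion below checks the resulting JSON string.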
parameters.addProperty(START_DATE_NAME, TEST_DATE_STRING); + Assert.assertEquals(VariableUtils.replace(gson.fromJson(String.format(TEMPLATE, START_DATE_NAME, "{{startDate}}"), JsonObject.class), parameters).toString(), + String.format(TEMPLATE, START_DATE_NAME, TEST_DATE_STRING)); + } + + /** + * Test: parameters doesn't contains value for placeholders in template + * Expected: placeholder not replaced + * @throws UnsupportedEncodingException + */ + @Test + public void testReplace() throws UnsupportedEncodingException { + String expected = String.format(String.format(TEMPLATE, START_DATE_NAME, "{{startDate}}")); + parameters = new JsonObject(); + parameters.addProperty(END_DATE_NAME, TEST_DATE_STRING); + Assert.assertEquals(VariableUtils.replaceWithTracking(expected, parameters, false), + new ImmutablePair<>(expected, gson.fromJson(String.format(TEMPLATE, END_DATE_NAME, TEST_DATE_STRING), JsonObject.class))); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/util/WatermarkDefinitionTest.java b/dil/src/test/java/com/linkedin/dil/util/WatermarkDefinitionTest.java new file mode 100644 index 0000000..ef3abd9 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/util/WatermarkDefinitionTest.java @@ -0,0 +1,215 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.testng.Assert; +import org.testng.annotations.Test; + +import static com.linkedin.dil.util.DateTimeUtils.*; + + +@Test +public class WatermarkDefinitionTest { + private String expected; + WatermarkDefinition definitions; + /** + * test typical watermark definitions + */ + @Test + public void testInitialization() { + expected = "(1546329600000,1546416000000)"; + definitions = new WatermarkDefinition("primary", "2019-01-01", "2019-01-02"); + Assert.assertEquals(expected, definitions.getRangeInMillis().toString()); + + expected = "(2019-01-01T00:00:00.000-08:00,2019-01-02T00:00:00.000-08:00)"; + Assert.assertEquals(expected, definitions.getRangeInDateTime().toString()); + + Gson gson = new Gson(); + String def = "[{\"name\":\"system\",\"type\":\"datetime\",\"range\":{\"from\":\"2017-01-02\",\"to\":\"-\"}}]"; + JsonArray defArray = gson.fromJson(def, JsonArray.class); + definitions = new WatermarkDefinition(defArray.get(0).getAsJsonObject(), false); + Assert.assertNotNull(definitions); + } + + /** + * test partition precision at day level for Weekly and Monthly partition types + */ + @Test + public void testPartitionPrecisionAtDayLevelForWeeklyAndMonthlyPartitionTypes() { + DateTimeZone timeZone = DateTimeZone.forID("America/Los_Angeles"); + + // Testing with 'to' as 'current date time(-)' + String expectedInMillis = String.format("(%s,%s)", + "1585810800000", + DateTime.now().withZone(timeZone).dayOfMonth().roundFloorCopy().getMillis()); + String expectedInDateTime = String.format("(%s,%s)", + "2020-04-02T00:00:00.000-07:00", + DateTime.now().withZone(timeZone).dayOfMonth().roundFloorCopy()); + String jsonDef = "[{\"name\":\"system\",\"type\":\"datetime\",\"range\":{\"from\":\"2020-04-02\",\"to\":\"-\"}}]"; + helperPartitionPrecisionAtDayLevelForWeeklyAndMonthlyPartitionTypes(jsonDef, false, WorkUnitPartitionTypes.WEEKLY, + expectedInMillis, expectedInDateTime); + helperPartitionPrecisionAtDayLevelForWeeklyAndMonthlyPartitionTypes(jsonDef, 
true, WorkUnitPartitionTypes.WEEKLY, + expectedInMillis, expectedInDateTime); + helperPartitionPrecisionAtDayLevelForWeeklyAndMonthlyPartitionTypes(jsonDef, false, WorkUnitPartitionTypes.MONTHLY, + expectedInMillis, expectedInDateTime); + helperPartitionPrecisionAtDayLevelForWeeklyAndMonthlyPartitionTypes(jsonDef, true, WorkUnitPartitionTypes.MONTHLY, + expectedInMillis, expectedInDateTime); + + // Testing with 'to' as 'two days ago(P2D)' + expectedInMillis = String.format("(%s,%s)", + "1585810800000", + DateTime.now().withZone(timeZone).minusDays(2).dayOfMonth().roundFloorCopy().getMillis()); + expectedInDateTime = String.format("(%s,%s)", + "2020-04-02T00:00:00.000-07:00", + DateTime.now().withZone(timeZone).minusDays(2).dayOfMonth().roundFloorCopy()); + jsonDef = "[{\"name\":\"system\",\"type\":\"datetime\",\"range\":{\"from\":\"2020-04-02\",\"to\":\"P2D\"}}]"; + helperPartitionPrecisionAtDayLevelForWeeklyAndMonthlyPartitionTypes(jsonDef, false, WorkUnitPartitionTypes.WEEKLY, + expectedInMillis, expectedInDateTime); + helperPartitionPrecisionAtDayLevelForWeeklyAndMonthlyPartitionTypes(jsonDef, true, WorkUnitPartitionTypes.WEEKLY, + expectedInMillis, expectedInDateTime); + helperPartitionPrecisionAtDayLevelForWeeklyAndMonthlyPartitionTypes(jsonDef, false, WorkUnitPartitionTypes.MONTHLY, + expectedInMillis, expectedInDateTime); + helperPartitionPrecisionAtDayLevelForWeeklyAndMonthlyPartitionTypes(jsonDef, true, WorkUnitPartitionTypes.MONTHLY, + expectedInMillis, expectedInDateTime); + } + + private void helperPartitionPrecisionAtDayLevelForWeeklyAndMonthlyPartitionTypes(String jsonDef, + boolean isPartialPartition, WorkUnitPartitionTypes workUnitPartitionType, + String expectedInMillis, String expectedInDateTime) { + Gson gson = new Gson(); + JsonArray defArray = gson.fromJson(jsonDef, JsonArray.class); + definitions = new WatermarkDefinition(defArray.get(0).getAsJsonObject(), + isPartialPartition, workUnitPartitionType); + + Assert.assertEquals(definitions.getRangeInMillis().toString(), expectedInMillis); + Assert.assertEquals(definitions.getRangeInDateTime().toString(), expectedInDateTime); + } + + + /** + * a unit watermark can be a simple list of value strings, in such case the watermark name + * and individual values in the string will be made name : value pairs + */ + @Test + public void testSimpleUnitWatermarkDefintion() { + expected = "[{\"secondary\":\"2018\"}, {\"secondary\":\"2019\"}]"; + definitions = new WatermarkDefinition("secondary", "2018,2019"); + Assert.assertEquals(definitions.getUnits().toString(), expected); + } + + @Test + public void testGetDateTimePartialWithJson() { + Gson gson = new Gson(); + String def = "[{\"name\":\"system\",\"type\":\"datetime\",\"range\":{\"from\":\"2017-01-02\",\"to\":\"-\"}}]"; + JsonArray defArray = gson.fromJson(def, JsonArray.class); + definitions = new WatermarkDefinition(defArray.get(0).getAsJsonObject(), true); + DateTimeZone timeZone = DateTimeZone.forID("America/Los_Angeles"); + // If partial partition is set to true, time should not round to 00:00:00-000 + Assert.assertNotEquals(definitions.getRangeInDateTime().getRight(), + DateTime.now().withZone(timeZone).dayOfMonth().roundFloorCopy()); + definitions = new WatermarkDefinition(defArray.get(0).getAsJsonObject(), false); + // If partial partition is set to false, time should be rounded to 00:00:00-000 + Assert.assertEquals(definitions.getRangeInDateTime().getRight(), + DateTime.now().withZone(timeZone).dayOfMonth().roundFloorCopy()); + } + + @Test + public void testGetDateTime() 
{ + DateTimeZone timeZone = DateTimeZone.forID("America/Los_Angeles"); + + // P1D + definitions = new WatermarkDefinition("primary", "2020-01-01", "P1D", false); + WatermarkDefinition definitionsIsPartial = new WatermarkDefinition("primary", "2020-01-01", "P1D", true); + + Assert.assertEquals(definitions.getRangeInDateTime().getRight(), + DateTime.now().withZone(timeZone).minusDays(1).dayOfMonth().roundFloorCopy()); + + // the millis second difference can often fail the test + // truncating the seconds and milli seconds to ensure success + Assert.assertEquals(definitionsIsPartial.getRangeInDateTime().getRight().minuteOfDay().roundFloorCopy(), + DateTime.now().withZone(timeZone).minusDays(1).minuteOfDay().roundFloorCopy()); + + // P2DT5H + definitions = new WatermarkDefinition("primary", "2020-01-01", "P2DT5H", false); + definitionsIsPartial = new WatermarkDefinition("primary", "2020-01-01", "P2DT5H", true); + + Assert.assertEquals(definitions.getRangeInDateTime().getRight(), + DateTime.now().withZone(timeZone).minusDays(2).minusHours(5).dayOfMonth().roundFloorCopy()); + + // the millis second difference can often fail the test + // truncating the seconds and milli seconds to ensure success + Assert.assertEquals(definitionsIsPartial.getRangeInDateTime().getRight().minuteOfDay().roundFloorCopy(), + DateTime.now().withZone(timeZone).minusDays(2).minusHours(5).minuteOfDay().roundFloorCopy()); + + // P0DT7H + definitions = new WatermarkDefinition("primary", "2020-01-01", "P0DT7H", false); + definitionsIsPartial = new WatermarkDefinition("primary", "2020-01-01", "P0DT7H", true); + + Assert.assertEquals(definitions.getRangeInDateTime().getRight(), + DateTime.now().withZone(timeZone).minusHours(7).dayOfMonth().roundFloorCopy()); + + // the millis second difference can often fail the test + // truncating the seconds and milli seconds to ensure success + Assert.assertEquals(definitionsIsPartial.getRangeInDateTime().getRight().minuteOfDay().roundFloorCopy(), + DateTime.now().withZone(timeZone).minusHours(7).minuteOfDay().roundFloorCopy()); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testGetDateTimeWithInvalidIsoFormat1() { + DateTimeZone timeZone = DateTimeZone.forID("America/Los_Angeles"); + definitions = new WatermarkDefinition("primary", "2020-01-01", "Pfoobarfoobar", false); + definitions.getRangeInDateTime().getRight(); + } + + @Test(expectedExceptions = StringIndexOutOfBoundsException.class) + public void testGetDateTimeWithInvalidIsoFormat2() { + DateTimeZone timeZone = DateTimeZone.forID("America/Los_Angeles"); + definitions = new WatermarkDefinition("primary", "2020-01-01", "Pfoobar", false); + definitions.getRangeInDateTime().getRight(); + } + + @Test(expectedExceptions = StringIndexOutOfBoundsException.class) + public void testGetDateTimeWithValidMonthButUnsupportedIsoFormat() { + DateTimeZone timeZone = DateTimeZone.forID("America/Los_Angeles"); + definitions = new WatermarkDefinition("primary", "2020-01-01", "P1M", false); + definitions.getRangeInDateTime().getRight(); + } + + @Test(expectedExceptions = StringIndexOutOfBoundsException.class) + public void testGetDateTimeWithValidMinutesButUnsupportedIsoFormat() { + DateTimeZone timeZone = DateTimeZone.forID("America/Los_Angeles"); + definitions = new WatermarkDefinition("primary", "2020-01-01", "PT10M", false); + definitions.getRangeInDateTime().getRight(); + } + + @Test + public void testToString() { + Assert.assertEquals(WatermarkDefinition.WatermarkTypes.UNIT.toString(), "unit"); + 
Assert.assertEquals(WatermarkDefinition.WatermarkTypes.DATETIME.toString(), "datetime"); + } + + @Test + public void testSetUnits() { + String watermarkString = "[{\"secondary\":\"2018\"}, {\"secondary\":\"2019\"}]"; + definitions = new WatermarkDefinition("secondary", watermarkString); + Assert.assertEquals(definitions.getName(), "secondary"); + Assert.assertEquals(definitions.getUnits().toString(), watermarkString); + } + + @Test + public void testGetDateTimeII() { + definitions = new WatermarkDefinition("primary", "[{\"secondary\":\"2018\"}]"); + Assert.assertEquals(definitions.getDateTime("2020-01-01 10:00:30").toString(), + "2020-01-01T10:00:30.000-08:00"); + + definitions.setTimezone(DEFAULT_TIMEZONE); + Assert.assertEquals(definitions.getDateTime("2020-01-01 10:00:30").toString(), + "2020-01-01T10:00:30.000-08:00"); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/util/WorkUnitPartitionTypesTest.java b/dil/src/test/java/com/linkedin/dil/util/WorkUnitPartitionTypesTest.java new file mode 100644 index 0000000..cc73807 --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/util/WorkUnitPartitionTypesTest.java @@ -0,0 +1,160 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; +import org.testng.Assert; +import org.testng.annotations.Test; + + +@Test +public class WorkUnitPartitionTypesTest { + + private final static DateTimeFormatter DATE_FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd").withZone(DateTimeZone.UTC); + + /** + * test breaking apart a range to hourly brackets, allowing partial hour + */ + @Test + public void testGetHourlyRangesPartial() { + DateTimeFormatter formatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withZone(DateTimeZone.UTC); + String expected = + "[(1546304400000,1546308000000), (1546308000000,1546311600000), (1546311600000,1546315200000), (1546315200000,1546315506000)]"; + Assert.assertEquals(expected, + WorkUnitPartitionTypes.HOURLY.getRanges(formatter.parseDateTime("2019-01-01 01:00:00"), formatter.parseDateTime("2019-01-01 04:05:06"), true).toString()); + expected = "[(1546304400000,1546308000000), (1546308000000,1546311600000), (1546311600000,1546315200000)]"; + // High watermark truncated + Assert.assertEquals(expected, + WorkUnitPartitionTypes.HOURLY.getRanges(formatter.parseDateTime("2019-01-01 01:00:00"), formatter.parseDateTime("2019-01-01 04:00:00"), true).toString()); + } + + /** + * test breaking apart a range to hourly brackets + */ + @Test + public void testGetHourlyRanges() { + DateTimeFormatter formatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withZone(DateTimeZone.UTC); + String expected = "[(1546304400000,1546308000000), (1546308000000,1546311600000), (1546311600000,1546315200000)]"; + Assert.assertEquals(WorkUnitPartitionTypes.HOURLY.getRanges(formatter.parseDateTime("2019-01-01 01:00:00"), + formatter.parseDateTime("2019-01-01 04:05:06"), false).toString(), + expected); + expected = "[(1546304400000,1546308000000), (1546308000000,1546311600000)]"; + // High watermark truncated + Assert.assertEquals(WorkUnitPartitionTypes.HOURLY.getRanges(formatter.parseDateTime("2019-01-01 01:00:00"), + formatter.parseDateTime("2019-01-01 03:00:00"), 
false).toString(), + expected); + } + + /** + * test breaking apart a range to daily brackets + */ + @Test + public void testGetDailyRanges() { + DateTimeFormatter formatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withZone(DateTimeZone.UTC); + + String expected = "[(1546300800000,1546387200000), (1546387200000,1546473600000)]"; + Assert.assertEquals(WorkUnitPartitionTypes.DAILY.getRanges(formatter.parseDateTime("2019-01-01 00:00:00"), + formatter.parseDateTime("2019-01-03 00:00:00"), true).toString(), + expected); + + expected = "[(1546300800000,1546387200000), (1546387200000,1546473600000), (1546473600000,1546477323000)]"; + // High watermark not truncated + Assert.assertEquals(WorkUnitPartitionTypes.DAILY.getRanges(formatter.parseDateTime("2019-01-01 00:00:00"), + formatter.parseDateTime("2019-01-03 01:02:03"), true).toString(), + expected); + } + + /** + * test breaking apart a recent range to daily brackets + */ + @Test + public void testGetDailyRangesRecent() { + DateTimeFormatter formatter = DateTimeFormat.forPattern("yyyy-MM-dd").withZone(DateTimeZone.UTC); + Assert.assertEquals( + WorkUnitPartitionTypes.DAILY.getRanges(DateTime.now().minusDays(2).dayOfMonth().roundFloorCopy(), DateTime.now(), false).size(), + 2); + Assert.assertEquals( + WorkUnitPartitionTypes.DAILY.getRanges(DateTime.now().minusDays(2).dayOfMonth().roundFloorCopy(), DateTime.now(), true).size(), + 3); + } + + /** + * test breaking apart a range to a weekly brackets + */ + @Test + public void testGetWeeklyRanges() { + DateTimeFormatter formatter = DateTimeFormat.forPattern("yyyy-MM-dd").withZone(DateTimeZone.UTC); + String expected = "[(1546300800000,1546905600000), (1546905600000,1547078400000)]"; + Assert.assertEquals(WorkUnitPartitionTypes.WEEKLY.getRanges(formatter.parseDateTime("2019-01-01"), + formatter.parseDateTime("2019-01-10"), true).toString(), + expected); + expected = "[(1546300800000,1546905600000)]"; + Assert.assertEquals(WorkUnitPartitionTypes.WEEKLY.getRanges(formatter.parseDateTime("2019-01-01"), + formatter.parseDateTime("2019-01-10"), false).toString(), + expected); + Assert.assertEquals(WorkUnitPartitionTypes.WEEKLY.getRanges(formatter.parseDateTime("2019-01-01"), + formatter.parseDateTime("2019-01-10")).toString(), + expected); + } + + /** + * Test from string + */ + @Test + public void testFromString() { + Assert.assertNull(WorkUnitPartitionTypes.fromString(null)); + Assert.assertEquals(WorkUnitPartitionTypes.HOURLY, WorkUnitPartitionTypes.fromString("hourly")); + Assert.assertEquals(WorkUnitPartitionTypes.DAILY, WorkUnitPartitionTypes.fromString("daily")); + Assert.assertEquals(WorkUnitPartitionTypes.WEEKLY, WorkUnitPartitionTypes.fromString("weekly")); + Assert.assertEquals(WorkUnitPartitionTypes.MONTHLY, WorkUnitPartitionTypes.fromString("monthly")); + } + + /** + * Test is weekly or monthly partitioned + */ + @Test + public void testIsDayPartitioned() { + Assert.assertFalse(WorkUnitPartitionTypes.isMultiDayPartitioned(null)); + Assert.assertFalse(WorkUnitPartitionTypes.isMultiDayPartitioned(WorkUnitPartitionTypes.HOURLY)); + Assert.assertFalse(WorkUnitPartitionTypes.isMultiDayPartitioned(WorkUnitPartitionTypes.DAILY)); + Assert.assertTrue(WorkUnitPartitionTypes.isMultiDayPartitioned(WorkUnitPartitionTypes.WEEKLY)); + Assert.assertTrue(WorkUnitPartitionTypes.isMultiDayPartitioned(WorkUnitPartitionTypes.MONTHLY)); + } + + /** + * test breaking apart a range to a monthly brackets + */ + @Test + public void testGetMonthlyRanges() { + DateTimeFormatter formatter = 
DateTimeFormat.forPattern("yyyy-MM-dd").withZone(DateTimeZone.UTC); + String expected = "[(1546300800000,1548979200000), (1548979200000,1549756800000)]"; + Assert.assertEquals(WorkUnitPartitionTypes.MONTHLY.getRanges( + formatter.parseDateTime("2019-01-01"), formatter.parseDateTime("2019-02-10"), true).toString(), + expected); + } + + /** + * Test getRanges() with date range which doesn't allow partial partitions + * Expected: list of full ranges + */ + @Test + public void testGetRangesWithoutPartitions() { + String expected = "[(1546387200000,1546992000000)]";// [2019-01-02 -> 2019-01-08] + Assert.assertEquals( + WorkUnitPartitionTypes.WEEKLY.getRanges( + new ImmutablePair<>(DATE_FORMATTER.parseDateTime("2019-01-02"), DATE_FORMATTER.parseDateTime("2019-01-10"))).toString(), + expected + ); + } + + @Test + public void testString() { + Assert.assertEquals(WorkUnitPartitionTypes.WEEKLY.toString(), "weekly"); + } +} diff --git a/dil/src/test/java/com/linkedin/dil/util/WorkUnitStatusTest.java b/dil/src/test/java/com/linkedin/dil/util/WorkUnitStatusTest.java new file mode 100644 index 0000000..9fd548b --- /dev/null +++ b/dil/src/test/java/com/linkedin/dil/util/WorkUnitStatusTest.java @@ -0,0 +1,60 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +package com.linkedin.dil.util; + +import com.google.common.collect.ImmutableMap; +import com.google.gson.JsonArray; +import java.util.HashMap; +import java.util.Map; +import org.testng.Assert; +import org.testng.annotations.Test; + + +@Test +public class WorkUnitStatusTest { + + /** + * testing the builder function + */ + @Test + public void testDataMethods() { + String expected = "WorkUnitStatus(totalCount=10, setCount=0, pageNumber=1, pageStart=0, pageSize=100, buffer=null, messages={name=text}, sessionKey=)"; + Map messages = new HashMap<>(); + messages.put("name", "text"); + Assert.assertEquals(expected, WorkUnitStatus.builder() + .pageNumber(1) + .pageSize(100) + .totalCount(10) + .messages(messages) + .sessionKey("") + .build() + .toString()); + } + + /** + * test getting schema + * scenario 1: default value + * scenario 2: source provided value + * scenario 3: source provided invalid value + */ + public void testGetSchema() { + // when there is no source provided schema, the getSchema() method + // should just return a new JsonSchema object + Assert.assertEquals(WorkUnitStatus.builder().build().getSchema(), new JsonArray()); + + String originSchema = "{\"id\":{\"type\":\"string\"}}"; + SchemaBuilder builder = SchemaBuilder.fromJsonSchema(originSchema); + Map messages = new HashMap<>(); + messages.put("schema", builder.buildAltSchema().toString()); + Assert.assertEquals(WorkUnitStatus.builder().messages(messages).build().getSchema().toString(), + "[{\"columnName\":\"id\",\"isNullable\":false,\"dataType\":{\"type\":\"string\"}}]"); + + // source schema is invalid + WorkUnitStatus.builder().messages(ImmutableMap.of("schema", "{\"id\": {\"type\": \"string\"}")).build().getSchema(); + + // source schema is null + Assert.assertEquals(WorkUnitStatus.builder().messages(null).build().getSchema(), new JsonArray()); + } +} diff --git a/dil/src/test/resources/avro/empty_file.avro b/dil/src/test/resources/avro/empty_file.avro new file mode 100644 index 0000000..e69de29 diff --git a/dil/src/test/resources/csv/comma-separated.csv b/dil/src/test/resources/csv/comma-separated.csv new file mode 100644 index 0000000..05aeed8 --- /dev/null +++ 
b/dil/src/test/resources/csv/comma-separated.csv @@ -0,0 +1,5 @@ +value1,value2 +value3,value4 +value5,value6 +value7,value8 +value9,value10 \ No newline at end of file diff --git a/dil/src/test/resources/csv/common-crawl-files.csv b/dil/src/test/resources/csv/common-crawl-files.csv new file mode 100644 index 0000000..26d2299 --- /dev/null +++ b/dil/src/test/resources/csv/common-crawl-files.csv @@ -0,0 +1,10 @@ +cc-index/collections/CC-MAIN-2019-43/indexes/cdx-00000.gz +cc-index/collections/CC-MAIN-2019-43/indexes/cdx-00001.gz +cc-index/collections/CC-MAIN-2019-43/indexes/cdx-00002.gz +cc-index/collections/CC-MAIN-2019-43/indexes/cdx-00003.gz +cc-index/collections/CC-MAIN-2019-43/indexes/cdx-00004.gz +cc-index/collections/CC-MAIN-2019-43/indexes/cdx-00005.gz +cc-index/collections/CC-MAIN-2019-43/indexes/cdx-00006.gz +cc-index/collections/CC-MAIN-2019-43/indexes/cdx-00007.gz +cc-index/collections/CC-MAIN-2019-43/indexes/cdx-00008.gz +cc-index/collections/CC-MAIN-2019-43/indexes/cdx-00009.gz diff --git a/dil/src/test/resources/csv/ctl_d_text.dat b/dil/src/test/resources/csv/ctl_d_text.dat new file mode 100644 index 0000000..c01a6e2 --- /dev/null +++ b/dil/src/test/resources/csv/ctl_d_text.dat @@ -0,0 +1,2 @@ +P3592020-04-065475410610332230236526293721$5 USD World Wildlife Fund Donation(09) - Charity - Domestic5.00USD10.505.59D +P3592020-04-065210014989791541687610078846$5 Amazon.com Gift Card(03) - E-Gift Card - Domestic5.00USD10.505.59D diff --git a/dil/src/test/resources/csv/flat.csv b/dil/src/test/resources/csv/flat.csv new file mode 100644 index 0000000..8329c74 --- /dev/null +++ b/dil/src/test/resources/csv/flat.csv @@ -0,0 +1,3 @@ +col1,col2,col3,col4,col5 +val1,val2,val3,val4,val5 +val6,val7,val8,val9,val10 \ No newline at end of file diff --git a/dil/src/test/resources/csv/flat_uppercase_header.csv b/dil/src/test/resources/csv/flat_uppercase_header.csv new file mode 100644 index 0000000..7db5ae0 --- /dev/null +++ b/dil/src/test/resources/csv/flat_uppercase_header.csv @@ -0,0 +1,3 @@ +COL1,COL2,COL3,COL4,COL5 +val1,val2,val3,val4,val5 +val6,val7,val8,val9,val10 \ No newline at end of file diff --git a/dil/src/test/resources/csv/flat_without_header.csv b/dil/src/test/resources/csv/flat_without_header.csv new file mode 100644 index 0000000..038e5f4 --- /dev/null +++ b/dil/src/test/resources/csv/flat_without_header.csv @@ -0,0 +1,2 @@ +val1,val2,val3,val4,val5 +val6,val7,val8,val9,val10 \ No newline at end of file diff --git a/dil/src/test/resources/csv/ids.csv b/dil/src/test/resources/csv/ids.csv new file mode 100644 index 0000000..7175b05 --- /dev/null +++ b/dil/src/test/resources/csv/ids.csv @@ -0,0 +1,11 @@ +id_0,date +497766636,2020-04-10 -07:00 +583903591,2020-04-11 -07:00 +666997959,2020-04-12 -07:00 +131360915,2020-04-13 -07:00 +704160186,2020-04-14 -07:00 +121446123,2020-04-15 -07:00 +106921314,2020-04-16 -07:00 +147490822,2020-04-17 -07:00 +553282504,2020-04-18 -07:00 +216647067,2020-04-19 -07:00 \ No newline at end of file diff --git a/dil/src/test/resources/csv/ids_flat.csv b/dil/src/test/resources/csv/ids_flat.csv new file mode 100644 index 0000000..7d59d10 --- /dev/null +++ b/dil/src/test/resources/csv/ids_flat.csv @@ -0,0 +1,11 @@ +id0,id1,date,id2,id3,id4,id5,id6,id7,id8 +497766636,497766636,2020-04-10 -07:00,497766636,497766636,497766636,497766636,497766636,497766636,497766636 +583903591,583903591,2020-04-11 -07:00,583903591,583903591,583903591,583903591,583903591,583903591,583903591 +666997959,666997959,2020-04-12 
-07:00,666997959,666997959,666997959,666997959,666997959,666997959,666997959 +131360915,131360915,2020-04-13 -07:00,131360915,131360915,131360915,131360915,131360915,131360915,131360915 +704160186,704160186,2020-04-14 -07:00,704160186,704160186,704160186,704160186,704160186,704160186,704160186 +121446123,121446123,2020-04-15 -07:00,121446123,121446123,121446123,121446123,121446123,121446123,121446123 +106921314,106921314,2020-04-16 -07:00,106921314,106921314,106921314,106921314,106921314,106921314,106921314 +147490822,147490822,2020-04-17 -07:00,147490822,147490822,147490822,147490822,147490822,147490822,147490822 +553282504,553282504,2020-04-18 -07:00,553282504,553282504,553282504,553282504,553282504,553282504,553282504 +216647067,216647067,2020-04-19 -07:00,216647067,216647067,216647067,216647067,216647067,216647067,216647067 \ No newline at end of file diff --git a/dil/src/test/resources/csv/ids_need_cleansing.csv b/dil/src/test/resources/csv/ids_need_cleansing.csv new file mode 100644 index 0000000..1510e9f --- /dev/null +++ b/dil/src/test/resources/csv/ids_need_cleansing.csv @@ -0,0 +1,11 @@ +id$0 +497766636$ +583903591 +666997959 +131360915 +704160186 +121446123 +106921314 +147490822 +553282504 +216647067 \ No newline at end of file diff --git a/dil/src/test/resources/csv/tab-separated.csv b/dil/src/test/resources/csv/tab-separated.csv new file mode 100644 index 0000000..6058d04 --- /dev/null +++ b/dil/src/test/resources/csv/tab-separated.csv @@ -0,0 +1,5 @@ +value1 value2 +value3 value4 +value5 value6 +value7 value8 +value9 value10 \ No newline at end of file diff --git a/dil/src/test/resources/gpg/test.csv b/dil/src/test/resources/gpg/test.csv new file mode 100644 index 0000000..ecd5582 --- /dev/null +++ b/dil/src/test/resources/gpg/test.csv @@ -0,0 +1,2 @@ +e52ac829-b712-40d8-8326-fb5b85b68ffd,IDFA,1557160998,589fcebe9364eebfa1a80910c3faab01f5a69c86,0 +660bc3cd-8f45-4141-a4ca-7f683cfebdc4,IDFA,1536552731,70a40e05ee2d4a7d9ea2728e6b1bec62b21ad9d44eacd891fd432196e7870eee,0 diff --git a/dil/src/test/resources/gpg/test.csv.gpg b/dil/src/test/resources/gpg/test.csv.gpg new file mode 100644 index 0000000..16d165f Binary files /dev/null and b/dil/src/test/resources/gpg/test.csv.gpg differ diff --git a/dil/src/test/resources/gpg/test.secret b/dil/src/test/resources/gpg/test.secret new file mode 100644 index 0000000..f9e9ed1 Binary files /dev/null and b/dil/src/test/resources/gpg/test.secret differ diff --git a/dil/src/test/resources/gzip/cc-index.paths.gz b/dil/src/test/resources/gzip/cc-index.paths.gz new file mode 100644 index 0000000..724260c Binary files /dev/null and b/dil/src/test/resources/gzip/cc-index.paths.gz differ diff --git a/dil/src/test/resources/json/last-page-with-data.json b/dil/src/test/resources/json/last-page-with-data.json new file mode 100644 index 0000000..220a216 --- /dev/null +++ b/dil/src/test/resources/json/last-page-with-data.json @@ -0,0 +1 @@ +{"totalResults":2741497,"limit":100,"count":0,"items":[{"key1": "value1"},{"key2": "value2"}]} \ No newline at end of file diff --git a/dil/src/test/resources/json/parameter-encryption.json b/dil/src/test/resources/json/parameter-encryption.json new file mode 100644 index 0000000..40b3c76 --- /dev/null +++ b/dil/src/test/resources/json/parameter-encryption.json @@ -0,0 +1 @@ +[{"name":"test-parameter", "value": "ENC(M6nV+j0lhqZ36RgvuF5TQMyNvBtXmkPl)"}] \ No newline at end of file diff --git a/dil/src/test/resources/json/sample-data-for-source.json b/dil/src/test/resources/json/sample-data-for-source.json new file mode 
100644 index 0000000..0926cb1 --- /dev/null +++ b/dil/src/test/resources/json/sample-data-for-source.json @@ -0,0 +1,131 @@ +{ + "ms.pagination": { + "fields": [ + "page_start", + "page_size", + "page_number" + ], + "initialvalues": [ + 0, + 100, + 1 + ] + }, + "ms.session.key.field": { + "name": "status", + "condition": { + "regexp": "success|ready" + } + }, + "ms.total.count.field": "total", + "ms.parameters": [ + { + "name": "filter", + "type": "object", + "value": [ + { + "name": "fromDateTime", + "type": "watermark", + "watermark": "system", + "value": "low", + "format": "datetime", + "pattern": "yyyy-MM-dd'T'HH:mm:ss'Z'" + }, + { + "name": "toDateTime", + "type": "watermark", + "watermark": "system", + "value": "high", + "format": "datetime", + "pattern": "yyyy-MM-dd'T'HH:mm:ss'Z'" + } + ] + }, + { + "name": "cursor", + "type": "session" + } + ], + "ms.encryption.fields": [ + "access_token", + "client_secret", + "refresh_token" + ], + "ms.data.field": "items", + "ms.call.interval.millis": "0", + "ms.wait.timeout.seconds": "3600", + "ms.enable.cleansing": "true", + "ms.work.unit.partial.partition": "false", + "ms.watermark": [ + { + "name": "system", + "type": "datetime", + "range": { + "from": "2018-01-01", + "to": "-" + } + } + ], + "ms.secondary.input": [ + { + "fields": [ + "uuid" + ], + "category": "authentication", + "retry": { + "delayInSec": "1", + "retryCount": "1" + }, + "authentication": { + } + } + ], + "ms.http.client.factory": "com.linkedin.dil.factory.ApacheHttpClientFactory", + "ms.http.request.headers": { + "Content-Type": "application/json", + "key": "adwpsdfsftcc9cj749fnb8xxsdfsn" + }, + "ms.source.uri": "https://test.com/v2/items/item", + "ms.http.request.method": "POST", + "ms.authentication": { + "method": "basic", + "encryption": "base64", + "header": "Authorization", + "token": "sdf23someresfsdwrw24234" + }, + "ms.http.statuses": { + "success": [ + 200, + 201, + 202 + ], + "warning": [ + 404, + 407 + ], + "error": [ + 301, + 302 + ], + "pagination_error": [ + 405, + 406 + ] + }, + "ms.http.status.reasons": { + "success": [ + "reason1 for success", + "reason2 for success", + "reason3 for success" + ], + "warning": [ + "reason1 for warning", + "reason2 for warning" + ], + "error": [ + "reason1 for error", + "reason2 for error" + ] + }, + "ms.extractor.class": "com.linkedin.dil.extractor.JsonExtractor" +} \ No newline at end of file diff --git a/dil/src/test/resources/json/sample-json-schema.json b/dil/src/test/resources/json/sample-json-schema.json new file mode 100644 index 0000000..72d38fb --- /dev/null +++ b/dil/src/test/resources/json/sample-json-schema.json @@ -0,0 +1,34 @@ +{ + "type": "array", + "items": { + "__metadata": { + "type": "object", + "properties": { + "type": { + "type": "string" + }, + "properties": { + "type": "object", + "properties": { + "User": { + "type": "object", + "properties": { + "associationuri": { + "type": "string" + } + } + }, + "Module": { + "type": "object", + "properties": { + "associationuri": { + "type": "string" + } + } + } + } + } + } + } + } +} \ No newline at end of file diff --git a/dil/src/test/resources/key/egress_test.key b/dil/src/test/resources/key/egress_test.key new file mode 100644 index 0000000..6eed4fd --- /dev/null +++ b/dil/src/test/resources/key/egress_test.key @@ -0,0 +1 @@ +egress_test \ No newline at end of file diff --git a/dil/src/test/resources/key/master.key b/dil/src/test/resources/key/master.key new file mode 100644 index 0000000..7ef0020 --- /dev/null +++ b/dil/src/test/resources/key/master.key @@ 
-0,0 +1 @@ +testkey \ No newline at end of file diff --git a/dil/src/test/resources/key/public.key b/dil/src/test/resources/key/public.key new file mode 100644 index 0000000..cb0652e --- /dev/null +++ b/dil/src/test/resources/key/public.key @@ -0,0 +1,52 @@ +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mQINBF9ipkYBEAC2zBj35hXUFuEvlJAiMG4lkehQSYnLkHAGMCLYpSlnaj3XQeIr +6NtkldaJu1SzGTl56vsgRkb7WICUXEM1IXJFIldswSNLtBIaJKibmcingK/wl8Ho +UML16NKKPEFtzdIoQ1ut+hcN9WurBod5GpV7e64vp3modjZAPOaW3dfb0UBupINd +VWA9YXQROz7gw2YJa+P3VWvtJ+/CeXTWz+nxLWcEda/gdOiwA6XLMT0cs7AnqzuI +xzBccunWokcGj8vsmyTtayhYaDNt9qprLodhP150QVNf1S8gdqMq4Y+wjv6KOZSa +ovpS0B11yjQYRD1b4skyDveMjiKFIfiCTdTeS/mqn2ioTBMiCU4Lfz/uNfHcXOt2 +PYX7ZbZa2W822hP8iUr1v/3Ka3Eb9i+Fjw4vbNDQz1RTb/DdurNwtLoMZ2VX8fop +8nEY2s+/Mln9DJap+43uPCNTed/MZPduAXOsbqpyiUA9z1S+uEqURXnokPZxcxOd +2ezVePKP5NBWwKijUm5igEsTc5OVhJ2AfW75UKiMgoSJnn83brbDLc79M/iTEhWP +NAU48a8/0wiwwGD1feef954gJlk+mlByr2HA9WEprSGE/NZHtpM2b9l5WA0xenmK +9EKb6oB9CHwGf5URgTkMMP5e/NDmMU5Op/S8DwnxPtu4dWHdS2+NF1BmxwARAQAB +tE1DaHJpcyBMaSAoQ2hyaXMncyBHUEcgNDA5Ni1iaXQgbm9uLWV4cGlyaW5nIFJT +QS1SU0Ega2V5KSA8Y2hybGlAbGlua2VkaW4uY29tPokCTgQTAQgAOBYhBJkAsa0w +KhN4m4uFsUioTy+m44hwBQJfYqZGAhsDBQsJCAcCBhUKCQgLAgQWAgMBAh4BAheA +AAoJEEioTy+m44hwxpoQAJq3kg994laCvROf0bKnaPDtnlQ1tBgLJ7dR2uZ+Joa0 +P5hNwkxkEYrhv4AXOMnqPP7Eo+1b6MXn/hreNzdExws6XB7XpXSOCPmWrfDjaxmk +t0hLTtM1gfkbWyXMGoADztyoNVVF/7/UFoKy8/Gaf8033+hiQm253A1Bd+Lk69AR +BEN7dG11jlX/C+P1pjlB4DJrF19D06OoZtw7JKrsg6Rti38mDtdBPE6Y1Hfz8lQP +0JxCIR9fz2LNUw74zKv8QBU2IQeYIInQSjw+qKeEdIrOlIhiLLsl2v07xvHxG6uj +qoMShFzwSJAQhNENIiGQKd5OUFa8c+G9zCkVAPGf95IdyMI5P/xjdX5pGo+uGsnT +/pP/rJ98NeiZfIHkneGjhJGjv9UIggu4DLr4nLjOUBRo3U0npGh8eu6XY3AEi874 +gOZiemNywYrgQku+/wEAcR5FdAADspZ2Scvhzm7Ojc8J5h43/YuSwPiA51tzNF4D +9I7Mjljlsvsq+u34cwl0PKxVlGMnGe42TdNY7I+M8Z7L12FG/dsqRqmOUsNiqRN/ +mrl5W35jSzIwuFM/oIH9QhBxeakVhkfudTf+lahFxbPMbfqHBKsYRFBxBJrIqWcH +1fL//W8F2V4umfrzK/yUlm8P5RsRNERJo0tNl5MEvZGArVysiXavCtb4L7n60bSy +uQINBF9ipkYBEACwpjXo2vN4DBfLWGeLzfTtsc8zaCgA+R5gfozT6Ap0XhDx2RdX +MxbIGBa5ho/RvtG5SoJnq+J6ZyoS/lQaFSPCQQHw2zaA43Dm+VdOJB0QP166lLXS +HXiarm44O1HOCzCvQWug0qWSz6cC+tn3z9maEsePL+5ImLwrkKVDh6RNreVCh09+ +s6slSymzDaLJv0k78/XTQ9iYKub3ibk8Q2DpqHqA7bhWfaiylODUxpxNF4IQHdw9 +eIL6Lh4s85u5iCBsdi9DvlQwUyESj4fepM8jICAJiRwxKrAAybcd3b0aW1n6CDtC +J0ZeYzP1pWym2LqYm8J817IIJv0mPJwl6GVhrvtvZ6fxoGMYNrT9Btiu/ZrAsS33 +/4Qn0LPMlFxl1XfhjMP+wMB7szuOgkZ2gp5ai7ze//kE3gNT4mZ7UhledjI8grlG +ebs8BSK8FbUXZ3jb5KV6xQjvmkgtOvn5IQ8w/+Yd7D3nh4bd5kK7PP48T10Lob1K +VwMt+KH4lduFfiYS0K/7GAwSkrbFIdHYv+DBIlQTxh3jtmbEH5gRIuFrNvqq/p2w +cfgqjPO+qaTQTjTdUNj/Ye2+uGdxYMJjR5UWett2RZJvyVnd4lJWkoblhL8JV3Hc +POGp4Gkf9O1pFFIc/wJvIRECx18Y2+APN7QPUg/Z3x+gdNBpB9hHJUjVKQARAQAB +iQI2BBgBCAAgFiEEmQCxrTAqE3ibi4WxSKhPL6bjiHAFAl9ipkYCGwwACgkQSKhP +L6bjiHDYHhAAr7wbdif4pa9o6+focbwuVc6wkAJllMA1SoQfFSFrlsqs76xBJFBP +Ui1KUAmYUGfJrBHkfTgy7DtPseQHqEsw/0ZqGw9ETCZhyIhPRKWRPKOeDQPd70lm +3do3eD7/OuZh1jrinctQXMg106D8iXT466HjmSdDP0Mn9mJyUUklgYr9eeX8Rdp3 +W4M7AuZCfM7NVV04C6Xw7KU38VFUNbKz5dth6+DfujqpMhf4KCOMfKtPbYLHApBD +9XyaHmISG19hONc5Bh7AKakWsExfHWJQdnqNWZGWXNIXU0CKLpngFFFiNHw91nFR +jPB9bgopbMwGG8Xl8qclRJo2RB+ko/5Hvl4A7mowg9FwUQz1PfgJ2U2zdc0oCykY +R+CofGzIHWviqMEEgvc+lYJLroFyqtXlgQPvUmvwzmBdjGexVVBVLOJVlu2yBlE5 +AKYcOJRGJCg9sUX0sx08kIKMcF/9ewujU5jq4eg8aaYXilLgwGNVXYNISeA76oCg +v34HHzwmNWlYqEdSx9ZrrJhoMxOjn4O0dbofGtvCFNUI4hVKdqgaxNDzrkMNihp7 +iM3P3B2rxvl9VlXSAlIrfc1/Rnj64+2OQT43pv49dT81ZSANkuAp5nXuaGz6MZQm +tAra+AWufa5pbz3B/M60k3LPQQ/qT2IvpIy9VOWEm2CM+w7lc9W8WhE= +=MrKk +-----END PGP PUBLIC KEY BLOCK----- diff --git a/dil/src/test/resources/key/sftp_test.key 
b/dil/src/test/resources/key/sftp_test.key new file mode 100644 index 0000000..fc80254 --- /dev/null +++ b/dil/src/test/resources/key/sftp_test.key @@ -0,0 +1 @@ +pass \ No newline at end of file diff --git a/dil/src/test/resources/other/sample-data-include-long-file-name.txt b/dil/src/test/resources/other/sample-data-include-long-file-name.txt new file mode 100644 index 0000000..50d2b86 --- /dev/null +++ b/dil/src/test/resources/other/sample-data-include-long-file-name.txt @@ -0,0 +1 @@ +sfdasfasfasfdfsafdasfdasfdsafdsafasdfasdfasfasdfasdfasdfasdfsafsadfasdfsafsadfsafasdfasdfasdfasfdafsdasfdsafsadfsaffsadfsadfasdfasdfsadfsadfsadfsadfasdfasfsadfasdfasdfasfasdfasdfasdfsadfasdfasfsadfasdfasdfasdfasdfdsafsadfasdfasdfasdfasdfasdfasfsadfasdfasd.txt \ No newline at end of file diff --git a/dil/src/test/resources/pull/s3-csv.pull b/dil/src/test/resources/pull/s3-csv.pull new file mode 100644 index 0000000..55f9abf --- /dev/null +++ b/dil/src/test/resources/pull/s3-csv.pull @@ -0,0 +1,23 @@ +source.class=com.linkedin.dil.source.S3SourceV2 +ms.extract.preprocessors=com.linkedin.dil.preprocessor.GunzipProcessor +ms.extractor.class=com.linkedin.dil.extractor.CsvExtractor +converter.classes=org.apache.gobblin.converter.csv.CsvToJsonConverterV2,org.apache.gobblin.converter.avro.JsonIntermediateToAvroConverter +ms.http.client.factory=com.linkedin.dil.factory.ApacheHttpClientFactory + +extract.namespace=com.linkedin.test +extract.table.name=test +extract.table.type=SNAPSHOT_ONLY +extract.is.full=true +job.name=testJob + +ms.output.schema=[{"columnName":"path","isNullable":"true","dataType":{"type":"string"}}] +ms.source.uri=https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-43/cc-index.paths.gz + +fs.uri=file://localhost/ +state.store.fs.uri=file://localhost/ +data.publisher.final.dir=/tmp/gobblin/job-output + +writer.destination.type=HDFS +writer.output.format=AVRO +ms.extractor.target.file.name=crawl-data/CC-MAIN-2019-43/cc-index.paths.gz + diff --git a/dil/src/test/resources/pull/s3-filedump.pull b/dil/src/test/resources/pull/s3-filedump.pull new file mode 100644 index 0000000..ba59680 --- /dev/null +++ b/dil/src/test/resources/pull/s3-filedump.pull @@ -0,0 +1,21 @@ +source.class=com.linkedin.dil.source.S3SourceV2 +#ms.extract.preprocessors=com.linkedin.dil.preprocessor.GunzipProcessor +ms.extractor.class=com.linkedin.dil.extractor.FileDumpExtractor +converter.classes=org.apache.gobblin.converter.csv.CsvToJsonConverterV2,org.apache.gobblin.converter.avro.JsonIntermediateToAvroConverter +ms.http.client.factory=com.linkedin.dil.factory.ApacheHttpClientFactory + +extract.namespace=com.linkedin.test +extract.table.name=test +extract.table.type=SNAPSHOT_ONLY +extract.is.full=true +job.name=testJob + +ms.extractor.target.file.name=crawl-data/CC-MAIN-2019-43/cc-index.paths.gz +ms.output.schema=[{"columnName":"path","isNullable":"true","dataType":{"type":"string"}}] +ms.source.uri=https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-43/cc-index.paths.gz +fs.uri=file://localhost/ +state.store.fs.uri=file://localhost/ +data.publisher.final.dir=/tmp/gobblin/job-output + +writer.destination.type=HDFS +writer.output.format=AVRO \ No newline at end of file diff --git a/dil/src/test/resources/util/parameter-jsonarray.json b/dil/src/test/resources/util/parameter-jsonarray.json new file mode 100644 index 0000000..5b87a94 --- /dev/null +++ b/dil/src/test/resources/util/parameter-jsonarray.json @@ -0,0 +1 @@ +[{"name":"jsonarray_parameter","type":"jsonarray","value":[{"jsonkey": "jsonvalue"}, 
{"jsonkey": "jsonvalue"}]}] \ No newline at end of file diff --git a/dil/src/test/resources/util/sample-json-schema.json b/dil/src/test/resources/util/sample-json-schema.json new file mode 100644 index 0000000..855dc79 --- /dev/null +++ b/dil/src/test/resources/util/sample-json-schema.json @@ -0,0 +1,11 @@ +{ + "an": { + "type": "array", + "items": { + "type": "string" + } + }, + "data": { + "type": "string" + } +} diff --git a/dil/src/test/resources/util/sample-schema.json b/dil/src/test/resources/util/sample-schema.json new file mode 100644 index 0000000..b74f8bd --- /dev/null +++ b/dil/src/test/resources/util/sample-schema.json @@ -0,0 +1,18 @@ +[ + { + "columnName": "an", + "isNullable": false, + "dataType": { + "type": "array", + "name": "an", + "items": "string" + } + }, + { + "columnName": "data", + "isNullable": false, + "dataType": { + "type": "string" + } + } +] \ No newline at end of file diff --git a/dil/src/test/resources/util/sample.json b/dil/src/test/resources/util/sample.json new file mode 100644 index 0000000..36efbe0 --- /dev/null +++ b/dil/src/test/resources/util/sample.json @@ -0,0 +1,4 @@ +{ + "an": [ "arbitrarily", "nested" ], + "data": "structure" +} \ No newline at end of file diff --git a/flavored-build.gradle b/flavored-build.gradle new file mode 100644 index 0000000..cef335d --- /dev/null +++ b/flavored-build.gradle @@ -0,0 +1,13 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +// Check for build customizations +if (project.hasProperty('dilFlavor')) { + def dilFlavorFileName = project.projectDir.toString() + '/flavored-' + project.dilFlavor + '.gradle' + if (file(dilFlavorFileName).exists()) { + println "Using flavor:" + project.dilFlavor + " for project " + project.name + apply from: dilFlavorFileName + } +} + diff --git a/gradle.properties b/gradle.properties new file mode 100644 index 0000000..2941c8c --- /dev/null +++ b/gradle.properties @@ -0,0 +1,24 @@ +# Copyright 2021 LinkedIn Corporation. All rights reserved. +# Licensed under the BSD-2 Clause license. +# See LICENSE in the project root for license information. + +# Long-running Gradle process speeds up local builds +# To stop the daemon run 'ligradle --stop' +org.gradle.daemon=true + +# Configures only relevant projects to speed up the configuration of large projects +# Useful when specific project/task is invoked +org.gradle.configureondemand=true + +# Gradle will run tasks from subprojects in parallel +# Higher CPU usage, faster builds +org.gradle.parallel=true + +# Allows generation of idea/eclipse metadata for a specific subproject and its upstream project dependencies +ide.recursive=true + +version=0.0.1 +group=com.linkedin.dil +release=false + +org.gradle.jvmargs=-Xms512m -Xmx4096m \ No newline at end of file diff --git a/gradle/scripts/bintrayPublishing.gradle b/gradle/scripts/bintrayPublishing.gradle new file mode 100644 index 0000000..45e2547 --- /dev/null +++ b/gradle/scripts/bintrayPublishing.gradle @@ -0,0 +1,156 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +import org.gradle.api.internal.java.JavaLibrary + +ext.pomAttributes = { + packaging = 'jar' + // optionally artifactId can be defined here + name = 'Apache Gobblin (incubating)' + description = 'A distributed data integration framework for streaming and batch data ecosystems.' 
+ url = 'https://gobblin.apache.org/' + + scm { + url = 'https://git-wip-us.apache.org/repos/asf?p=incubator-gobblin.git;a=tree' + connection = 'scm:http://git-wip-us.apache.org/repos/asf/incubator-gobblin.git' + developerConnection = 'scm:https://git-wip-us.apache.org/repos/asf/incubator-gobblin.git' + } + + licenses { + license { + name = 'BSD 2-CLAUSE LICENSE' + url = 'http://www.opensource.org/licenses/bsd-license.php' + } + } +} + +// Default publication (mavenJava) to publish artifacts as maven modules +// Uses maven-publish plugin and generates the default jar, sources, javadoc +// and pom file (https://docs.gradle.org/current/userguide/publishing_maven.html) +subprojects{ + plugins.withType(JavaPlugin) { + plugins.apply('maven-publish') + publishing { + publications { + mavenJava(MavenPublication) { + from components.java + artifact sourcesJar + artifact javadocJar + + pom pomAttributes + + pom.withXml { + //Ensures that correct dependencies are in the pom when subproject declares a project dependency + // with specific target configuration (restClient, dataTemplate, etc.) + //Needed because pom model is lossy and does not carry the 'configuration' information + def dependenciesNode = it.asNode().dependencies[0] + def removed = [] as Set; def added = [] //helps auditing + configurations.runtime.allDependencies.each { d -> + def confToPom = ['restClient': 'rest-client', 'dataTemplate': 'data-template', 'tests': 'test'] + if (d instanceof ProjectDependency && confToPom.containsKey(d.targetConfiguration)) { + boolean dependsOnMainModule = configurations.runtime.allDependencies.any { + it.name == d.name && it.targetConfiguration == 'default' + } + if (!dependsOnMainModule) { + //subproject declares a dependency on target configuration (i.e. path: 'gobblin-rest-api', configuration: 'restClient') + // but does not declare a dependency on the 'default' artifact (i.e. 'gobblin-rest-api') + //we need to remove the 'default' artifact from the pom file + def mainModuleNode = dependenciesNode.find { it.artifactId.text() == d.name } + dependenciesNode.remove(mainModuleNode) + removed.add(d.name) + } + + //adding explicit dependency on the artifact that corresponds to given target configuration + // (i.e. 'gobblin-rest-api-rest-client') + def dependencyNode = dependenciesNode.appendNode('dependency') + dependencyNode.appendNode('groupId', d.group) + def newArtifactId = d.name + "-" + confToPom[d.targetConfiguration] + dependencyNode.appendNode('artifactId', newArtifactId) + dependencyNode.appendNode('version', d.version) + added.add(newArtifactId) + + //adding global exclude rules to the new dependency entries + if (configurations.runtime.excludeRules.size() > 0) { + def exclusionsNode = dependencyNode.appendNode('exclusions') + configurations.runtime.excludeRules.each { rule -> + def exclusionNode = exclusionsNode.appendNode('exclusion') + exclusionNode.appendNode('groupId', rule.group ?: "*") + exclusionNode.appendNode('artifactId', rule.module) + } + } + } + } + if (added || removed) { + logger.lifecycle("Updated pom dependencies in {}, added: {}, removed: {}", project.path, added, removed) + } + } + } + } + } + } + + // Using gradle bintray plugin to publish artifacts to Jfrog bintray + plugins.apply('com.jfrog.bintray') + bintray { + user = project.hasProperty('bintrayUser') ? project.property('bintrayUser') : System.getenv('BINTRAY_USER') + key = project.hasProperty('bintrayApiKey') ? 
project.property('bintrayApiKey') : System.getenv('BINTRAY_API_KEY') + publications = ["mavenJava"] + publish = true + override = project.hasProperty("bintray.override") //[Default: false] + dryRun = project.hasProperty("bintray.dryRun") //[Default: false] + pkg { + repo = 'maven' + name = 'gobblin-github' + userOrg = 'linkedin' + licenses = ['Apache-2.0'] + vcsUrl = 'https://github.com/apache/incubator-gobblin.git' + version { + name = project.version + desc = 'Apache Gobblin' + released = new Date() + // disable gpg signing to speed up publishing + gpg { + sign = false + } + // disable upload to maven central + mavenCentralSync { + sync = false + } + } + } + } + + tasks.bintrayUpload { + doFirst { + println "Running bintrayUpload for $project.name, publications: $project.bintray.publications" + } + dependsOn publishToMavenLocal + } + + ext.addPublicationToBintray = { pubName -> + project.bintray.publications += pubName + } + + ext.addRuntimeDependenciesToPom = { + def dependenciesNode = asNode().appendNode('dependencies') + configurations.runtime.allDependencies.each { + if (it.group != null && it.name != null) { + def dependencyNode = dependenciesNode.appendNode('dependency') + dependencyNode.appendNode('groupId', it.group) + dependencyNode.appendNode('artifactId', it.name) + dependencyNode.appendNode('version', it.version) + + if (it.excludeRules.size() > 0) { + def exclusionsNode = dependencyNode.appendNode('exclusions') + it.excludeRules.each { rule -> + def exclusionNode = exclusionsNode.appendNode('exclusion') + exclusionNode.appendNode('groupId', rule.group) + exclusionNode.appendNode('artifactId', rule.module) + } + } + } + } + } + +} \ No newline at end of file diff --git a/gradle/scripts/buildscript.gradle b/gradle/scripts/buildscript.gradle new file mode 100644 index 0000000..24db83d --- /dev/null +++ b/gradle/scripts/buildscript.gradle @@ -0,0 +1,17 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +repositories { + repositories { + maven { + // For gradle-nexus-plugin + url 'https://jcenter.bintray.com/' + } + } +} + +dependencies { + classpath 'org.gradle.api.plugins:gradle-nexus-plugin:0.7.1' + classpath 'com.fizzpod:gradle-sweeney-plugin:1.0+' +} \ No newline at end of file diff --git a/gradle/scripts/ci-support.gradle b/gradle/scripts/ci-support.gradle new file mode 100644 index 0000000..71062ed --- /dev/null +++ b/gradle/scripts/ci-support.gradle @@ -0,0 +1,56 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
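+// Defines the getGroupedTests task, which buckets subproject test tasks into Core/Service/Module/Other groups (switched on 'groupName') and prints the grouped task paths for CI to run.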
+ + +def corePaths = ["gobblin-binary-management", "gobblin-compaction", "gobblin-core", "gobblin-data-management", + "gobblin-hive-registration", "gobblin-runtime", "gobblin-yarn", "gobblin-metrics-libs", "gobblin-runtime-hadoop"] +def servicePaths = ["gobblin-api", "gobblin-rest-service", "gobblin-restli", "gobblin-service"] +def modulePaths = ["gobblin-modules"] + +task getGroupedTests { + doLast { + def taskNames = subprojects.findAll { + subproject -> subproject.tasks.hasProperty('test') + } + def includedGroups + switch(groupName) { + case "Core Tests": + includedGroups = taskNames.findAll {task -> + corePaths.any { + task.path.contains(it) + } + } + break; + case "Service Tests": + includedGroups = taskNames.findAll {task -> + servicePaths.any { + task.path.contains(it) + } + } + break; + case "Module Tests": + includedGroups = taskNames.findAll {task -> + modulePaths.any { + task.path.contains(it) + } + } + break; + case "Other Tests": + corePaths.addAll(servicePaths) + corePaths.addAll(modulePaths) + includedGroups = taskNames.findAll { task -> + !corePaths.any { + task.path.contains(it) + } + } + break; + default: + includedGroups = taskNames + break; + } + + def groupedTaskNames = includedGroups.collect { task -> task.tasks.findByName('test').getPath() } + println "CI Task: " + groupedTaskNames.join(" ") + } +} diff --git a/gradle/scripts/computeVersions.gradle b/gradle/scripts/computeVersions.gradle new file mode 100644 index 0000000..b850d18 --- /dev/null +++ b/gradle/scripts/computeVersions.gradle @@ -0,0 +1,56 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +if (!project.hasProperty('version') || project.version == 'unspecified') { + try { + exec { + commandLine 'git', 'fetch', '-t', 'https://github.com/apache/incubator-gobblin.git', 'master' + } + def versionOut = new ByteArrayOutputStream() + exec { + commandLine 'git', 'describe', '--tags', '--always' + standardOutput versionOut + } + def tagStr = versionOut.toString().trim() + println 'Using latest tag for version: ' + tagStr + if (tagStr.startsWith("gobblin_")) { + project.version = tagStr.substring(8) + } + else { + project.version = tagStr + } + } + catch (Exception e) { + logger.warn("Unable to determine version. Is this a git copy? Using 'unknown'."); + project.version = 'unknown' + } +} + +if (!project.hasProperty('pegasusVersion')) { + project.ext.pegasusVersion = '6.0.12' +} + +if (project.hasProperty('jdkVersion')) { + if (project.jdkVersion.equals(JavaVersion.VERSION_1_8.toString())) { + ext.javaVersion = JavaVersion.VERSION_1_8 + } else { + throw new RuntimeException("Unsupported JDK version: " + project.jdkVersion) + } +} + +// Hadoop validation +if (! 
hadoopVersion.startsWith('2.')) { + throw new GradleScriptException("Only Hadoop 2.x is supported: " + hadoopVersion); +} + +def gradleVersions = project.gradle.gradleVersion.tokenize(".") +ext.gradleVersionMajor = Integer.parseInt(gradleVersions[0]) +ext.gradleVersionMinor = Integer.parseInt(gradleVersions[1]) +println "Detected Gradle version major=" + gradleVersionMajor + " minor=" + gradleVersionMinor + +ext.dropwizardMetricsVersion = '3.2.3' +ext.findBugsVersion = '3.0.0' +ext.googleVersion = '1.22.0' +ext.slf4jVersion = '1.7.21' +ext.log4jVersion = '1.2.17' diff --git a/gradle/scripts/configureSubprojects.gradle b/gradle/scripts/configureSubprojects.gradle new file mode 100644 index 0000000..ee6d989 --- /dev/null +++ b/gradle/scripts/configureSubprojects.gradle @@ -0,0 +1,8 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +subprojects { + project.version = rootProject.version + project.group = rootProject.group +} diff --git a/gradle/scripts/defaultBuildProperties.gradle b/gradle/scripts/defaultBuildProperties.gradle new file mode 100644 index 0000000..3e81fd3 --- /dev/null +++ b/gradle/scripts/defaultBuildProperties.gradle @@ -0,0 +1,66 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +import org.apache.gobblin.gradle.BuildProperties +import org.apache.gobblin.gradle.BuildProperty + +def BuildProperties BUILD_PROPERTIES = new BuildProperties(project) + .register(new BuildProperty("sonatypeArtifactRepository", "https://oss.sonatype.org/service/local/staging/deploy/maven2/", "Maven repository to publish artifacts")) + .register(new BuildProperty("sonatypeArtifactSnapshotRepository", "https://oss.sonatype.org/content/repositories/snapshots/", "Maven repository to publish artifacts")) + .register(new BuildProperty("nexusArtifactRepository", "https://repository.apache.org/service/local/staging/deploy/maven2", "Maven repository to publish artifacts")) + .register(new BuildProperty("nexusArtifactSnapshotRepository", "https://repository.apache.org/content/repositories/snapshots", "Maven repository to publish artifacts")) + .register(new BuildProperty("gobblinVersion", "0.15.0-dev-9467", "Gobblin dependencies version")) + .register(new BuildProperty("avroVersion", "1.8.1", "Avro dependencies version")) + .register(new BuildProperty("awsVersion", "2.10.15", "AWS dependencies version")) + .register(new BuildProperty("bytemanVersion", "4.0.5", "Byteman dependencies version")) + .register(new BuildProperty("confluentVersion", "2.0.1", "Confluent dependencies version")) + .register(new BuildProperty("doNotSignArtifacts", false, "Do not sign Maven artifacts")) + .register(new BuildProperty("gobblinFlavor", "standard", "Build flavor (see http://gobblin.readthedocs.io/en/latest/developer-guide/GobblinModules/)")) + .register(new BuildProperty("hadoopVersion", "2.3.0", "Hadoop dependencies version")) + .register(new BuildProperty("hiveVersion", "1.0.1", "Hive dependencies version")) + .register(new BuildProperty("icebergVersion", "0.10.0", "Iceberg dependencies version")) + .register(new BuildProperty("jdkVersion", JavaVersion.VERSION_1_8.toString(), + "Java language compatibility; supported versions: " + JavaVersion.VERSION_1_8)) + .register(new BuildProperty("kafka08Version", "0.8.2.2", "Kafka 0.8 dependencies version")) + .register(new
BuildProperty("kafka09Version", "0.9.0.1", "Kafka 0.9 dependencies version")) + .register(new BuildProperty("kafka1Version", "1.1.1", "Kafka 1.1 dependencies version")) + .register(new BuildProperty("pegasusVersion", "29.6.4", "Pegasus dependencies version")) + .register(new BuildProperty("publishToMaven", false, "Enable publishing of artifacts to a central Maven repository")) + .register(new BuildProperty("publishToNexus", false, "Enable publishing of artifacts to Nexus")) + .register(new BuildProperty("salesforceVersion", "42.0.0", "Salesforce dependencies version")) + +task buildProperties(description: 'Lists main properties that can be used to customize the build') { + doLast { + BUILD_PROPERTIES.printHelp(); + } +} + +// Gobblin build customization +BUILD_PROPERTIES.ensureDefined('gobblinFlavor') + +// Compiler compatibility +BUILD_PROPERTIES.ensureDefined('jdkVersion') + +BUILD_PROPERTIES.ensureDefined('sonatypeArtifactRepository') +BUILD_PROPERTIES.ensureDefined('sonatypeArtifactSnapshotRepository') +BUILD_PROPERTIES.ensureDefined('nexusArtifactRepository') +BUILD_PROPERTIES.ensureDefined('nexusArtifactSnapshotRepository') +BUILD_PROPERTIES.ensureDefined('doNotSignArtifacts') + +// Library dependencies versions +BUILD_PROPERTIES.ensureDefined('gobblinVersion') +BUILD_PROPERTIES.ensureDefined('avroVersion') +BUILD_PROPERTIES.ensureDefined('awsVersion') +BUILD_PROPERTIES.ensureDefined('bytemanVersion') +BUILD_PROPERTIES.ensureDefined('confluentVersion') +BUILD_PROPERTIES.ensureDefined('hadoopVersion') +BUILD_PROPERTIES.ensureDefined('hiveVersion') +BUILD_PROPERTIES.ensureDefined('icebergVersion') +BUILD_PROPERTIES.ensureDefined('kafka08Version') +BUILD_PROPERTIES.ensureDefined('kafka09Version') +BUILD_PROPERTIES.ensureDefined('kafka1Version') +BUILD_PROPERTIES.ensureDefined('pegasusVersion') +BUILD_PROPERTIES.ensureDefined('salesforceVersion') + +ext.buildProperties = BUILD_PROPERTIES diff --git a/gradle/scripts/dependencyDefinitions.gradle b/gradle/scripts/dependencyDefinitions.gradle new file mode 100644 index 0000000..2fb5af0 --- /dev/null +++ b/gradle/scripts/dependencyDefinitions.gradle @@ -0,0 +1,232 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +dependencyManagement { + imports { + mavenBom 'com.amazonaws:aws-java-sdk-bom:1.11.8' + } +} + +ext.externalDependency = [ + "antlrRuntime": "org.antlr:antlr-runtime:3.5.2", + "avro": "org.apache.avro:avro:" + avroVersion, + "avroMapredH2": "org.apache.avro:avro-mapred:" + avroVersion, + "awsCore": "software.amazon.awssdk:aws-core:" + awsVersion, + "awsApacheHttp": "software.amazon.awssdk:apache-client:" + awsVersion, + "awsHttpSpi": "software.amazon.awssdk:http-client-spi:" + awsVersion, + "awsS3": "software.amazon.awssdk:s3:" + awsVersion, + "awsUtils": "software.amazon.awssdk:utils:" + awsVersion, + "commonsCli": "commons-cli:commons-cli:1.3.1", + "commonsCodec": "commons-codec:commons-codec:1.10", + "commonsDbcp": "commons-dbcp:commons-dbcp:1.4", + "commonsEmail": "org.apache.commons:commons-email:1.4", + "commonsLang": "commons-lang:commons-lang:2.6", + "commonsLang3": "org.apache.commons:commons-lang3:3.4", + "commonsConfiguration": "commons-configuration:commons-configuration:1.10", + "commonsIo": "commons-io:commons-io:2.5", + "commonsMath": "org.apache.commons:commons-math3:3.5", + "commonsHttpClient": "commons-httpclient:commons-httpclient:3.1", + "commonsCompress":"org.apache.commons:commons-compress:1.10", + "commonsPool": "org.apache.commons:commons-pool2:2.4.2", + "datanucleusCore": "org.datanucleus:datanucleus-core:3.2.10", + "datanucleusRdbms": "org.datanucleus:datanucleus-rdbms:3.2.9", + "eventhub": "com.microsoft.azure:azure-eventhubs:0.9.0", + "guava": "com.google.guava:guava:15.0", + "groovy": "org.codehaus.groovy:groovy:2.4.8", + "gson": "com.google.code.gson:gson:2.6.2", + "findBugsAnnotations": "com.google.code.findbugs:jsr305:" + findBugsVersion, + "hadoopCommon": "org.apache.hadoop:hadoop-common:" + hadoopVersion, + "hadoopClientCore": "org.apache.hadoop:hadoop-mapreduce-client-core:" + hadoopVersion, + "hadoopClientCommon": "org.apache.hadoop:hadoop-mapreduce-client-common:" + hadoopVersion, + "hadoopHdfs": "org.apache.hadoop:hadoop-hdfs:" + hadoopVersion, + "hadoopAuth": "org.apache.hadoop:hadoop-auth:" + hadoopVersion, + "hadoopYarnApi": "org.apache.hadoop:hadoop-yarn-api:" + hadoopVersion, + "hadoopYarnCommon": "org.apache.hadoop:hadoop-yarn-common:" + hadoopVersion, + "hadoopYarnClient": "org.apache.hadoop:hadoop-yarn-client:" + hadoopVersion, + "hadoopYarnMiniCluster": "org.apache.hadoop:hadoop-minicluster:" + hadoopVersion, + "hadoopAnnotations": "org.apache.hadoop:hadoop-annotations:" + hadoopVersion, + "hadoopAws": "org.apache.hadoop:hadoop-aws:2.6.0", + "hdrHistogram": "org.hdrhistogram:HdrHistogram:2.1.11", + "helix": "org.apache.helix:helix-core:0.9.4", + "hiveCommon": "org.apache.hive:hive-common:" + hiveVersion, + "hiveService": "org.apache.hive:hive-service:" + hiveVersion, + "hiveJdbc": "org.apache.hive:hive-jdbc:" + hiveVersion, + "hiveMetastore": "org.apache.hive:hive-metastore:" + hiveVersion, + "hiveExec": "org.apache.hive:hive-exec:" + hiveVersion + ":core", + "hiveSerDe": "org.apache.hive:hive-serde:" + hiveVersion, + "hiveStorageApi": "org.apache.hive:hive-storage-api:2.4.0", + "httpclient": "org.apache.httpcomponents:httpclient:4.5.2", + "httpmime": "org.apache.httpcomponents:httpmime:4.5.2", + "httpcore": "org.apache.httpcomponents:httpcore:4.4.11", + "httpasyncclient": "org.apache.httpcomponents:httpasyncclient:4.1.3", + "icebergHive": "org.apache.iceberg:iceberg-hive-runtime:" + icebergVersion, + "jgit": "org.eclipse.jgit:org.eclipse.jgit:5.1.1.201809181055-r", + "jmh": "org.openjdk.jmh:jmh-core:1.17.3", + "jmhAnnotations": 
"org.openjdk.jmh:jmh-generator-annprocess:1.17.3", + "jollyday": "de.jollyday:jollyday:0.4.9", + "kafka08": "org.apache.kafka:kafka_2.11:" + kafka08Version, + "kafka08Test": "org.apache.kafka:kafka_2.11:" + kafka08Version + ":test", + "kafka08Client": "org.apache.kafka:kafka-clients:" + kafka08Version, + "kafka09": "org.apache.kafka:kafka_2.11:" + kafka09Version, + "kafka09Test": "org.apache.kafka:kafka_2.11:" + kafka09Version + ":test", + "kafka09Client": "org.apache.kafka:kafka-clients:" + kafka09Version, + "kafka1": "org.apache.kafka:kafka_2.11:" + kafka1Version, + "kafka1Test": "org.apache.kafka:kafka_2.11:" + kafka1Version + ":test", + "kafka1Client": "org.apache.kafka:kafka-clients:" + kafka1Version, + "kafka1ClientTest": "org.apache.kafka:kafka-clients:" + kafka1Version + ":test", + "confluentSchemaRegistryClient": "io.confluent:kafka-schema-registry-client:" + confluentVersion, + "confluentAvroSerializer": "io.confluent:kafka-avro-serializer:" + confluentVersion, + "confluentJsonSerializer": "io.confluent:kafka-json-serializer:" + confluentVersion, + "zkClient": "com.101tec:zkclient:0.7", + "quartz": "org.quartz-scheduler:quartz:2.2.3", + "testng": "org.testng:testng:6.9.10", + "junit": "junit:junit:4.12", + "mockserver":"org.mock-server:mockserver-netty:3.10.4", + "jacksonCore": "org.codehaus.jackson:jackson-core-asl:1.9.13", + "jacksonMapper": "org.codehaus.jackson:jackson-mapper-asl:1.9.13", + "jasypt": "org.jasypt:jasypt:1.9.2", + "jodaTime": "joda-time:joda-time:2.9.3", + "jgrapht": "org.jgrapht:jgrapht-core:0.9.2", + "metricsCore": "io.dropwizard.metrics:metrics-core:" + dropwizardMetricsVersion, + "metricsJvm": "io.dropwizard.metrics:metrics-jvm:" + dropwizardMetricsVersion, + "metricsGraphite": "io.dropwizard.metrics:metrics-graphite:" + dropwizardMetricsVersion, + "jsch": "com.jcraft:jsch:0.1.54", + "jdo2": "javax.jdo:jdo2-api:2.1", + "azkaban": "com.linkedin.azkaban:azkaban:2.5.0", + "commonsVfs": "org.apache.commons:commons-vfs2:2.0", + "mysqlConnector": "mysql:mysql-connector-java:8.0.16", + "javaxInject": "javax.inject:javax.inject:1", + "guice": "com.google.inject:guice:4.0", + "guiceServlet": "com.google.inject.extensions:guice-servlet:4.0", + "derby": "org.apache.derby:derby:10.12.1.1", + "mockito": "org.mockito:mockito-core:1.10.19", + "powermock": "org.powermock:powermock-mockito-release-full:1.6.2", + "salesforceWsc": "com.force.api:force-wsc:" + salesforceVersion, + "salesforcePartner": "com.force.api:force-partner-api:" + salesforceVersion, + "scala": "org.scala-lang:scala-library:2.11.8", + "influxdbJava": "org.influxdb:influxdb-java:2.1", + "kryo": "com.esotericsoftware.kryo:kryo:2.22", + "libthrift":"org.apache.thrift:libthrift:0.9.3", + "lombok":"org.projectlombok:lombok:1.16.8", + "mockRunnerJdbc":"com.mockrunner:mockrunner-jdbc:1.0.8", + "xerces":"xerces:xercesImpl:2.11.0", + "typesafeConfig": "com.typesafe:config:1.2.1", + "byteman": "org.jboss.byteman:byteman:" + bytemanVersion, + "bytemanBmunit": "org.jboss.byteman:byteman-bmunit:" + bytemanVersion, + "bcpgJdk15on": "org.bouncycastle:bcpg-jdk15on:1.52", + "bcprovJdk15on": "org.bouncycastle:bcprov-jdk15on:1.52", + "calciteCore": "org.apache.calcite:calcite-core:1.16.0", + "calciteAvatica": "org.apache.calcite:calcite-avatica:1.13.0", + "jhyde": "org.pentaho:pentaho-aggdesigner-algorithm:5.1.5-jhyde", + "curatorFramework": "org.apache.curator:curator-framework:2.10.0", + "curatorRecipes": "org.apache.curator:curator-recipes:2.10.0", + "curatorClient": "org.apache.curator:curator-client:2.10.0", + 
"curatorTest": "org.apache.curator:curator-test:2.10.0", + "hamcrest": "org.hamcrest:hamcrest-all:1.3", + "joptSimple": "net.sf.jopt-simple:jopt-simple:4.9", + "protobuf": "com.google.protobuf:protobuf-java:3.6.1", + "pegasus" : [ + "data" : "com.linkedin.pegasus:data:" + pegasusVersion, + "generator" : "com.linkedin.pegasus:generator:" + pegasusVersion, + "restliClient" : "com.linkedin.pegasus:restli-client:" + pegasusVersion, + "restliServer" : "com.linkedin.pegasus:restli-server:" + pegasusVersion, + "restliTools" : "com.linkedin.pegasus:restli-tools:" + pegasusVersion, + "pegasusCommon" : "com.linkedin.pegasus:pegasus-common:" + pegasusVersion, + "restliCommon" : "com.linkedin.pegasus:restli-common:" + pegasusVersion, + "r2" : "com.linkedin.pegasus:r2:" + pegasusVersion, + "d2" : "com.linkedin.pegasus:d2:" + pegasusVersion, + "r2Netty" : "com.linkedin.pegasus:r2-netty:" + pegasusVersion, + "restliNettyStandalone" : "com.linkedin.pegasus:restli-netty-standalone:" + pegasusVersion, + "restliGuiceBridge": "com.linkedin.pegasus:restli-guice-bridge:" + pegasusVersion + ], + "jetty": [ + "org.eclipse.jetty:jetty-server:9.2.14.v20151106", + "org.eclipse.jetty:jetty-servlet:9.2.14.v20151106" + ], + "servlet-api": "javax.servlet:servlet-api:3.1.0", + "guavaretrying": "com.github.rholder:guava-retrying:2.0.0", + "jsonAssert": "org.skyscreamer:jsonassert:1.3.0", + "reflections" : "org.reflections:reflections:0.9.10", + "embeddedProcess": "de.flapdoodle.embed:de.flapdoodle.embed.process:1.50.2", + "testMysqlServer": "com.wix:wix-embedded-mysql:4.6.1", + "flyway": "org.flywaydb:flyway-core:3.2.1", + "oltu": "org.apache.oltu.oauth2:org.apache.oltu.oauth2.client:1.0.2", + "googleAnalytics": "com.google.apis:google-api-services-analytics:v3-rev134-1.22.0", + "googleDrive": "com.google.apis:google-api-services-drive:v3-rev42-1.22.0", + "googleWebmasters": "com.google.apis:google-api-services-webmasters:v3-rev17-1.22.0", + "googleHttpClient": "com.google.http-client:google-http-client:" + googleVersion, + "googleHttpClientGson": "com.google.http-client:google-http-client-gson:" + googleVersion, + "googleOauthClient": "com.google.oauth-client:google-oauth-client:" + googleVersion, + "googleApiClient": "com.google.api-client:google-api-client:" + googleVersion, + "opencsv": "com.opencsv:opencsv:3.8", + "grok": "io.thekraken:grok:0.1.5", + "hadoopAdl" : "org.apache.hadoop:hadoop-azure-datalake:3.0.0-alpha2", + /** + * Avoiding conflicts with Hive 1.x versions existed in the classpath + */ + "orcMapreduce":"org.apache.orc:orc-mapreduce:1.6.5:nohive", + "orcCore": "org.apache.orc:orc-core:1.6.5:nohive", + "orcTools":"org.apache.orc:orc-tools:1.6.5", + 'parquet': 'org.apache.parquet:parquet-hadoop:1.11.0', + 'parquetAvro': 'org.apache.parquet:parquet-avro:1.11.0', + 'parquetProto': 'org.apache.parquet:parquet-protobuf:1.11.0', + 'parquetHadoop': 'org.apache.parquet:parquet-hadoop-bundle:1.11.0', + 'reactivex': 'io.reactivex.rxjava2:rxjava:2.1.0', + "slf4j": [ + "org.slf4j:slf4j-api:" + slf4jVersion, + "org.slf4j:log4j-over-slf4j:" + slf4jVersion, + "org.slf4j:jcl-over-slf4j:" + slf4jVersion + ], + "log4j": [ + "log4j:log4j:" + log4jVersion, + "log4j:apache-log4j-extras:" + log4jVersion + ], + "slf4jToLog4j":[ + "org.slf4j:slf4j-log4j12:" + slf4jVersion + ], + "postgresConnector": "org.postgresql:postgresql:42.1.4", + "assertj": 'org.assertj:assertj-core:3.8.0', + "jmockit": "org.jmockit:jmockit:1.30", + "mockito-core": "org.mockito:mockito-core:1.10.19", + "powermock-core": 
"org.powermock:powermock-core:1.6.5", + "powermock-api-mockito": "org.powermock:powermock-api-mockito:1.6.5", + "powermock-module-testng": "org.powermock:powermock-module-testng:1.6.5", + + "gobblin-api": "org.apache.gobblin:gobblin-api:" + gobblinVersion, + "gobblin-azkaban": "org.apache.gobblin:gobblin-azkaban:" + gobblinVersion, + "gobblin-core": "org.apache.gobblin:gobblin-core:" + gobblinVersion, + "gobblin-core-base": "org.apache.gobblin:gobblin-core-base:" + gobblinVersion, + "gobblin-crypto": "org.apache.gobblin:gobblin-crypto:" + gobblinVersion, + "gobblin-crypto-provider": "org.apache.gobblin:gobblin-crypto-provider:" + gobblinVersion, + "gobblin-data-management": "org.apache.gobblin:gobblin-data-management:" + gobblinVersion, + "gobblin-flow-config-service-api": "org.apache.gobblin:gobblin-data-management:" + gobblinVersion, + "gobblin-helix": "org.apache.gobblin:gobblin-hive-registration:" + gobblinVersion, + "gobblin-hive-registration": "org.apache.gobblin:gobblin-hive-registration:" + gobblinVersion, + "gobblin-http": "org.apache.gobblin:gobblin-http:" + gobblinVersion, + "gobblin-kafka-08": "org.apache.gobblin:gobblin-kafka-08:" + gobblinVersion, + "gobblin-kafka-09": "org.apache.gobblin:gobblin-kafka-09:" + gobblinVersion, + "gobblin-kafka-common": "org.apache.gobblin:gobblin-kafka-common:" + gobblinVersion, + "gobblin-metadata": "org.apache.gobblin:gobblin-metadata:" + gobblinVersion, + "gobblin-metastore": "org.apache.gobblin:gobblin-metastore:" + gobblinVersion, + "gobblin-metrics": "org.apache.gobblin:gobblin-metrics:" + gobblinVersion, + "gobblin-rest-api": "org.apache.gobblin:gobblin-rest-api:" + gobblinVersion, + "gobblin-rest-api-data-template": "org.apache.gobblin:gobblin-rest-api-data-template:" + gobblinVersion, + "gobblin-runtime": "org.apache.gobblin:gobblin-runtime:" + gobblinVersion, + "gobblin-salesforce": "org.apache.gobblin:gobblin-runtime:" + gobblinVersion, + "gobblin-sql": "org.apache.gobblin:gobblin-runtime:" + gobblinVersion, + "gobblin-utility": "org.apache.gobblin:gobblin-utility:" + gobblinVersion, + + "okhttp": "com.squareup.okhttp3:okhttp:4.8.0", + "commonsValidator": "commons-validator:commons-validator:1.6" +] + +if (!isDefaultEnvironment) +{ + ext.externalDependency.each { overrideDepKey, overrideDepValue -> + if (externalDependency[overrideDepKey] != null) + { + externalDependency[overrideDepKey] = overrideDepValue + } + } +} diff --git a/gradle/scripts/environment.gradle b/gradle/scripts/environment.gradle new file mode 100644 index 0000000..e9f0d84 --- /dev/null +++ b/gradle/scripts/environment.gradle @@ -0,0 +1,14 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +ext.build_script_dir = "${projectDir.path}/build_script" +ext.isDefaultEnvironment = !project.hasProperty('overrideBuildEnvironment') + +File getEnvironmentScript() { + final File env = file(isDefaultEnvironment ? 'defaultEnvironment.gradle' : project.overrideBuildEnvironment) + assert env.isFile() : "The environment script [$env] does not exists or is not a file." + return env +} + +apply from: environmentScript diff --git a/gradle/scripts/globalDependencies.gradle b/gradle/scripts/globalDependencies.gradle new file mode 100644 index 0000000..456d989 --- /dev/null +++ b/gradle/scripts/globalDependencies.gradle @@ -0,0 +1,65 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. 
+// See LICENSE in the project root for license information. + +import javax.tools.ToolProvider + + +subprojects { + plugins.withType(JavaPlugin) { + configurations { + customTestCompile + compile + dependencies { + if (!project.name.contains('gobblin-elasticsearch-deps')) { + compile(externalDependency.hadoopCommon) { + exclude module: 'servlet-api' + } + compile externalDependency.hadoopClientCore + compile externalDependency.hadoopAnnotations + if (project.name.equals('gobblin-runtime') || project.name.equals('gobblin-test')) { + compile externalDependency.hadoopClientCommon + } + compile(externalDependency.guava) { + force = true + } + } + compile(externalDependency.commonsCodec) { + force = true + } + //Since testCompile inherits from compile, we cannot use testCompile to import the log4j dependency + customTestCompile externalDependency.log4j + customTestCompile externalDependency.slf4jToLog4j + + // Add the JDK's tools jar, which is needed to run byteman tests. + testCompile (files(((URLClassLoader) ToolProvider.getSystemToolClassLoader()).getURLs())) + } + if (!project.name.contains('gobblin-aws') && !project.name.contains('gobblin-distribution')) { + configurations.compile.dependencies*.each { + //exclude this jar because we introduce log4j-over-slf4j; these two jars cannot be present at the same time + it.exclude group: 'org.slf4j', module: 'slf4j-log4j12' + //exclude log4j related jars to provide a clean log environment + it.exclude group: 'log4j', module: 'log4j' + it.exclude group: 'log4j', module: 'apache-log4j-extras' + } + } + all*.exclude group: 'org.apache.calcite', module: 'calcite-avatica' // replaced by org.apache.calcite.avatica:avatica-core + //exclude this jar from the test class path because we use the log4j implementation in tests + testCompile.exclude group: 'org.slf4j', module: 'log4j-over-slf4j' + } + test{ + //Add log4j into runtime path + classpath += configurations.customTestCompile + //Add log4j into compile path + sourceSets.test.compileClasspath += configurations.customTestCompile + } + } + + dependencies { + // Gradle 5 compatibility + compileOnly externalDependency.lombok + testCompileOnly externalDependency.lombok + annotationProcessor externalDependency.lombok + testAnnotationProcessor externalDependency.lombok + } +} diff --git a/gradle/scripts/idesSetup.gradle b/gradle/scripts/idesSetup.gradle new file mode 100644 index 0000000..250df0f --- /dev/null +++ b/gradle/scripts/idesSetup.gradle @@ -0,0 +1,25 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +apply plugin: 'idea' + +subprojects { + plugins.withType(JavaPlugin) { + plugins.apply('idea') + plugins.apply('eclipse') + + // Configure the IDEA plugin to (1) add the codegen as source dirs and (2) work around + // an apparent bug in the plugin which doesn't set the outputDir/testOutputDir as documented + idea.module { + // Gradle docs claim the two settings below are the default, but + // the actual defaults appear to be "out/production/$MODULE_NAME" + // and "out/test/$MODULE_NAME". Changing it so IDEA and gradle share + // the class output directory.
+ + outputDir = file(sourceSets.main.output.classesDirs.getSingleFile()) + testOutputDir = file(sourceSets.test.output.classesDirs.getSingleFile()) + } + + } +} diff --git a/gradle/scripts/jacoco-coveralls-support.gradle b/gradle/scripts/jacoco-coveralls-support.gradle new file mode 100644 index 0000000..a6977dd --- /dev/null +++ b/gradle/scripts/jacoco-coveralls-support.gradle @@ -0,0 +1,40 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +/** + * Adds integration with the Gradle JaCoCo Plugin + */ + +apply plugin: 'jacoco' + +ext.jacocoVersion = '0.8.1' + +jacoco { + toolVersion = jacocoVersion +} + +// Add the jacoco plugin to each project +subprojects { + // Can't figure out to to get the coveralls plugin to work if only some subprojects have the Java plugin, so for now the Java plugin is applied to all sub-projects + plugins.apply('java') + plugins.apply('jacoco') + + jacoco { + toolVersion = jacocoVersion + } + + jacocoTestReport { + reports { + html.enabled = true + xml.enabled = true + csv.enabled = false + } + + afterEvaluate { + classDirectories = files(classDirectories.files.collect { + fileTree(dir: it, exclude: 'helix-*.jar') + }) + } + } +} diff --git a/gradle/scripts/javaPlugin.gradle b/gradle/scripts/javaPlugin.gradle new file mode 100644 index 0000000..01238e7 --- /dev/null +++ b/gradle/scripts/javaPlugin.gradle @@ -0,0 +1,13 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +subprojects { + plugins.withType(JavaPlugin) { + sourceCompatibility = javaVersion + configurations { + // guava-jdk5 conflicts with guava libraries + all*.exclude module: 'guava-jdk5' + } + } +} diff --git a/gradle/scripts/javaVersionCheck.gradle b/gradle/scripts/javaVersionCheck.gradle new file mode 100644 index 0000000..98e4fca --- /dev/null +++ b/gradle/scripts/javaVersionCheck.gradle @@ -0,0 +1,17 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +gradle.taskGraph.whenReady { + gradle.taskGraph.allTasks.each { task -> + def taskProject = task.project + if (taskProject.hasProperty('requiresJavaVersion') && !taskProject.requiresJavaVersion.equals(javaVersion)) { + logger.warn("WARNING: Project {} requires Java version {} which conflicts with build version {}. COMPILATION DISABLED. Please use -PjdkVersion={} .", + taskProject.name, + taskProject.requiresJavaVersion, + javaVersion, + taskProject.requiresJavaVersion) + task.onlyIf { false } + } + } +} \ No newline at end of file diff --git a/gradle/scripts/javadoc.gradle b/gradle/scripts/javadoc.gradle new file mode 100644 index 0000000..2303c22 --- /dev/null +++ b/gradle/scripts/javadoc.gradle @@ -0,0 +1,97 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +task javadocTarball(type: Tar) { + baseName = "gobblin-javadoc-all" + destinationDir = new File(project.buildDir, baseName) + compression = Compression.GZIP + extension = 'tgz' + description = "Generates a tar-ball with all javadocs to ${destinationDir}/${archiveName}" + doLast { + def indexFile = new File(destinationDir, "index.md") + def version = rootProject.ext.javadocVersion + indexFile << """---- + layout: page + title: Gobblin Javadoc packages ${version} + permalink: /javadoc/${version}/ + ---- + + """ + rootProject.ext.javadocPackages.each { + indexFile << "* [${it}](${it})\n" + } + } +} + +// Javadoc initialization for subprojects +ext.javadocVersion = null != project.version ? project.version.toString() : "latest" +if (ext.javadocVersion.indexOf('-') > 0) { + // Remove any "-" addons from the version + ext.javadocVersion = javadocVersion.substring(0, javadocVersion.indexOf('-')) +} + +ext.javadocPackages = new HashSet() +subprojects.each{Project pr -> + if (file(pr.projectDir.absolutePath + "/src/main/java").exists()) { + rootProject.ext.javadocPackages += pr.name + } +} + +subprojects { + plugins.withType(JavaPlugin) { + + // Sometimes generating javadocs can lead to OOM. This may needs to be increased. + // Also force javadocs to pick up system proxy settings if available + javadoc { + options.encoding = 'UTF-8'; + options.jFlags('-Xmx256m', '-Djava.net.useSystemProxies=true'); + } + + rootProject.tasks.javadocTarball.dependsOn project.tasks.javadoc + if ( rootProject.ext.javadocPackages.contains(project.name)) { + rootProject.tasks.javadocTarball.into(project.name){from(fileTree(dir: "${project.buildDir}/docs/javadoc/"))} + } + } +} + +subprojects { + plugins.withType(JavaPlugin) { + if (isDefaultEnvironment) { + task javadocJar(type: Jar) { + from javadoc + classifier = 'javadoc' + } + artifacts { archives javadocJar } + } + + // Add standard javadoc repositories so we can reference classes in them using @link + tasks.javadoc.options.links "http://typesafehub.github.io/config/latest/api/", + "https://docs.oracle.com/javase/7/docs/api/", + "http://google.github.io/guava/releases/15.0/api/docs/", + "http://hadoop.apache.org/docs/r${rootProject.ext.hadoopVersion}/api/", + "https://hive.apache.org/javadocs/r${rootProject.ext.hiveVersion}/api/", + "http://avro.apache.org/docs/${avroVersion}/api/java/", + "https://dropwizard.github.io/metrics/${dropwizardMetricsVersion}/apidocs/" + } +} + +gradle.projectsEvaluated { + subprojects { + plugins.withType(JavaPlugin) { + getAllDependentProjects(project).each { + tasks.javadoc.options.linksOffline "http://linkedin.github.io/gobblin/javadoc/${javadocVersion}/${it.name}/", + "${rootProject.buildDir}/${it.name}/docs/javadoc/" + } + } + } +} + +//Turn off javadoc lint for Java 8+ +if (JavaVersion.current().isJava8Compatible()) { + allprojects { + tasks.withType(Javadoc) { + options.addStringOption('Xdoclint:none', '-quiet') + } + } +} diff --git a/gradle/scripts/mavenPublishing.gradle b/gradle/scripts/mavenPublishing.gradle new file mode 100644 index 0000000..d35a00b --- /dev/null +++ b/gradle/scripts/mavenPublishing.gradle @@ -0,0 +1,107 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +import java.util.concurrent.atomic.AtomicBoolean +import java.util.concurrent.locks.ReentrantLock + +// Set group for Maven +allprojects { + if (!project.hasProperty('group') || project.group.length() == 0) { + project.ext.group = 'org.apache.gobblin' + project.group = project.ext.group + } +} + +// Maven publishing +ext.buildProperties.ensureDefined('publishToMaven') +if (ext.publishToMaven) { + plugins.apply('maven') + // Workaround for a bug in gradle's "maven" plugin. See https://discuss.gradle.org/t/error-in-parallel-build/7215/3 + project.setProperty("org.gradle.parallel", "false") +} + +ext.signArtifacts = !project.doNotSignArtifacts + +// Maven POM generation is not thread safe, so serialize all the Upload tasks we can use `--parallel`. +// https://issues.gradle.org/browse/GRADLE-2492 +// When we start building with 2.3 and later we should remove this and just add a common output dir for all tasks and let Gradle serialize them +def lock = new ReentrantLock() +def available = lock.newCondition() +def busy = new AtomicBoolean() +def serializedTasks = [] +allprojects { + tasks.matching { it.name == "generatePom" || it instanceof Upload }.all { + serializedTasks << it + doFirst { + lock.lock() + while (busy.get()) { + available.await() + } + busy.set(true) + } + } +} + +gradle.taskGraph.afterTask { + if (it in serializedTasks && lock.heldByCurrentThread) { + busy.set(false) + available.signal() + lock.unlock() + } +} + +subprojects { + plugins.withType(JavaPlugin) { + plugins.apply('maven') + + install { + repositories { + mavenInstaller { + mavenLocal() + pom.project { + packaging 'jar' + name 'Apache Gobblin (incubating)' + description 'A distributed data integration framework for streaming and batch data ecosystems.' + url 'https://gobblin.apache.org/' + } + } + } + } + + // Publishing of maven artifacts for subprojects + if (rootProject.ext.publishToMaven) { + if (rootProject.ext.signArtifacts) { + plugins.apply('signing') + } + + uploadArchives { + repositories { + mavenDeployer { + beforeDeployment { MavenDeployment deployment -> + if (rootProject.ext.signArtifacts) { + signing.signPom(deployment) + } + } + + repository(url: rootProject.sonatypeArtifactRepository) { + authentication(userName: rootProject.ext.ossrhUsername, password: rootProject.ext.ossrhPassword) + } + + snapshotRepository(url: rootProject.sonatypeArtifactSnapshotRepository) { + authentication(userName: rootProject.ext.ossrhUsername, password: rootProject.ext.ossrhPassword) + } + + pom.project pomAttributes + } + } + } + + if (rootProject.ext.signArtifacts) { + signing { + sign configurations.archives + } + } + } + } +} diff --git a/gradle/scripts/nexusPublishing.gradle b/gradle/scripts/nexusPublishing.gradle new file mode 100644 index 0000000..e811882 --- /dev/null +++ b/gradle/scripts/nexusPublishing.gradle @@ -0,0 +1,128 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +import java.util.concurrent.atomic.AtomicBoolean +import java.util.concurrent.locks.ReentrantLock + +// Set group for Maven +allprojects { + if (!project.hasProperty('group') || project.group.length() == 0) { + project.ext.group = 'org.apache.gobblin' + project.group = project.ext.group + } +} + +// Nexus publishing +ext.buildProperties.ensureDefined('publishToNexus') +if (ext.publishToNexus) { + plugins.apply('maven') + // Workaround for a bug in gradle's "maven" plugin. 
See https://discuss.gradle.org/t/error-in-parallel-build/7215/3 + project.setProperty("org.gradle.parallel", "false") +} + +ext.signArtifacts = !project.doNotSignArtifacts + +// Maven POM generation is not thread safe, so serialize all the Upload tasks we can use `--parallel`. +// https://issues.gradle.org/browse/GRADLE-2492 +// When we start building with 2.3 and later we should remove this and just add a common output dir for all tasks and let Gradle serialize them +def lock = new ReentrantLock() +def available = lock.newCondition() +def busy = new AtomicBoolean() +def serializedTasks = [] +allprojects { + tasks.matching { it.name == "generatePom" || it instanceof Upload }.all { + serializedTasks << it + doFirst { + lock.lock() + while (busy.get()) { + available.await() + } + busy.set(true) + } + } +} + +gradle.taskGraph.afterTask { + if (it in serializedTasks && lock.heldByCurrentThread) { + busy.set(false) + available.signal() + lock.unlock() + } +} + +ext.pomAttributes = { + packaging 'jar' + // optionally artifactId can be defined here + name 'Apache Gobblin (incubating)' + description 'A distributed data integration framework for streaming and batch data ecosystems.' + url 'https://gobblin.apache.org/' + + scm { + url 'https://git-wip-us.apache.org/repos/asf?p=incubator-gobblin.git;a=tree' + connection 'scm:http://git-wip-us.apache.org/repos/asf/incubator-gobblin.git' + developerConnection 'scm:https://git-wip-us.apache.org/repos/asf/incubator-gobblin.git' + } + + licenses { + license { + name 'The Apache License, Version 2.0' + url 'http://www.apache.org/licenses/LICENSE-2.0.txt' + } + } +} + +subprojects { + plugins.withType(JavaPlugin) { + plugins.apply('maven') + + install { + repositories { + mavenInstaller { + mavenLocal() + pom.project { + packaging 'jar' + name 'Apache Gobblin (incubating)' + description 'A distributed data integration framework for streaming and batch data ecosystems.' + url 'https://gobblin.apache.org/' + } + } + } + } + + // Publishing of maven artifacts for subprojects + if (rootProject.ext.publishToNexus) { + if (rootProject.ext.signArtifacts) { + plugins.apply('signing') + } + + uploadArchives { + repositories { + mavenDeployer { + beforeDeployment { MavenDeployment deployment -> + if (rootProject.ext.signArtifacts) { + signing.signPom(deployment) + } + } + + repository(url: rootProject.nexusArtifactRepository) { + authentication(userName: rootProject.ext.nexusUsername, password: rootProject.ext.nexusPassword) + } + + snapshotRepository(url: rootProject.nexusArtifactSnapshotRepository) { + authentication(userName: rootProject.ext.nexusUsername, password: rootProject.ext.nexusPassword) + } + + pom.project pomAttributes + } + } + } + + if (rootProject.ext.signArtifacts) { + signing { + sign configurations.archives + } + } + } + } +} diff --git a/gradle/scripts/release.gradle b/gradle/scripts/release.gradle new file mode 100644 index 0000000..3badd61 --- /dev/null +++ b/gradle/scripts/release.gradle @@ -0,0 +1,164 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +project(':') { + apply plugin: 'base' + apply plugin: 'signing' +} + +def isRelease = ext.release.toBoolean() + +def releaseVersion = project.version +println String.format("Release Version: %s", releaseVersion) + +// Modify the gradle.properties to indicate whether this is a release. 
This results in the +// source releases generating artifacts without -SNAPSHOT appended to the version when they are +// built. +task prepare_release_config() { + project.copy { + from "$rootDir/gradle.properties" + into "$rootDir" + rename { filename -> + filename + ".release" + } + filter { line -> + if (isRelease && line.startsWith("release=")) { + "release=true" + } else { + line + } + } + } +} + +task sourceRelease(type: Tar, dependsOn: prepare_release_config) { + description = "Build a source release, specifically excluding the build directories and gradle wrapper files" + compression = Compression.GZIP + + baseName "gobblin-connectors-sources-${releaseVersion}" + + from(project.rootDir) { + // don't include gradle.properties because we use a modified version + exclude "gradle.properties" + exclude '**/build' + exclude '.gradle' + // including gradlew but excluding related jar because of: https://issues.apache.org/jira/browse/LEGAL-288 + // instructions for downloading the gradle-wrapper.jar have been added to the README + exclude 'gradle/wrapper/gradle-wrapper.jar' + exclude '.github' + exclude 'maven-sonatype' + exclude 'travis' + exclude '.classpath*' + exclude '.project*' + exclude '.settings' + exclude '**/.idea' + exclude '**/*.iml' + exclude '**/*.iws' + exclude '**/*.ipr' + exclude '**/.classpath' + exclude '**/.project' + exclude '**/.settings' + exclude '**/*.swp' + exclude '**/*.swo' + exclude '**/*.log' + exclude '**/build/' + exclude '**/.gradle' + exclude 'test-output' + exclude '**/test-output' + exclude 'dist' + exclude 'target' + exclude 'tmp' + exclude 'out' + exclude '**/out' + exclude 'output' + exclude 'gobblin-test/basicTest' + exclude 'gobblin-test/jobOutput' + exclude 'gobblin-test/state-store' + exclude 'gobblin-tesTaskt/metrics' + exclude 'gobblin-test/byteman' + exclude 'gobblin-test/locks' + exclude 'gobblin-test/mr-jobs' + exclude '**/mainGeneratedDataTemplate' + exclude '**/mainGeneratedRest' + exclude 'gobblin-dist' + exclude 'metastore_db' + exclude 'GobblinKey_*.pem' + exclude 'node_modules' + exclude 'package-lock.json' + exclude '**/gen-java' + exclude '**/generated-gobblin-cluster.conf' + exclude 'gobblin-modules/gobblin-couchbase/mock-couchbase' + exclude 'gobblin-modules/gobblin-elasticsearch/test-elasticsearch' + } + + // rename gradle.properties.release to gradle.properties + rename { filename -> + if (filename.equals("gradle.properties.release")) { + "gradle.properties" + } + else { + filename + } + } + + into(baseName) + + // Set destination directory. + destinationDir = file("${project.buildDir}/distribution/source") + + archiveName = "${baseName}.tgz" + doLast { + // Disable md5, since Apache release policy has changed + // ant.checksum file:"$destinationDir/$archiveName", algorithm: "MD5", fileext: ".md5" + ant.checksum file:"$destinationDir/$archiveName", algorithm: "SHA-512", fileext: ".sha512" + } +} + +// Signing requires a user ~/.gradle/gradle.properties file with signing information. +// See: http://www.gradle.org/docs/current/userguide/signing_plugin.html +signing { + // Sign the source archive if the proper configuration is in place to do so. Otherwise + // skip the signing process (it isn't required). This archive can be manually signed if + // needed. + required false + sign sourceRelease +} + +task release(dependsOn: signSourceRelease) + +// Publishing to Apache's Maven repository (Nexus). To install the archives in the +// local repository, run the 'install' task. 
+//subprojects { +// apply plugin: 'nexus' +// +// nexus { +// attachSources = false +// attachTests = false +// attachJavadoc = false +// sign = true +// repositoryUrl = 'https://repository.apache.org/service/local/staging/deploy/maven2' +// snapshotRepositoryUrl = 'https://repository.apache.org/content/repositories/snapshots' +// } +// +// modifyPom { +// project { +// name 'Apache Gobblin (incubating)' +// description 'A distributed data integration framework for streaming and batch data ecosystems.' +// url 'https://gobblin.apache.org/' +// +// scm { +// url 'https://git-wip-us.apache.org/repos/asf?p=incubator-gobblin.git;a=tree' +// connection 'scm:http://git-wip-us.apache.org/repos/asf/incubator-gobblin.git' +// developerConnection 'scm:https://git-wip-us.apache.org/repos/asf/incubator-gobblin.git' +// } +// +// licenses { +// license { +// name 'The Apache Software License, Version 2.0' +// url 'http://www.apache.org/licenses/LICENSE-2.0.txt' +// } +// } +// } +// } +//} \ No newline at end of file diff --git a/gradle/scripts/repositories.gradle b/gradle/scripts/repositories.gradle new file mode 100644 index 0000000..3a2efce --- /dev/null +++ b/gradle/scripts/repositories.gradle @@ -0,0 +1,25 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +repositories { + mavenCentral() + maven { + url "https://plugins.gradle.org/m2/" + } + maven { + url "http://packages.confluent.io/maven/" + } + maven { + url "https://linkedin.bintray.com/maven" + } + jcenter() +} + +try { + subprojects { + project.repositories.addAll(rootProject.repositories) + } +} catch (Throwable t) { + //nothing +} diff --git a/gradle/scripts/restli.gradle b/gradle/scripts/restli.gradle new file mode 100644 index 0000000..7eb9a72 --- /dev/null +++ b/gradle/scripts/restli.gradle @@ -0,0 +1,16 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +subprojects { + + afterEvaluate { + // add the standard pegasus dependencies wherever the plugin is used + if (project.plugins.hasPlugin('pegasus')) { + dependencies { + dataTemplateCompile externalDependency.pegasus.data + restClientCompile externalDependency.pegasus.restliClient,externalDependency.pegasus.restliCommon,externalDependency.pegasus.restliTools + } + } + } +} diff --git a/gradle/scripts/sourcesJar.gradle b/gradle/scripts/sourcesJar.gradle new file mode 100644 index 0000000..7afb676 --- /dev/null +++ b/gradle/scripts/sourcesJar.gradle @@ -0,0 +1,15 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +subprojects { + plugins.withType(JavaPlugin) { + if (isDefaultEnvironment) { + task sourcesJar(type: Jar, dependsOn: classes) { + from sourceSets.main.allSource + classifier = 'sources' + } + artifacts { archives sourcesJar } + } + } +} diff --git a/gradle/scripts/testSetup.gradle b/gradle/scripts/testSetup.gradle new file mode 100644 index 0000000..fc0ab31 --- /dev/null +++ b/gradle/scripts/testSetup.gradle @@ -0,0 +1,92 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +subprojects { + plugins.withType(JavaPlugin) { + plugins.apply('findbugs') + plugins.apply('checkstyle') + + test { + if (project.hasProperty("printTestOutput")) { + testLogging.showStandardStreams = true + } + useTestNG () { + excludeGroups 'ignore', 'performance' + if (project.hasProperty('skipTestGroup')) { + excludeGroups skipTestGroup + } + if (project.hasProperty('runTestGroups')) { + includeGroups project.runTestGroups + } + } + testLogging { + events "started","skipped","passed","failed" + exceptionFormat "full" + } + // Some tests require MySQL we can either download an embedded Wix image or use a pre-installed version + if (rootProject.hasProperty('usePreinstalledMysql') && Boolean.parseBoolean(rootProject.usePreinstalledMysql)) { + systemProperty 'gobblin.metastore.testing.embeddedMysqlEnabled', 'false' + } + } + + findbugs { + toolVersion = findBugsVersion + ignoreFailures = false + effort = "max" + sourceSets = [sourceSets.main] // Only analyze src/java/main, not src/java/test/ + // The exclude filter file must be under "ligradle/findbugs/" for internal compatibility with ligradle FindBugs + excludeFilter = file(rootProject.projectDir.path + "/quality/findbugsExclude.xml") + } + checkstyle { + toolVersion = '7.6.1' + configFile = rootProject.file('config/checkstyle/checkstyle.xml') + } + } + + tasks.withType(FindBugs) { + // Only one findbugs report can be enabled at a time. + // There are 3 kinds of reports in gobblin. + // 1. xml - when withFindBugsXmlReport is set + // 2. xml:withMessages - For Li internal builds + // 3. html - otherwise + if (project.hasProperty("withFindBugsXmlReport") && reports.find { it.name == "xml:withMessages"} == null) { + reports { + html.enabled = false + xml.enabled = true + } + } + else if (reports.find { it.name == "xml:withMessages"} == null) { + reports { + html.enabled = true + xml.enabled = false + } + } + } + + tasks.withType(Test) { + + // a collection to track failedTests + ext.failedTests = [] + + afterTest { descriptor, result -> + if (result.resultType == TestResult.ResultType.FAILURE) { + String failedTest = "${descriptor.className}::${descriptor.name}" + logger.debug("Adding " + failedTest + " to failedTests...") + failedTests << [failedTest] + } + } + + afterSuite { suite, result -> + if (!suite.parent) { // will match the outermost suite + // logs each failed test + if (!failedTests.empty) { + logger.lifecycle("Failed tests:") + failedTests.each { failedTest -> + logger.lifecycle("${failedTest}") + } + } + } + } + } +} diff --git a/gradle/scripts/utilities.gradle b/gradle/scripts/utilities.gradle new file mode 100644 index 0000000..44e9863 --- /dev/null +++ b/gradle/scripts/utilities.gradle @@ -0,0 +1,34 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. 
+ +def getAllDependentProjectsImpl(project) { + def projectDependencies = project.configurations.runtime.getAllDependencies().withType(ProjectDependency) + def dependentProjects = projectDependencies*.dependencyProject + if (dependentProjects.size() > 0) { + dependentProjects.each { dependentProjects += getAllDependentProjects(it) } + } + return dependentProjects.unique() +} + +ext.getAllDependentProjects = {getAllDependentProjectsImpl(it)} + +task dotProjectDependencies(description: 'List of gobblin project dependencies in dot format') { + doLast { + println "// ========= Start of project dependency graph ======= " + println "digraph project_dependencies {" + subprojects.each { Project project -> + def project_node_name = project.name.replaceAll("-", "_") + if (project.configurations.findByName("compile") != null) { + project.configurations.compile.dependencies.each { Dependency dep -> + if (dep instanceof ProjectDependency) { + def dep_node_name = dep.dependencyProject.name.replaceAll("-", "_") + println "\t${project_node_name} -> ${dep_node_name};" + } + } + } + } + println "}" + println "// ========= End of project dependency graph ======= " + } +} diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar new file mode 100644 index 0000000..ca78035 Binary files /dev/null and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 0000000..05a606e --- /dev/null +++ b/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,10 @@ +# Copyright 2021 LinkedIn Corporation. All rights reserved. +# Licensed under the BSD-2 Clause license. +# See LICENSE in the project root for license information. + +#Mon Aug 13 16:50:54 PDT 2018 +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-5.6.4-bin.zip diff --git a/gradlew b/gradlew new file mode 100755 index 0000000..07004a2 --- /dev/null +++ b/gradlew @@ -0,0 +1,161 @@ +#!/usr/bin/env bash +# Copyright 2021 LinkedIn Corporation. All rights reserved. +# Licensed under the BSD-2 Clause license. +# See LICENSE in the project root for license information. + +# Attempt to set APP_HOME +# Resolve links: $0 may be a link +PRG="$0" +# Need this for relative symlinks. +while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`"/$link" + fi +done +SAVED="`pwd`" +cd "`dirname \"$PRG\"`/" >/dev/null +APP_HOME="`pwd -P`" +cd "$SAVED" >/dev/null + +APP_NAME="Gradle" +APP_BASE_NAME=`basename "$0"` + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS="" + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD="maximum" + +warn ( ) { + echo "$*" +} + +die ( ) { + echo + echo "$*" + echo + exit 1 +} + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +nonstop=false +case "`uname`" in + CYGWIN* ) + cygwin=true + ;; + Darwin* ) + darwin=true + ;; + MINGW* ) + msys=true + ;; + NONSTOP* ) + nonstop=true + ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + +# Determine the Java command to use to start the JVM. 
+if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD="java" + which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." +fi + +# Increase the maximum file descriptors if we can. +if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then + MAX_FD_LIMIT=`ulimit -H -n` + if [ $? -eq 0 ] ; then + if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then + MAX_FD="$MAX_FD_LIMIT" + fi + ulimit -n $MAX_FD + if [ $? -ne 0 ] ; then + warn "Could not set maximum file descriptor limit: $MAX_FD" + fi + else + warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" + fi +fi + +# For Darwin, add options to specify how the application appears in the dock +if $darwin; then + GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" +fi + +# For Cygwin, switch paths to Windows format before running java +if $cygwin ; then + APP_HOME=`cygpath --path --mixed "$APP_HOME"` + CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` + JAVACMD=`cygpath --unix "$JAVACMD"` + + # We build the pattern for arguments to be converted via cygpath + ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` + SEP="" + for dir in $ROOTDIRSRAW ; do + ROOTDIRS="$ROOTDIRS$SEP$dir" + SEP="|" + done + OURCYGPATTERN="(^($ROOTDIRS))" + # Add a user-defined pattern to the cygpath arguments + if [ "$GRADLE_CYGPATTERN" != "" ] ; then + OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" + fi + # Now convert the arguments - kludge to limit ourselves to /bin/sh + i=0 + for arg in "$@" ; do + CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` + CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option + + if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition + eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` + else + eval `echo args$i`="\"$arg\"" + fi + i=$((i+1)) + done + case $i in + (0) set -- ;; + (1) set -- "$args0" ;; + (2) set -- "$args0" "$args1" ;; + (3) set -- "$args0" "$args1" "$args2" ;; + (4) set -- "$args0" "$args1" "$args2" "$args3" ;; + (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; + (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; + (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; + (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; + (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; + esac +fi + +# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules +function splitJvmOpts() { + JVM_OPTS=("$@") +} +eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS +JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME" + +exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@" diff --git a/gradlew.bat b/gradlew.bat new file mode 100755 index 0000000..3153872 --- /dev/null +++ b/gradlew.bat @@ -0,0 +1,94 @@ +@rem Copyright 2021 
LinkedIn Corporation. All rights reserved. +@rem Licensed under the BSD-2 Clause license. +@rem See LICENSE in the project root for license information. + +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS= + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto init + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto init + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:init +@rem Get command-line arguments, handling Windows variants + +if not "%OS%" == "Windows_NT" goto win9xME_args +if "%@eval[2+2]" == "4" goto 4NT_args + +:win9xME_args +@rem Slurp the command line arguments. +set CMD_LINE_ARGS= +set _SKIP=2 + +:win9xME_args_slurp +if "x%~1" == "x" goto execute + +set CMD_LINE_ARGS=%* +goto execute + +:4NT_args +@rem Get arguments from the 4NT Shell from JP Software +set CMD_LINE_ARGS=%$ + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! +if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/maven-nexus/maven-install.sh b/maven-nexus/maven-install.sh new file mode 100755 index 0000000..2d1219f --- /dev/null +++ b/maven-nexus/maven-install.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Copyright 2021 LinkedIn Corporation. All rights reserved. +# Licensed under the BSD-2 Clause license. +# See LICENSE in the project root for license information. 
+ + +#group is overiden to support forked repositories + +function print_usage(){ + echo "maven-install.sh --version VERSION [--group GROUP]" +} + +for i in "$@" +do + case "$1" in + --version) + VERSION="$2" + shift + ;; + --group) + GROUP="$2" + shift + ;; + --help) + print_usage + exit 0 + ;; + *) + ;; + esac + shift +done + +if [ -z "$VERSION" ]; then + print_usage + exit +fi +echo VERSION=$VERSION + +if [ -z "$GROUP" ]; then + GROUP="org.apache.gobblin" +fi + + +./gradlew install -Dorg.gradle.parallel=false -Pversion=$VERSION -Pgroup=$GROUP diff --git a/maven-nexus/maven-nexus.sh b/maven-nexus/maven-nexus.sh new file mode 100755 index 0000000..677d0b3 --- /dev/null +++ b/maven-nexus/maven-nexus.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +# Copyright 2021 LinkedIn Corporation. All rights reserved. +# Licensed under the BSD-2 Clause license. +# See LICENSE in the project root for license information. + +script_dir=$(dirname $0) +script_name=$(basename $0) +GRADLE="$script_dir/../gradlew" + +function print_usage() { + echo -e "USAGE: $0 [-remote|-local] [-noclean] [gradle_args]" + echo + echo -e "Publishes signed maven artifacts locally ($HOME/.m2/repository) or remotely (Nexus)." + echo -e "\t-local Publish to local repository" + echo -e "\t-noclean Don't run gradlew clean (useful if re-running)" + echo -e "\t-remote Publish to Nexus repository" + echo -e "\t-packages a comma-separated list of gradle paths to publish (e.g. :gobblin-api,:gobblin-core)" + echo + echo -e "NOTES:" + echo -e "\t1. You need the Gobblin PGP key to sign the artifacts. If you don't have it," + echo -e "\t talk to another committer to get it. You also need to add the following to" + echo -e "\t your $HOME/.gradle/gradle.properties file:" + echo + echo -e "signing.keyId=" + echo -e "signing.password=" + echo -e "signing.secretKeyRingFile=$HOME/.gnupg/secring.gpg" + echo + echo -e "\t2. To upload remotely, you'll need a Nexus account. Visit " + echo -e "\t https://repository.apache.org/ to set it up. After" + echo -e "\t that add to your $HOME/.gradle/gradle.properties file:" + echo + echo -e "nexusUsername=" + echo -e "nexusPassword=" + echo + echo -e "\t3. Uploading remotely will upload only to the Nexus staging directory. " + echo -e "\t4. Don't forget to create a gobblin_ tag before publishing remotely!" + echo -e "\t5. Sometimes build with fail with an error" + echo -e "\t '... 
Failed to interpolate field: private java.lang.String ...'" + echo -e "\t Just re-run with -noclean" +} + +if [ "$#" -eq 0 ] ; then + print_usage + exit +fi + +install_target= +gradle_args= +noclean= +declare -a packages + +# Parse command line +while [ "$#" -gt 0 ] ; do + A="$1" + case "$A" in + -local) + install_target=install + ;; + -noclean) + noclean="1" + ;; + -remote) + install_target=uploadArchives + ;; + -h|--help|-help) + print_usage + exit + ;; + -packages) + shift + packages=( ${1//,/ } ) + ;; + *) + gradle_args="$gradle_args $A" + ;; + esac + shift +done + +if [ -z "${install_target}" ] ; then + echo "${script_name}: missing install target" + exit 1 +fi + +declare -a package_targets +for P in "${packages[@]}" ; do + ptarget="$P:${install_target}" + if [ "${ptarget:0:1}" != ":" ] ; then + ptarget=":$ptarget" + fi + package_targets+=( "$ptarget" ) +done + +if [ "${#packages[@]}" -gt 0 ] ; then + install_target="${package_targets[@]}" +fi + +if [ -z "$noclean" ] ; then + $GRADLE clean +fi +$GRADLE -PpublishToNexus -Porg.gradle.parallel=false -Porg.gradle.daemon=false -xtest $gradle_args $install_target 2>&1 | tee /tmp/${script_name}.out diff --git a/maven-nexus/upload-to-nexus.sh b/maven-nexus/upload-to-nexus.sh new file mode 100755 index 0000000..7994b81 --- /dev/null +++ b/maven-nexus/upload-to-nexus.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright 2021 LinkedIn Corporation. All rights reserved. +# Licensed under the BSD-2 Clause license. +# See LICENSE in the project root for license information. + +PROG=$(basename $0) + +function usage() { + echo -e "USAGE: $PROG" +} + +# main() + +if [ "$#" -eq 0 ] ; then + usage + exit +fi + +while [ "$#" -gt 0 ] ; do + A="$1" + case "$A" in + -h|--help) + usage + exit + ;; + *) + echo "$PROG: unknown option: $A" + exit 1 + ;; + esac + shift +done + +echo "CLEANING" +./gradlew clean + +upload_all=0 + +for P in :gobblin-admin :gobblin-api :gobblin-azkaban :gobblin-compaction :gobblin-config-management:gobblin-config-core :gobblin-config-management:gobblin-config-client :gobblin-core :gobblin-data-management :gobblin-distribution :gobblin-example :gobblin-metastore :gobblin-metrics :gobblin-oozie :gobblin-rest-service:gobblin-rest-api :gobblin-rest-service:gobblin-rest-client :gobblin-rest-service:gobblin-rest-server :gobblin-runtime :gobblin-salesforce :gobblin-scheduler :gobblin-test :gobblin-iceberg :gobblin-test-harness :gobblin-utility :gobblin-yarn ; do + echo "----------------------------" + if [ "$upload_all" -eq 0 ] ; then + read -p "UPLOAD $P [y(es)|n(o)|a(ll)]" ans + ans=$(echo "$ans" | tr '[:upper:]' '[:lower:]') + if [ "$ans" == "a" -o "$ans" == "all" ] ; then + upload_all=1 + ans="y" + fi + else + ans="y" + fi + if [ "$ans" == "y" -o "$ans" == "yes" ] ; then + ./maven-nexus/maven-nexus.sh -remote -noclean -packages $P + else + echo "Skipping $P" + fi + done + diff --git a/maven-sonatype/github-pr-change-log.py b/maven-sonatype/github-pr-change-log.py new file mode 100755 index 0000000..739d72f --- /dev/null +++ b/maven-sonatype/github-pr-change-log.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# pylint: disable=line-too-long + +# Copyright 2021 LinkedIn Corporation. All rights reserved. +# Licensed under the BSD-2 Clause license. +# See LICENSE in the project root for license information. 
+
+# Usage: Python script that takes in a range of Gobblin Pull Request numbers and outputs metadata about each Pull Request.
+#
+# An example output would look like:
+#
+# * [] [PR 902] Make it possible to specify empty job data publisher
+# * [] [PR 903] The underlying Avro CodecFactory only matches lowercase codecs, so we…
+# * [] [PR 904] Fixed precondition check for overwriting in datapublisher
+#
+# The output of this script is meant for the CHANGELOG file that is updated before each Gobblin release.
+# There is a pair of [] brackets at the beginning of each line which is meant to contain the project name the PR is related to.
+#
+# The script should be run as follows: "./github-pr-change-log.py [github-username] [github-password] [starting-pr-number] [ending-pr-number]"
+# For example, to produce the above output the command run was "./github-pr-change-log.py sahilTakiar [my-password] 900 905"
+
+import sys
+import requests
+
+for prNumber in range(int(sys.argv[3]), int(sys.argv[4])):
+    pr = requests.get("https://api.github.com/repos/linkedin/gobblin/pulls/" + str(prNumber), auth=(sys.argv[1], sys.argv[2])).json()
+    if "state" in pr.keys() and pr["state"] == "closed" and pr["merged"]:
+        print "* [] [PR " + str(pr["number"]) + "] " + pr["title"]
diff --git a/maven-sonatype/maven-install.sh b/maven-sonatype/maven-install.sh
new file mode 100755
index 0000000..e07d925
--- /dev/null
+++ b/maven-sonatype/maven-install.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Copyright 2021 LinkedIn Corporation. All rights reserved.
+# Licensed under the BSD-2 Clause license.
+# See LICENSE in the project root for license information.
+
+# group is overridden to support forked repositories
+
+function print_usage(){
+  echo "maven-install.sh --version VERSION [--group GROUP]"
+}
+
+for i in "$@"
+do
+  case "$1" in
+    --version)
+      VERSION="$2"
+      shift
+      ;;
+    --group)
+      GROUP="$2"
+      shift
+      ;;
+    --help)
+      print_usage
+      exit 0
+      ;;
+    *)
+      ;;
+  esac
+  shift
+done
+
+if [ -z "$VERSION" ]; then
+  print_usage
+  exit
+fi
+echo VERSION=$VERSION
+
+if [ -z "$GROUP" ]; then
+  GROUP="gobblin"
+fi
+
+
+./gradlew install -Dorg.gradle.parallel=false -Pversion=$VERSION -Pgroup=$GROUP
diff --git a/maven-sonatype/maven-sonatype.sh b/maven-sonatype/maven-sonatype.sh
new file mode 100755
index 0000000..233884e
--- /dev/null
+++ b/maven-sonatype/maven-sonatype.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+
+# Copyright 2021 LinkedIn Corporation. All rights reserved.
+# Licensed under the BSD-2 Clause license.
+# See LICENSE in the project root for license information.
+
+script_dir=$(dirname $0)
+script_name=$(basename $0)
+GRADLE="$script_dir/../gradlew"
+
+function print_usage() {
+    echo -e "USAGE: $0 [-remote|-local] [-noclean] [gradle_args]"
+    echo
+    echo -e "Publishes signed maven artifacts locally ($HOME/.m2/repository) or remotely (Sonatype)."
+    echo -e "\t-local Publish to local repository"
+    echo -e "\t-noclean Don't run gradlew clean (useful if re-running)"
+    echo -e "\t-remote Publish to Sonatype repository"
+    echo -e "\t-packages a comma-separated list of gradle paths to publish (e.g. :gobblin-api,:gobblin-core)"
+    echo
+    echo -e "NOTES:"
+    echo -e "\t1. You need the Gobblin PGP key to sign the artifacts. If you don't have it,"
+    echo -e "\t   talk to another committer to get it. You also need to add the following to"
+    echo -e "\t   your $HOME/.gradle/gradle.properties file:"
+    echo
+    echo -e "signing.keyId="
+    echo -e "signing.password="
+    echo -e "signing.secretKeyRingFile=$HOME/.gnupg/secring.gpg"
+    echo
+    echo -e "\t2. 
To upload remotely, you'll need a Sonatype account. Visit " + echo -e "\t https://issues.sonatype.org/secure/Signup!default.jspa to set it up. After" + echo -e "\t that add to your $HOME/.gradle/gradle.properties file:" + echo + echo -e "ossrhUsername=" + echo -e "ossrhPassword=" + echo + echo -e "\t3. Uploading remotely will upload only to the Sonatype staging directory. Follow " + echo -e " the steps at http://central.sonatype.org/pages/releasing-the-deployment.html to" + echo -e " synchronize with Maven Central." + echo -e "\t4. Don't forget to create a gobblin_ tag before publishing remotely!" + echo -e "\t5. Sometimes build with fail with an error" + echo -e "\t '... Failed to interpolate field: private java.lang.String ...'" + echo -e "\t Just re-run with -noclean" +} + +if [ "$#" -eq 0 ] ; then + print_usage + exit +fi + +install_target= +gradle_args= +noclean= +declare -a packages + +# Parse command line +while [ "$#" -gt 0 ] ; do + A="$1" + case "$A" in + -local) + install_target=install + ;; + -noclean) + noclean="1" + ;; + -remote) + install_target=uploadArchives + ;; + -h|--help|-help) + print_usage + exit + ;; + -packages) + shift + packages=( ${1//,/ } ) + ;; + *) + gradle_args="$gradle_args $A" + ;; + esac + shift +done + +if [ -z "${install_target}" ] ; then + echo "${script_name}: missing install target" + exit 1 +fi + +declare -a package_targets +for P in "${packages[@]}" ; do + ptarget="$P:${install_target}" + if [ "${ptarget:0:1}" != ":" ] ; then + ptarget=":$ptarget" + fi + package_targets+=( "$ptarget" ) +done + +if [ "${#packages[@]}" -gt 0 ] ; then + install_target="${package_targets[@]}" +fi + +if [ -z "$noclean" ] ; then + $GRADLE clean +fi +$GRADLE -PpublishToMaven -Porg.gradle.parallel=false -Porg.gradle.daemon=false -xtest $gradle_args $install_target 2>&1 | tee /tmp/${script_name}.out diff --git a/maven-sonatype/upload-to-sonatype.sh b/maven-sonatype/upload-to-sonatype.sh new file mode 100755 index 0000000..7f684c5 --- /dev/null +++ b/maven-sonatype/upload-to-sonatype.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright 2021 LinkedIn Corporation. All rights reserved. +# Licensed under the BSD-2 Clause license. +# See LICENSE in the project root for license information. 
+ +PROG=$(basename $0) + +function usage() { + echo -e "USAGE: $PROG" +} + +# main() + +if [ "$#" -eq 0 ] ; then + usage + exit +fi + +while [ "$#" -gt 0 ] ; do + A="$1" + case "$A" in + -h|--help) + usage + exit + ;; + *) + echo "$PROG: unknown option: $A" + exit 1 + ;; + esac + shift +done + +echo "CLEANING" +./gradlew clean + +upload_all=0 + +for P in :gobblin-admin :gobblin-api :gobblin-azkaban :gobblin-compaction :gobblin-config-management:gobblin-config-core :gobblin-config-management:gobblin-config-client :gobblin-core :gobblin-data-management :gobblin-distribution :gobblin-example :gobblin-metastore :gobblin-metrics :gobblin-oozie :gobblin-rest-service:gobblin-rest-api :gobblin-rest-service:gobblin-rest-client :gobblin-rest-service:gobblin-rest-server :gobblin-iceberg :gobblin-runtime :gobblin-salesforce :gobblin-scheduler :gobblin-test :gobblin-test-harness :gobblin-utility :gobblin-yarn ; do + echo "----------------------------" + if [ "$upload_all" -eq 0 ] ; then + read -p "UPLOAD $P [y(es)|n(o)|a(ll)]" ans + ans=$(echo "$ans" | tr '[:upper:]' '[:lower:]') + if [ "$ans" == "a" -o "$ans" == "all" ] ; then + upload_all=1 + ans="y" + fi + else + ans="y" + fi + if [ "$ans" == "y" -o "$ans" == "yes" ] ; then + ./maven-sonatype/maven-sonatype.sh -remote -noclean -packages $P + else + echo "Skipping $P" + fi + done + diff --git a/quality/findbugsExclude.xml b/quality/findbugsExclude.xml new file mode 100644 index 0000000..ca72a2e --- /dev/null +++ b/quality/findbugsExclude.xml @@ -0,0 +1,16 @@ + + + + + + + + diff --git a/query_github_issues.py b/query_github_issues.py new file mode 100755 index 0000000..4d10b7f --- /dev/null +++ b/query_github_issues.py @@ -0,0 +1,79 @@ +#!/bin/env /usr/bin/python2.6 +# pylint: disable=missing-docstring +# pylint: disable=unused-import +from datetime import date, timedelta +import json +import httplib +from string import Template +import sys +import urllib + +def search_issues(query_terms, sortBy=None, orderDir=None): + # pylint: disable=line-too-long + # pylint: disable=invalid-name + conn = httplib.HTTPSConnection("api.github.com") + params_map = {"q": "+".join(query_terms)} + if sortBy is not None: + params_map["sort"] = sortBy + if orderDir is not None: + params_map["order"] = orderDir + # Note we don't do urlencode because the output is not compatible with the q syntax, e.g. : and / should not be escaped + # params = urllib.urlencode(params_map) + params = "&".join(str(i[0]) + "=" + str(i[1]) for i in params_map.items()) + #print params + + headers = {"User-Agent": "Python App"} + + conn.request("GET", "/search/issues?" 
+ params, headers=headers) + response = conn.getresponse() + if response.status != httplib.OK: + sys.stderr.write("Query error: %s %s: %s" % (response.status, response.reason, response.read())) + sys.exit(1) + result = json.loads(response.read()) + conn.close() + return result + + +def get_created_issues_since(day): + # pylint: disable=line-too-long + return search_issues(query_terms=["repo:linkedin/gobblin", "is:open", "is:issue", "created:>=" + day], + sortBy="created", + orderDir="desc" + ) + +def get_created_issues_last_days(n=10): + # pylint: disable=invalid-name + since_day = (date.today() - timedelta(days=n)).strftime("%Y-%m-%d") + return get_created_issues_since(since_day) + +def simple_issue_list(issues): + # pylint: disable=invalid-name + HEADER_TEMPLATE = Template("$total_count issues found") + ISSUE_TEMPLATE = Template("""------------------ +ISSUE $number : HTML: $html_url JSON: $url +\tCREATED ON: $created_at +\tCREATED BY: $user_login ( $user_name ) $user_html_url +\tASSIGNED TO: $assignee_login ($assignee_name) $assignee_html_url +\tCOMMENTS: $comments +\tUPDATED ON: $updated_at +\tCLOSED ON: $closed_at + +$body +""") + print HEADER_TEMPLATE.substitute(issues) + for issue in issues["items"]: + user_data = issue["user"] + for user_attr in user_data: + issue["user_" + user_attr] = user_data[user_attr] + assignee_data = issue["assignee"] + for assignee_attr in assignee_data: + issue["assignee_" + assignee_attr] = assignee_data[assignee_attr] + print ISSUE_TEMPLATE.safe_substitute(issue) + +def main(argv): + # pylint: disable=unused-argument + issues = get_created_issues_last_days(7) + print simple_issue_list(issues) + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/settings.gradle b/settings.gradle new file mode 100644 index 0000000..8b02f26 --- /dev/null +++ b/settings.gradle @@ -0,0 +1,27 @@ +// Copyright 2021 LinkedIn Corporation. All rights reserved. +// Licensed under the BSD-2 Clause license. +// See LICENSE in the project root for license information. + +def modules = ['dil'] + +// Disable jacoco for now as Kafka 0.8 is the default version and jacoco does not like the same classes +// being declared in different modules +def jacocoBlacklist = new HashSet([ +]) + +modules.each { module -> + include "${module}" + file(module).eachDir { submodule -> + if (!submodule.name.startsWith('.') && !submodule.name.equals('src') && !submodule.name.equals('bin') + && !submodule.name.equals('test-output') && !submodule.name.equals('jobconf')) { + def submoduleId = "${module}:${submodule.name}" + if (System.getProperty('jacocoBuild') == null || !jacocoBlacklist.contains(submoduleId.toString())) { + include submoduleId + } + else { + println "Ignoring blacklisted module ${submoduleId}" + } + } + } +} + diff --git a/travis/bintrayDeploy.sh b/travis/bintrayDeploy.sh new file mode 100755 index 0000000..5b47c01 --- /dev/null +++ b/travis/bintrayDeploy.sh @@ -0,0 +1,26 @@ +# Copyright 2021 LinkedIn Corporation. All rights reserved. +# Licensed under the BSD-2 Clause license. +# See LICENSE in the project root for license information. + +# +# Test script used by Travis to test the +# hadoop1 or hadoop2 versions of gobblin. 
+# + +#!/bin/bash +set -e + +echo "Starting $0 at " $(date) +PROJECT_VERSION=$(./gradlew properties -q | grep "version:" | awk '{print $2}') + +echo "Project Version: $PROJECT_VERSION" +BUILD_VERSION=$PROJECT_VERSION-dev-${TRAVIS_BUILD_NUMBER} +echo "Build Version: $BUILD_VERSION" + +echo "Pull request: [$TRAVIS_PULL_REQUEST], Travis branch: [$TRAVIS_BRANCH]" +# release only from master when no pull request build +if [ "$TRAVIS_PULL_REQUEST" = "false" ] +then + echo "Uploading artifacts to bintray for version $BUILD_VERSION" + ./gradlew bintrayUpload -Pversion=$BUILD_VERSION -Pbintray.override +fi \ No newline at end of file diff --git a/travis/filter-to-failing-test-results.py b/travis/filter-to-failing-test-results.py new file mode 100644 index 0000000..8018600 --- /dev/null +++ b/travis/filter-to-failing-test-results.py @@ -0,0 +1,34 @@ +# Copyright 2021 LinkedIn Corporation. All rights reserved. +# Licensed under the BSD-2 Clause license. +# See LICENSE in the project root for license information. + +# +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Python script used to filter the list of junit XML test results +# to those that contain errors or failures. +# + +#!/usr/bin/python + +import sys +import fileinput +import xml.etree.ElementTree + +for line in fileinput.input(): + suite = xml.etree.ElementTree.parse(line.rstrip()).getroot() + errors = suite.get("errors") + failures = suite.get("failures") + if (errors is not None and int(errors) > 0) or (failures is not None and int(failures) > 0): + sys.stdout.write(line) + +sys.exit(0) \ No newline at end of file diff --git a/travis/junit-errors-to-stdout.sh b/travis/junit-errors-to-stdout.sh new file mode 100755 index 0000000..bb2cdaf --- /dev/null +++ b/travis/junit-errors-to-stdout.sh @@ -0,0 +1,31 @@ +# Copyright 2021 LinkedIn Corporation. All rights reserved. +# Licensed under the BSD-2 Clause license. +# See LICENSE in the project root for license information. + +# +# Script used by Travis builds upon a failure to format and +# print the failing test results to the console. +# + +#!/bin/bash +IFS=' +' +DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) +ROOTDIR="$1" +if [ -z "$ROOTDIR" ]; then + ROOTDIR="." +fi +echo 'Formatting results...' +FILES=$(find "$ROOTDIR" -path '*/build/*/test-results/*.xml' | python "$DIR/filter-to-failing-test-results.py") +if [ -n "$FILES" ]; then + for file in $FILES; do + echo "Formatting $file" + if [ -f "$file" ]; then + echo '=====================================================' + xsltproc "$DIR/junit-xml-format-errors.xsl" "$file" + fi + done + echo '=====================================================' +else + echo 'No */build/*/test-results/*.xml files found with failing tests.' 
+fi diff --git a/travis/junit-xml-format-errors.xsl b/travis/junit-xml-format-errors.xsl new file mode 100755 index 0000000..c4b0895 --- /dev/null +++ b/travis/junit-xml-format-errors.xsl @@ -0,0 +1,71 @@ + + + + + + + + + Testsuite: + +Tests run: + + , Failures: + + , Errors: + + , Time elapsed: + + sec + +--------- ----------- --------- + + + + + + + +Testcase: + + took + + + FAILURE + + ERROR + + SUCCESS + + + + + + + + + + + + + + + +------ Standard output ------ + + + + + + +------ Error output ------ + + + + + \ No newline at end of file diff --git a/travis/test-build.sh b/travis/test-build.sh new file mode 100755 index 0000000..f2aea4c --- /dev/null +++ b/travis/test-build.sh @@ -0,0 +1,14 @@ +# Copyright 2021 LinkedIn Corporation. All rights reserved. +# Licensed under the BSD-2 Clause license. +# See LICENSE in the project root for license information. + +# +# Build script used by Travis to clean and assemble the +# hadoop1 or hadoop2 versions of gobblin. +# + +#!/bin/bash +set -e + +echo "Starting $0 at " $(date) +time ./gradlew clean build -x test -x javadoc -Dorg.gradle.parallel=true $GOBBLIN_GRADLE_OPTS diff --git a/travis/test-coverage.sh b/travis/test-coverage.sh new file mode 100755 index 0000000..8e87989 --- /dev/null +++ b/travis/test-coverage.sh @@ -0,0 +1,18 @@ +# Copyright 2021 LinkedIn Corporation. All rights reserved. +# Licensed under the BSD-2 Clause license. +# See LICENSE in the project root for license information. + +# +# Test script used by Travis to test the +# hadoop1 or hadoop2 versions of gobblin. +# + +#!/bin/bash +set -e + +script_dir=$(dirname $0) + +source ${script_dir}/test-groups.inc + +echo "Starting $0 at " $(date) +time ./gradlew -PskipTestGroup=disabledOnCI -Dorg.gradle.parallel=false -DjacocoBuild=1 $GOBBLIN_GRADLE_OPTS jacocoTestCoverage diff --git a/travis/test-default.sh b/travis/test-default.sh new file mode 100755 index 0000000..3d91ff0 --- /dev/null +++ b/travis/test-default.sh @@ -0,0 +1,19 @@ +# Copyright 2021 LinkedIn Corporation. All rights reserved. +# Licensed under the BSD-2 Clause license. +# See LICENSE in the project root for license information. + +# +# Test script used by Travis to test the +# hadoop1 or hadoop2 versions of gobblin. +# + +#!/bin/bash +set -e + +script_dir=$(dirname $0) + +source ${script_dir}/test-groups.inc + +echo "Starting $0 at " $(date) +echo "GOBBLIN_GRADLE_OPTS=$GOBBLIN_GRADLE_OPTS" +time ./gradlew -PskipTestGroup=disabledOnCI,$TEST_GROUP1 -Dorg.gradle.parallel=false $GOBBLIN_GRADLE_OPTS test diff --git a/travis/test-group1.sh b/travis/test-group1.sh new file mode 100755 index 0000000..8591e6a --- /dev/null +++ b/travis/test-group1.sh @@ -0,0 +1,22 @@ +# Copyright 2021 LinkedIn Corporation. All rights reserved. +# Licensed under the BSD-2 Clause license. +# See LICENSE in the project root for license information. + +# +# Test script used by Travis to test the +# hadoop1 or hadoop2 versions of gobblin. 
+#
+
+#!/bin/bash
+set -e
+
+script_dir=$(dirname $0)
+
+source ${script_dir}/test-groups.inc
+
+echo "Starting $0 at " $(date)
+echo "Precompiling tests:"
+rm -rf $HOME/.gradle/caches/
+./gradlew compileTest -Porg.gradle.parallel=false $GOBBLIN_GRADLE_OPTS
+echo "Running tests for $TEST_GROUP1"
+time ./gradlew -PskipTestGroup=disabledOnCI -PrunTestGroups=$TEST_GROUP1 -Dorg.gradle.parallel=false $GOBBLIN_GRADLE_OPTS test
\ No newline at end of file
diff --git a/travis/test-groups.inc b/travis/test-groups.inc
new file mode 100644
index 0000000..8eef534
--- /dev/null
+++ b/travis/test-groups.inc
@@ -0,0 +1 @@
+TEST_GROUP1=gobblin.yarn,gobblin.runtime,gobblin.cluster,gobblin.compaction,gobblin.util,gobblin.writer
diff --git a/travis/test.sh b/travis/test.sh
new file mode 100755
index 0000000..fd557d0
--- /dev/null
+++ b/travis/test.sh
@@ -0,0 +1,33 @@
+# Copyright 2021 LinkedIn Corporation. All rights reserved.
+# Licensed under the BSD-2 Clause license.
+# See LICENSE in the project root for license information.
+
+#
+# Test script used by Travis to test the
+# hadoop1 or hadoop2 versions of gobblin.
+#
+
+#!/bin/bash
+set -e
+
+#free
+
+RUN_TEST_GROUP=${RUN_TEST_GROUP:-default}
+
+script_dir=$(dirname $0)
+echo "Old GRADLE_OPTS=$GRADLE_OPTS"
+
+export java_version=$(java -version 2>&1 | grep 'openjdk version' | sed -e 's/openjdk version "\(1\..\).*/\1/')
+
+echo "Using Java version:${java_version}"
+
+export GOBBLIN_GRADLE_OPTS="-Dorg.gradle.daemon=false -Dgobblin.metastore.testing.embeddedMysqlEnabled=false -PusePreinstalledMysql=true -PjdkVersion=${java_version}"
+
+TEST_SCRIPT=${script_dir}/test-${RUN_TEST_GROUP}.sh
+if [ -x $TEST_SCRIPT ] ; then
+  echo "Running test group $RUN_TEST_GROUP"
+  $TEST_SCRIPT "$@"
+else
+  echo "Test file $TEST_SCRIPT does not exist or is not executable!"
+  exit 1
+fi
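A quick usage sketch for the Travis test dispatcher above: travis/test.sh runs travis/test-${RUN_TEST_GROUP}.sh, so the group name must match one of the test-*.sh scripts added in this change (build, default, coverage, group1). Invoking it by hand outside of Travis, as shown here, is an assumption for illustration and not something the scripts themselves document.

    # assemble only, skipping tests (dispatches to travis/test-build.sh)
    RUN_TEST_GROUP=build ./travis/test.sh

    # run only the packages listed in travis/test-groups.inc (dispatches to travis/test-group1.sh)
    RUN_TEST_GROUP=group1 ./travis/test.sh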