Skip to content

Commit

Permalink
Add support for static deployments of autojoin data JOINs and UNIONs (#…
Browse files Browse the repository at this point in the history
…184)

* Separate script from library functions
* Add support for joining v2 autonode datatypes
* Deploy autonode v2 datatypes
* Use project parameter for views
* Add TODO to remove autojoin view deployments
  • Loading branch information
stephen-soltesz authored Sep 20, 2024
1 parent ad38690 commit 846d6b4
Show file tree
Hide file tree
Showing 6 changed files with 175 additions and 49 deletions.
10 changes: 10 additions & 0 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,13 @@ steps:
- PROJECT_IN=mlab-oti
args:
- /workspace/views/create_dataset_views.sh self $PROJECT_ID measurement-lab

# Deployments for v2 autoloader pipeline.
# TODO(soltesz): this remains manual and should be automatic. Remove this
# deployment once the autoloader supports automated creation of configured
# joins and unified views.
#
# Runs views/create_autojoin_dataset_views.sh with the key name "self" and the
# build's own project ($PROJECT_ID) as the source project.
# NOTE(review): PROJECT_IN presumably limits this step to builds running in one
# of the listed projects -- confirm against the gcloud-jsonnet-cbif image.
- name: gcr.io/$PROJECT_ID/gcloud-jsonnet-cbif:1.1
env:
- PROJECT_IN=mlab-sandbox,mlab-staging,mlab-autojoin
args:
- /workspace/views/create_autojoin_dataset_views.sh self $PROJECT_ID
53 changes: 53 additions & 0 deletions views/autoload_v2_ndt/ndt7_joined.template.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
--
-- ndt7_joined - joins the raw ndt7 and annotation2 autoloaded datasets with standard columns.
--
WITH prendt7 AS (
  -- NOTE: keep explanatory comments indented. create_view builds the view
  -- description from lines that begin with "--" in column 0, stopping at the
  -- first line that begins with SELECT in column 0.
  --
  -- Stage 1: flag each row as download and/or upload and pull the fields of
  -- the LAST server measurement (reverse the array, take offset 0). SAFE_OFFSET
  -- yields NULL instead of an error when the measurement array is empty.
  SELECT
    raw.Download IS NOT NULL AS isDownload,
    raw.Upload IS NOT NULL AS isUpload,
    ARRAY_REVERSE(raw.Download.ServerMeasurements)[SAFE_OFFSET(0)].BBRInfo IS NOT NULL AS isBBR,
    ARRAY_REVERSE(raw.Download.ServerMeasurements)[SAFE_OFFSET(0)].TCPInfo.BytesAcked AS downloadBytesAcked,
    ARRAY_REVERSE(raw.Download.ServerMeasurements)[SAFE_OFFSET(0)].TCPInfo.ElapsedTime AS downloadElapsedTime,
    ARRAY_REVERSE(raw.Download.ServerMeasurements)[SAFE_OFFSET(0)].TCPInfo.MinRTT AS downloadMinRTT,
    ARRAY_REVERSE(raw.Download.ServerMeasurements)[SAFE_OFFSET(0)].TCPInfo.BytesRetrans AS downloadBytesRetrans,
    ARRAY_REVERSE(raw.Download.ServerMeasurements)[SAFE_OFFSET(0)].TCPInfo.BytesSent AS downloadBytesSent,
    ARRAY_REVERSE(raw.Upload.ServerMeasurements)[SAFE_OFFSET(0)].TCPInfo.BytesReceived AS uploadBytesReceived,
    ARRAY_REVERSE(raw.Upload.ServerMeasurements)[SAFE_OFFSET(0)].TCPInfo.ElapsedTime AS uploadElapsedTime,
    ARRAY_REVERSE(raw.Upload.ServerMeasurements)[SAFE_OFFSET(0)].TCPInfo.MinRTT AS uploadMinRTT,
    *
  FROM `{{.ProjectID}}.autoload_v2_{{ORG}}_ndt.ndt7_raw`

), ndt7 AS (
  -- Stage 2: derive the join key and the summary record from the stage 1
  -- columns. Download values win when a row has both Download and Upload.
  SELECT
    -- Pick the download or upload UUID per row.
    IF(isDownload, raw.Download.UUID, IF(isUpload, raw.Upload.UUID, NULL)) AS id,
    -- Construct the summary 'a' record for compatibility with standard columns.
    STRUCT (
      IF(isDownload, raw.Download.UUID, IF(isUpload, raw.Upload.UUID, NULL)) AS UUID,
      IF(isDownload, raw.Download.StartTime, IF(isUpload, raw.Upload.StartTime, NULL)) AS TestTime,
      IF(isBBR, "bbr", "unknown") AS CongestionControl,
      -- SAFE_DIVIDE returns NULL rather than failing the whole query when a
      -- row reports a zero ElapsedTime.
      -- NOTE(review): assumes ElapsedTime is in microseconds, so that
      -- 8 * bytes / ElapsedTime is Mbit/s -- confirm against the ndt7 schema.
      8 * IF(isDownload, SAFE_DIVIDE(downloadBytesAcked, downloadElapsedTime),
             IF(isUpload, SAFE_DIVIDE(uploadBytesReceived, uploadElapsedTime), NULL)) AS MeanThroughputMbps,
      IF(isDownload, downloadMinRTT, IF(isUpload, uploadMinRTT, NULL)) / 1000 AS MinRTT, -- unit: ms
      -- Downloads with zero BytesSent yield a NULL LossRate instead of a
      -- division-by-zero error; uploads always report 0.
      IF(isDownload, SAFE_DIVIDE(downloadBytesRetrans, downloadBytesSent), IF(isUpload, 0, NULL)) AS LossRate
    ) AS a,
    *
  FROM prendt7
), ann2 AS (
  -- Annotations are keyed by the same UUID as the measurement rows.
  SELECT raw.UUID AS id, *
  FROM `{{.ProjectID}}.autoload_v2_{{ORG}}_ndt.annotation2_raw`
)

SELECT
  -- Standard column order.
  ndt7.id,
  ndt7.date,
  ndt7.archiver,
  ann2.raw.server,
  ann2.raw.client,
  ndt7.a,
  ndt7.raw
-- LEFT JOIN keeps measurements that have no matching annotation row; rows that
-- are neither download nor upload (NULL id) are dropped.
FROM ndt7 LEFT JOIN ann2
ON ndt7.id = ann2.id AND ndt7.date = ann2.date
WHERE ndt7.id IS NOT NULL

Empty file.
47 changes: 47 additions & 0 deletions views/create_autojoin_dataset_views.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/bash
#
# create_autojoin_dataset_views.sh creates all datasets and views for the
# autojoin pipeline. Like create_dataset_views.sh, every directory is a dataset
# name and every sql file within the dataset subdirectory should be a view query
# template.
#
# Example usage:
#
#  ./create_autojoin_dataset_views.sh "self" mlab-sandbox
#  ./create_autojoin_dataset_views.sh "self" mlab-oti

set -eu
USAGE="$0 <key-name> <source-project>"
# KEYNAME is not used directly here; create_view_init (from create_view_lib.sh)
# reads it to decide whether to activate a service account key.
KEYNAME=${1:?Please provide a key name to authorize operations or "self"}
SRC_PROJECT=${2:?Please provide source project: $USAGE}

# Setup environment. All relative paths below (and inside the library) are
# resolved from the directory that contains this script.
BASEDIR=$( realpath "$( dirname "${BASH_SOURCE[0]}" )" )
cd "${BASEDIR}"

# Initialize library.
source "${BASEDIR}/create_view_lib.sh"
create_view_init

# Generated template for the view that unions every per-org ndt7_joined view.
UNION_SQL=./autoload_v2_ndt/ndt7_union.sql

echo "Creating autojoin views"
# TODO(soltesz): eliminate this in favor of automation within the autoloader.
# Get list of orgs with ndt autoloaded data.
#
# The listing is captured on its own so that a failing `bq` aborts the script
# (set -e), while "no matching datasets" is NOT an error: the filter below always
# exits 0, unlike a grep pipeline which returns 1 on no match and would kill the
# script before the empty case is handled.
# NOTE(review): `bq ls` limits the number of results by default -- confirm
# whether --max_results is needed for projects with many datasets.
bq_listing=$( bq ls --project_id "${SRC_PROJECT}" )

# Keep only datasets named exactly autoload_v2_<org>_ndt. This is the name that
# create_org_joined_view derives from the org, so the UNION below only
# references views that were actually created. The shared autoload_v2_ndt
# dataset has no <org> part and therefore never matches.
datasets=$( awk '$1 ~ /^autoload_v2_.+_ndt$/ {print $1}' <<< "${bq_listing}" )

echo '-- Generated query' > "${UNION_SQL}"
joined_count=0
# Word splitting is intentional: one dataset id per line, no whitespace.
for ds in ${datasets} ; do
  # Strip the fixed prefix and suffix to recover the org (which may itself
  # contain underscores).
  org=${ds#autoload_v2_}
  org=${org%_ndt}
  create_org_joined_view "${SRC_PROJECT}" "${org}"
  if (( joined_count > 0 )) ; then
    # If there is already a SELECT statement in the union, append a "UNION ALL" before the next.
    echo 'UNION ALL' >> "${UNION_SQL}"
  fi
  echo 'SELECT * FROM `{{.ProjectID}}.'"${ds}"'.ndt7_joined`' >> "${UNION_SQL}"
  joined_count=$(( joined_count + 1 ))
done

# Only deploy view if it contains at least one SELECT.
if (( joined_count > 0 )) ; then
  # NOTE: Must create "ndt7_union" last because it references the views above.
  create_view "${SRC_PROJECT}" "${SRC_PROJECT}" autoload_v2_ndt "${UNION_SQL}"
fi

echo "All views created successfully"
53 changes: 4 additions & 49 deletions views/create_dataset_views.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,58 +19,13 @@ KEYNAME=${1:?Please provide a key name to authorize operations or "self"}
SRC_PROJECT=${2:?Please provide source project: $USAGE}
DST_PROJECT=${3:?Please provide destination project: $USAGE}

# Setup environment.
BASEDIR=$( realpath $( dirname "${BASH_SOURCE[0]}" ) )
cd ${BASEDIR}

# Git info is nominally exported from the caller
if [ -z "${TAG_NAME-}" -o -z "${COMMIT_SHA-}" ]; then
echo "Not Git"
export TAG_NAME="manual"
export COMMIT_SHA="undefined"
fi

if [[ "${KEYNAME}" != "self" ]] ; then
echo "${!KEYNAME}" > /tmp/sa.json
export GOOGLE_APPLICATION_CREDENTIALS=/tmp/sa.json
# Guarantee that `gcloud config get-value account` works as intended.
gcloud auth activate-service-account --key-file /tmp/sa.json
fi
# Extract service account user name.
USER=$( gcloud config get-value account )

BQ_CREATE_VIEW=bq_create_view
if [[ -x ${BASEDIR}/bq_create_view ]] ; then
BQ_CREATE_VIEW=${BASEDIR}/bq_create_view
fi

# create_view deploys a single BigQuery view from a SQL template by invoking
# ${BQ_CREATE_VIEW}.
#
# Globals read: TAG_NAME, COMMIT_SHA, USER, BQ_CREATE_VIEW.
# Arguments:
#   $1 - source project (passed as -src-project)
#   $2 - destination project of the view
#   $3 - destination dataset of the view
#   $4 - path to the SQL template; the view is named after the file
# Outputs: a "Creating ..." progress line on stdout.
# Returns: the exit status of ${BQ_CREATE_VIEW}.
function create_view() {
  local src_project=$1
  local dst_project=$2
  local dataset=$3
  local template=$4
  # Declared local so repeated calls do not leak state into the caller.
  local description
  local view

  # The description starts with the template's leading comment: every line
  # beginning with "--" (minus those two characters), up to the first line
  # that begins with SELECT.
  description=$(
    awk '/^--/ {print substr($0, 3)} /^SELECT/ {exit(0)}' "${template}" )
  description+=$'\n'$'\n'"Release tag: $TAG_NAME Commit: $COMMIT_SHA"
  description+=$'\n'"View of data from '${src_project}'."
  description+=$'\n'"Using: github.com/m-lab/..${template}"
  description+=$'\n'"On :$(date)"

  # Strip filename down to view name.
  # Note that _nofilter views are generated with .SQL~ suffix to prevent checkin
  view="${template%%.sql}"
  view="${view%%.SQL~}"
  view="${view##*/}"

  echo "Creating ${dst_project}.${dataset}.${view} using ${template}"

  "${BQ_CREATE_VIEW}" \
    -src-project "${src_project}" \
    -create-view "${dst_project}.${dataset}.${view}" \
    -template "${template}" \
    -description "${description}" \
    -editor "${USER}"
}
# Initialize library.
source ${BASEDIR}/create_view_lib.sh
create_view_init

# Build all views
# Upper level views always have src_project=dst_project=DST_PROJECT
Expand Down
61 changes: 61 additions & 0 deletions views/create_view_lib.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Library for creating dataset views.

# create_view_init prepares the globals that create_view relies on.
#
# Globals read:    KEYNAME, and the variable whose NAME is stored in KEYNAME.
# Globals written: TAG_NAME and COMMIT_SHA (only when either is missing),
#                  GOOGLE_APPLICATION_CREDENTIALS (only when KEYNAME is not
#                  "self"), USER, BQ_CREATE_VIEW.
# Side effects:    may write /tmp/sa.json and activate it with gcloud.
function create_view_init() {
  # Git info is nominally exported from the caller
  if [[ -z "${TAG_NAME-}" || -z "${COMMIT_SHA-}" ]] ; then
    echo "Not Git"
    export TAG_NAME="manual"
    export COMMIT_SHA="undefined"
  fi

  if [[ "${KEYNAME}" != "self" ]] ; then
    # KEYNAME holds the name of the variable containing the key, hence the
    # indirect expansion. The key is a secret: create the file under a
    # restrictive umask and force 0600 in case the file already existed, so
    # that other local users cannot read it.
    ( umask 077 ; printf '%s\n' "${!KEYNAME}" > /tmp/sa.json )
    chmod 600 /tmp/sa.json
    export GOOGLE_APPLICATION_CREDENTIALS=/tmp/sa.json
    # Guarantee that `gcloud config get-value account` works as intended.
    gcloud auth activate-service-account --key-file /tmp/sa.json
  fi
  # Extract service account user name.
  USER=$( gcloud config get-value account )

  # Prefer a bq_create_view binary in the current directory; otherwise fall
  # back to whatever is found on PATH.
  BQ_CREATE_VIEW=bq_create_view
  if [[ -x ./bq_create_view ]] ; then
    BQ_CREATE_VIEW=./bq_create_view
  fi
}

# create_view deploys a single BigQuery view from a SQL template by invoking
# ${BQ_CREATE_VIEW}.
#
# Globals read: TAG_NAME, COMMIT_SHA, USER, BQ_CREATE_VIEW.
# Arguments:
#   $1 - source project (passed as -src-project)
#   $2 - destination project of the view
#   $3 - destination dataset of the view
#   $4 - path to the SQL template; the view is named after the file
# Outputs: a "Creating ..." progress line on stdout.
# Returns: the exit status of ${BQ_CREATE_VIEW}.
function create_view() {
  local src_project=$1
  local dst_project=$2
  local dataset=$3
  local template=$4
  # Declared local so repeated calls do not leak state into the caller.
  local description
  local view

  # The description starts with the template's leading comment: every line
  # beginning with "--" (minus those two characters), up to the first line
  # that begins with SELECT.
  description=$(
    awk '/^--/ {print substr($0, 3)} /^SELECT/ {exit(0)}' "${template}" )
  description+=$'\n'$'\n'"Release tag: $TAG_NAME Commit: $COMMIT_SHA"
  description+=$'\n'"View of data from '${src_project}'."
  description+=$'\n'"Using: github.com/m-lab/..${template}"
  description+=$'\n'"On :$(date)"

  # Strip filename down to view name.
  # Note that _nofilter views are generated with .SQL~ suffix to prevent checkin
  view="${template%%.sql}"
  view="${view%%.SQL~}"
  view="${view##*/}"

  echo "Creating ${dst_project}.${dataset}.${view} using ${template}"

  "${BQ_CREATE_VIEW}" \
    -src-project "${src_project}" \
    -create-view "${dst_project}.${dataset}.${view}" \
    -template "${template}" \
    -description "${description}" \
    -editor "${USER}"
}

# create_org_joined_view renders the shared ndt7_joined template for one org
# and deploys the result as <project>.autoload_v2_<org>_ndt.ndt7_joined.
#
# Arguments:
#   $1 - project used as both source and destination of the view
#   $2 - org name substituted for every {{ORG}} placeholder in the template
# Side effects: creates ./autoload_v2_<org>_ndt/ndt7_joined.sql relative to the
#   current directory, then calls create_view on it.
function create_org_joined_view() {
  local project=$1
  local org=$2
  local dataset="autoload_v2_${org}_ndt"
  local rendered="${dataset}/ndt7_joined.sql"

  mkdir -p "${dataset}"
  # Only {{ORG}} is expanded here; {{.ProjectID}} is left in the rendered file.
  sed -e "s/{{ORG}}/${org}/g" autoload_v2_ndt/ndt7_joined.template.sql > "${rendered}"
  create_view "${project}" "${project}" "${dataset}" "./${rendered}"
}

0 comments on commit 846d6b4

Please sign in to comment.