Skip to content

Commit

Permalink
sync: sync dev/lm-eval with main branch (#271)
Browse files Browse the repository at this point in the history
* feat: Initial database support (#246)

* Initial database support

- Add status checking
- Add better storage flags
- Add spec.storage.format validation
- Add DDL
-Add HIBERNATE format to DB (test)
- Update service image
- Revert identifier to DATABASE
- Update CR options (remove mandatory data)

* Remove default DDL generation env var

* Update service image to latest tag

* Add migration awareness

* Add updating pods for migration

* Change JDBC url from mysql to mariadb

* Fix TLS mount

* Revert images

* Remove redundant logic

* Fix comments

* feat: Add TLS certificate mount on ModelMesh (#255)

* feat: Add TLS certificate mount on ModelMesh

* Revert from http to https until kserve/modelmesh#147 is merged

* Pin oc version, ubi version (#263)

* Restore checkout of trustyai-exp (#265)

* Add operator installation robustness (#266)

* fix: Skip InferenceService patching for KServe RawDeployment (#262)

* feat: ConfigMap key to disable KServe Serverless configuration (#267)

* feat: Add support for custom certificates in database connection (#259)

* Add TLS endpoint for ModelMesh payload processors. (#268)

Keep non-TLS endpoint for KServe Serverless (disabled by default)

---------

Signed-off-by: Yihong Wang <[email protected]>
Co-authored-by: Rui Vieira <[email protected]>
Co-authored-by: Rob Geada <[email protected]>
  • Loading branch information
3 people authored Aug 5, 2024
1 parent 2173aae commit 427d102
Show file tree
Hide file tree
Showing 10 changed files with 177 additions and 66 deletions.
7 changes: 7 additions & 0 deletions config/base/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,10 @@ vars:
apiVersion: v1
fieldref:
fieldpath: data.oauthProxyImage
- name: kServeServerless
objref:
kind: ConfigMap
name: config
apiVersion: v1
fieldref:
fieldpath: data.kServeServerless
1 change: 1 addition & 0 deletions config/base/params.env
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
trustyaiServiceImage=quay.io/trustyai/trustyai-service:latest
trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:latest
oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0
kServeServerless=disabled
38 changes: 38 additions & 0 deletions controllers/tas/config_maps.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,44 @@ func (r *TrustyAIServiceReconciler) getImageFromConfigMap(ctx context.Context, k
}
}

// getKServeServerlessConfig checks the kServeServerless value in a ConfigMap in the operator's namespace
func (r *TrustyAIServiceReconciler) getKServeServerlessConfig(ctx context.Context) (bool, error) {

if r.Namespace != "" {
// Define the key for the ConfigMap
configMapKey := types.NamespacedName{
Namespace: r.Namespace,
Name: constants.ConfigMap,
}

// Create an empty ConfigMap object
var cm corev1.ConfigMap

// Try to get the ConfigMap
if err := r.Get(ctx, configMapKey, &cm); err != nil {
if errors.IsNotFound(err) {
// ConfigMap not found, return false as the default behavior
return false, nil
}
// Other error occurred when trying to fetch the ConfigMap
return false, fmt.Errorf("error reading configmap %s", configMapKey)
}

// ConfigMap is found, extract the kServeServerless value
kServeServerless, ok := cm.Data[configMapkServeServerlessKey]

if !ok || kServeServerless != "enabled" {
// Key is missing or its value is not "enabled", return false
return false, nil
}

// kServeServerless is "enabled"
return true, nil
} else {
return false, nil
}
}

// getConfigMapNamesWithLabel retrieves the names of ConfigMaps that have the specified label
func (r *TrustyAIServiceReconciler) getConfigMapNamesWithLabel(ctx context.Context, namespace string, labelSelector client.MatchingLabels) ([]string, error) {
configMapList := &corev1.ConfigMapList{}
Expand Down
5 changes: 3 additions & 2 deletions controllers/tas/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,9 @@ const (

// Configuration constants
const (
configMapOAuthProxyImageKey = "oauthProxyImage"
configMapServiceImageKey = "trustyaiServiceImage"
configMapOAuthProxyImageKey = "oauthProxyImage"
configMapServiceImageKey = "trustyaiServiceImage"
configMapkServeServerlessKey = "kServeServerless"
)

// OAuth constants
Expand Down
16 changes: 15 additions & 1 deletion controllers/tas/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ type DeploymentConfig struct {
CustomCertificatesBundle CustomCertificatesBundle
Version string
BatchSize int
UseDBTLSCerts bool
}

// createDeploymentObject returns a Deployment for the TrustyAI Service instance
Expand Down Expand Up @@ -69,7 +70,20 @@ func (r *TrustyAIServiceReconciler) createDeploymentObject(ctx context.Context,
PVCClaimName: pvcName,
CustomCertificatesBundle: caBunble,
Version: constants.Version,
BatchSize: batchSize,
}

if instance.Spec.Storage.IsStorageDatabase() {
_, err := r.getSecret(ctx, instance.Name+"-db-tls", instance.Namespace)
if err != nil {
deploymentConfig.UseDBTLSCerts = false
log.FromContext(ctx).Error(err, "Using insecure database connection. Certificates "+instance.Name+"-db-tls not found")
} else {
deploymentConfig.UseDBTLSCerts = true
log.FromContext(ctx).Info("Using secure database connection with certificates " + instance.Name + "-db-tls")
}
} else {
deploymentConfig.UseDBTLSCerts = false
log.FromContext(ctx).Info("No need to check database secrets. Using PVC-mode.")
}

var deployment *appsv1.Deployment
Expand Down
36 changes: 28 additions & 8 deletions controllers/tas/inference_services.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ import (
"sigs.k8s.io/controller-runtime/pkg/log"
)

const (
DEPLOYMENT_MODE_MODELMESH = "ModelMesh"
DEPLOYMENT_MODE_RAW = "RawDeployment"
DEPLOYMENT_MODE_SERVERLESS = "Serverless"
)

// GetDeploymentsByLabel returns a list of Deployments that match a label key-value pair
func (r *TrustyAIServiceReconciler) GetDeploymentsByLabel(ctx context.Context, namespace string, labelKey string, labelValue string) ([]appsv1.Deployment, error) {
// Prepare a DeploymentList object
Expand Down Expand Up @@ -213,23 +219,37 @@ func (r *TrustyAIServiceReconciler) handleInferenceServices(ctx context.Context,
return false, err
}

kServeServerlessEnabled, err := r.getKServeServerlessConfig(ctx)
if err != nil {
log.FromContext(ctx).Error(err, "Could not read KServeServerless configuration. Defaulting to disabled")
kServeServerlessEnabled = false
}

if len(inferenceServices.Items) == 0 {
return true, nil
}

for _, infService := range inferenceServices.Items {
annotations := infService.GetAnnotations()
// Check the annotation "serving.kserve.io/deploymentMode: ModelMesh"
if val, ok := annotations["serving.kserve.io/deploymentMode"]; ok && val == "ModelMesh" {
shouldContinue, err := r.patchEnvVarsByLabelForDeployments(ctx, instance, namespace, labelKey, labelValue, envVarName, crName, remove)
if err != nil {
log.FromContext(ctx).Error(err, "Could not patch environment variables for ModelMesh deployments.")
return shouldContinue, err

// Check the annotation "serving.kserve.io/deploymentMode"
if val, ok := annotations["serving.kserve.io/deploymentMode"]; ok {
if val == DEPLOYMENT_MODE_RAW {
log.FromContext(ctx).Info("RawDeployment mode not supported by TrustyAI")
continue
} else if val == DEPLOYMENT_MODE_MODELMESH {
shouldContinue, err := r.patchEnvVarsByLabelForDeployments(ctx, instance, namespace, labelKey, labelValue, envVarName, crName, remove)
if err != nil {
log.FromContext(ctx).Error(err, "could not patch environment variables for ModelMesh deployments")
return shouldContinue, err
}
continue
}
} else {
}
if kServeServerlessEnabled {
err := r.patchKServe(ctx, instance, infService, namespace, crName, remove)
if err != nil {
log.FromContext(ctx).Error(err, "Could not path InferenceLogger for KServe deployment.")
log.FromContext(ctx).Error(err, "could not patch InferenceLogger for KServe deployment")
return false, err
}
}
Expand Down
38 changes: 23 additions & 15 deletions controllers/tas/secrets.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,34 +10,42 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
)

// getSecret retrieves a secret if it exists, returns an error if not
func (r *TrustyAIServiceReconciler) getSecret(ctx context.Context, name, namespace string) (*corev1.Secret, error) {
secret := &corev1.Secret{}
err := r.Get(ctx, client.ObjectKey{Name: name, Namespace: namespace}, secret)
if err != nil {
if errors.IsNotFound(err) {
return nil, fmt.Errorf("secret %s not found in namespace %s: %w", name, namespace, err)
}
return nil, fmt.Errorf("failed to get secret %s in namespace %s: %w", name, namespace, err)
}
return secret, nil
}

// findDatabaseSecret finds the DB configuration secret named (specified or default) in the same namespace as the CR
func (r *TrustyAIServiceReconciler) findDatabaseSecret(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (*corev1.Secret, error) {

databaseConfigurationsName := instance.Spec.Storage.DatabaseConfigurations
defaultDatabaseConfigurationsName := instance.Name + dbCredentialsSuffix

secret := &corev1.Secret{}

if databaseConfigurationsName != "" {
secret := &corev1.Secret{}
err := r.Get(ctx, client.ObjectKey{Name: databaseConfigurationsName, Namespace: instance.Namespace}, secret)
if err == nil {
return secret, nil
secret, err := r.getSecret(ctx, databaseConfigurationsName, instance.Namespace)
if err != nil {
return nil, err
}
if !errors.IsNotFound(err) {
return nil, fmt.Errorf("failed to get secret %s in namespace %s: %w", databaseConfigurationsName, instance.Namespace, err)
if secret != nil {
return secret, nil
}
} else {
// If specified not found, try the default

err := r.Get(ctx, client.ObjectKey{Name: defaultDatabaseConfigurationsName, Namespace: instance.Namespace}, secret)
if err == nil {
return secret, nil
secret, err := r.getSecret(ctx, defaultDatabaseConfigurationsName, instance.Namespace)
if err != nil {
return nil, err
}
if !errors.IsNotFound(err) {
return nil, fmt.Errorf("failed to get secret %s in namespace %s: %w", defaultDatabaseConfigurationsName, instance.Namespace, err)
if secret != nil {
return secret, nil
}

}

return nil, fmt.Errorf("neither secret %s nor %s found in namespace %s", databaseConfigurationsName, defaultDatabaseConfigurationsName, instance.Namespace)
Expand Down
17 changes: 16 additions & 1 deletion controllers/tas/templates/service/deployment.tmpl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,11 @@ spec:
name: {{ .Instance.Spec.Storage.DatabaseConfigurations }}
key: databasePort
- name: QUARKUS_DATASOURCE_JDBC_URL
{{ if .UseDBTLSCerts }}
value: "jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/trustyai_database?sslMode=verify-ca&serverSslCert=/etc/tls/db/tls.crt"
{{ else }}
value: "jdbc:${QUARKUS_DATASOURCE_DB_KIND}://${DATABASE_SERVICE}:${DATABASE_PORT}/trustyai_database"
{{ end }}
- name: SERVICE_DATA_FORMAT
value: "HIBERNATE"
- name: QUARKUS_DATASOURCE_GENERATION
Expand All @@ -121,7 +125,12 @@ spec:
- name: {{ .VolumeMountName }}
mountPath: {{ .Instance.Spec.Storage.Folder }}
readOnly: false
{{ end }}
{{ end }}
{{ if .UseDBTLSCerts }}
- name: db-tls-certs
mountPath: /etc/tls/db
readOnly: true
{{ end }}
- resources:
limits:
cpu: 100m
Expand Down Expand Up @@ -209,3 +218,9 @@ spec:
secret:
secretName: {{ .Instance.Name }}-internal
defaultMode: 420
{{ if .UseDBTLSCerts }}
- name: db-tls-certs
secret:
secretName: {{ .Instance.Name }}-db-tls
defaultMode: 420
{{ end }}
22 changes: 4 additions & 18 deletions tests/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
FROM registry.access.redhat.com/ubi8:8.10-901.1716497712
FROM registry.access.redhat.com/ubi8:8.10-1020

ARG ORG=trustyai-explainability
ARG BRANCH=main
ARG ODS_CI_REPO=https://github.com/red-hat-data-services/ods-ci
# This git reference should always reference a stable commit from ods-ci that supports ODH
# This hash corresponds to a March 24th, 2023 commit
ARG ODS_CI_GITREF=867a617bc224726cf98fa3354293f8e50b4f5eb5
ARG OC_CLI_URL=https://mirror.openshift.com/pub/openshift-v4/amd64/clients/ocp/latest/openshift-client-linux.tar.gz
ARG ODS_CI_GITREF=a8cf770b37caa4ef7ce6596acc8bdd6866cc7772
ARG OC_CLI_URL=https://mirror.openshift.com/pub/openshift-v4/amd64/clients/ocp/4.14.33/openshift-client-linux.tar.gz

ENV HOME /root
WORKDIR /root

RUN dnf -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm &&\
dnf install -y jq bc git go-toolset python3.11 python3.11-pip python3.11-devel unzip && \
RUN dnf install -y jq bc git go-toolset python3.11 python3.11-devel python3.11-pip unzip && \
dnf clean all && \
git clone https://github.com/opendatahub-io/peak $HOME/peak && \
cd $HOME/peak && \
Expand All @@ -25,9 +24,6 @@ RUN curl -L https://github.com/mikefarah/yq/releases/download/v4.25.1/yq_linux_a
RUN mkdir -p $HOME/src && \
cd $HOME/src && \
git clone --depth=1 --branch ${BRANCH} https://github.com/${ORG}/trustyai-explainability && \
# Clone ods-ci repo at specified git ref for the ODH Dashboard webUI tests
git clone --depth=1 ${ODS_CI_REPO} ods-ci && cd ods-ci && \
git fetch origin ${ODS_CI_GITREF} && git checkout FETCH_HEAD && \
chmod -R 777 $HOME/src

# Use a specific destination file name in case the url download name changes
Expand All @@ -37,16 +33,6 @@ RUN tar -C /usr/local/bin -xvf $HOME/peak/oc-cli.tar.gz && \

COPY Pipfile Pipfile.lock $HOME/peak/

RUN pip3 install micropipenv &&\
ln -s `which pip3` /usr/bin/pip &&\
cd $HOME/peak &&\
micropipenv install

# Install poetry to support the exeuction of ods-ci test framework
RUN curl -sSL https://install.python-poetry.org | python3 -
ENV PATH="${PATH}:$HOME/.local/bin"
RUN cd $HOME/src/ods-ci && poetry install

## Grab CI scripts from single-source-of-truth
RUN mkdir -p $HOME/peak/operator-tests/trustyai-explainability/ &&\
mkdir $HOME/kfdef/ &&\
Expand Down
63 changes: 42 additions & 21 deletions tests/scripts/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,44 @@ if ! [ -z "${SKIP_OPERATOR_INSTALL}" ]; then
./setup.sh -t ~/peak/operatorsetup 2>&1
else
echo "Installing operator from community marketplace"
while [[ $retry -gt 0 ]]; do

# patch bug in peak setup script
sed -i "s/path=\"{.status.channels.*/ | jq '.status.channels | .[0].currentCSVDesc.installModes | map(select(.type == \"AllNamespaces\")) | .[0].supported')/" setup.sh
sed -i "s/csource=.*/echo \$3; csource=\$3/" setup.sh
sed -i 's/installop \$.*/installop \${vals[0]} \${vals[1]} \${vals[3]}/' setup.sh
start_t=$(date +%s) 2>&1
ready=false 2>&1
while ! $ready; do
CATALOG_SOURCES=$(oc get catalogsources -n openshift-marketplace 2> /dev/null | grep 'community-operators')
if [ ! -z "${CATALOG_SOURCES}" ]; then
echo $CATALOG_SOURCES
ready=true 2>&1
else
sleep 10
fi
if [ $(($(date +%s)-start_t)) -gt 300 ]; then
echo "Marketplace pods never started"
exit 1
fi
done

start_t=$(date +%s) 2>&1
ready=false 2>&1
while ! $ready; do
MANIFESTS=$(oc get packagemanifests -n openshift-marketplace 2> /dev/null | grep 'opendatahub')
echo $MANIFESTS
if [ ! -z "${MANIFESTS}" ]; then
echo $MANIFESTS
ready=true 2>&1
else
sleep 10
fi
if [ $(($(date +%s)-start_t)) -gt 900 ]; then
echo "Package manifests never downloaded"
exit 1
fi
done

while [[ $retry -gt 0 ]]; do
./setup.sh -o ~/peak/operatorsetup\

./setup.sh -o ~/peak/operatorsetup
# approve installplans
if [ $? -eq 0 ]; then
retry=-1
else
Expand All @@ -31,11 +61,16 @@ else
fi
retry=$(( retry - 1))

sleep 30
echo "Approving Install Plans, if needed"
oc patch installplan $(oc get installplan -n openshift-operators | grep $ODH_VERSION | awk '{print $1}') -n openshift-operators --type merge --patch '{"spec":{"approved":true}}' || true
oc patch installplan $(oc get installplan -n openshift-operators | grep authorino | awk '{print $1}') -n openshift-operators --type merge --patch '{"spec":{"approved":true}}' || true

finished=false 2>&1
start_t=$(date +%s) 2>&1
echo "Verifying installation of ODH operator"
while ! $finished; do
if [ ! -z "$(oc get pods -n openshift-operators | grep 'opendatahub-operator-controller-manager' | grep '1/1')" ]; then
if [ ! -z "$(oc get pods -n openshift-operators | grep 'opendatahub-operator-controller-manager' | grep '1/1')" ]; then
finished=true 2>&1
else
sleep 10
Expand All @@ -50,20 +85,6 @@ else
done
fi

#popd
### Grabbing and applying the patch in the PR we are testing
#pushd ~/src/${REPO_NAME}
#if [ -z "$PULL_NUMBER" ]; then
# echo "No pull number, assuming nightly run"
#else
# if [ $REPO_OWNER == "trustyai-explainability" ]; then
# curl -O -L https://github.com/${REPO_OWNER}/${REPO_NAME}/pull/${PULL_NUMBER}.patch
# echo "Applying followng patch:"
# cat ${PULL_NUMBER}.patch > ${ARTIFACT_DIR}/github-pr-${PULL_NUMBER}.patch
# git apply ${PULL_NUMBER}.patch
# fi
#fi

popd
## Point manifests repo uri in the KFDEF to the manifests in the PR
pushd ~/kfdef
Expand Down

0 comments on commit 427d102

Please sign in to comment.