diff --git a/data/aistack/aistack_gpu_values.yaml b/data/aistack/aistack_gpu_values.yaml
new file mode 100644
index 000000000000..bc7994c029cd
--- /dev/null
+++ b/data/aistack/aistack_gpu_values.yaml
@@ -0,0 +1,194 @@
+global:
+  tls:
+    # options: suse-private-ai, letsEncrypt, secret
+    source: suse-private-ai
+    issuerName: suse-private-ai
+
+    # This section to be filled out when using letsEncrypt as the tls source
+    letsEncrypt:
+      environment: staging
+      email: yarunachalam@suse.com
+  ingress:
+    class: "nginx"
+ollama:
+  ollama:
+    gpu:
+      enabled: true
+      type: 'nvidia'
+      number: 1
+  ingress:
+    enabled: false
+open-webui:
+  ollamaUrls:
+    - http://suse-private-ai-ollama.suse-private-ai.svc.cluster.local:11434
+  persistence:
+    enabled: true
+    storageClass: local-path
+  ollama:
+    enabled: false
+  pipelines:
+    enabled: false
+    persistence:
+      storageClass: local-path
+  ingress:
+    enabled: true
+    class: ""
+    annotations:
+      nginx.ingress.kubernetes.io/ssl-redirect: "true"
+      nginx.ingress.kubernetes.io/ssl-services: "open-webui"
+    host: suse-ollama-webui
+    tls: true
+    existingSecret: suse-private-ai-tls
+  extraEnvVars:
+    - name: DEFAULT_MODELS
+      value: "gemma:2b"
+    - name: DEFAULT_USER_ROLE
+      value: "user"
+    - name: WEBUI_NAME
+      value: "SUSE AI"
+    - name: GLOBAL_LOG_LEVEL
+      value: INFO
+    - name: VECTOR_DB
+      value: "milvus"
+    - name: MILVUS_URI
+      value: http://suse-private-ai-milvus.suse-private-ai.svc.cluster.local:19530
+milvus:
+  enabled: True
+  cluster:
+    enabled: True
+  standalone:
+    persistence:
+      persistentVolumeClaim:
+        storageClass: local-path
+  etcd:
+    replicaCount: 1
+    persistence:
+      storageClassName: local-path
+  minio:
+    mode: distributed
+    replicas: 4
+    rootUser: "admin"
+    rootPassword: "adminminio"
+    persistence:
+      storageClass: local-path
+    resources:
+      requests:
+        memory: 1024Mi
+  pulsar:
+    enabled: True
+    affinity:
+      anti_affinity: false
+    autorecovery:
+      resources:
+        requests:
+          cpu: 0.1
+          memory: 256Mi
+    proxy:
+      replicaCount: 1
+      resources:
+        requests:
+          cpu: 0.2
+          memory: 256Mi
+      configData:
+        PULSAR_MEM: >
+          -Xms256m -Xmx256m
+        PULSAR_GC: >
+          -XX:MaxDirectMemorySize=256m
+    bookkeeper:
+      replicaCount: 2
+      volumes:
+        journal:
+          local_storage: false
+          storageClassName: local-path
+        ledgers:
+          local_storage: false
+          storageClassName: local-path
+        common:
+          local_storage: false
+          storageClassName: local-path
+      resources:
+        requests:
+          cpu: 0.2
+          memory: 512Mi
+      configData:
+        PULSAR_MEM: >
+          -Xms512m -Xmx512m
+        PULSAR_GC: >
+          -XX:MaxDirectMemorySize=512m
+          -Dio.netty.leakDetectionLevel=disabled
+          -Dio.netty.recycler.linkCapacity=1024
+          -XX:+UseG1GC -XX:MaxGCPauseMillis=10
+          -XX:+ParallelRefProcEnabled
+          -XX:+UnlockExperimentalVMOptions
+          -XX:+DoEscapeAnalysis -XX:ParallelGCThreads=32
+          -XX:ConcGCThreads=32 -XX:G1NewSizePercent=50
+          -XX:+DisableExplicitGC
+          -XX:-ResizePLAB
+          -XX:+ExitOnOutOfMemoryError
+          -XX:+PerfDisableSharedMem
+          -XX:+PrintGCDetails
+    zookeeper:
+      replicaCount: 1
+      volumes:
+        data:
+          storageClassName: local-path
+      resources:
+        requests:
+          cpu: 0.1
+          memory: 256Mi
+      configData:
+        PULSAR_MEM: >
+          -Xms256m
+          -Xmx256m
+        PULSAR_GC: >
+          -Dcom.sun.management.jmxremote
+          -Djute.maxbuffer=10485760
+          -XX:+ParallelRefProcEnabled
+          -XX:+UnlockExperimentalVMOptions
+          -XX:+DoEscapeAnalysis -XX:+DisableExplicitGC
+          -XX:+PerfDisableSharedMem
+          -Dzookeeper.forceSync=no
+    broker:
+      replicaCount: 2
+      volumes:
+        data:
+          storageClassName: local-path
+      resources:
+        requests:
+          cpu: 0.2
+          memory: 512Mi
+      configData:
+        PULSAR_MEM: >
+          -Xms512m
+          -Xmx512m
+        PULSAR_GC: >
+          -XX:MaxDirectMemorySize=512m
+          -Dio.netty.leakDetectionLevel=disabled
+          -Dio.netty.recycler.linkCapacity=1024
+          -XX:+ParallelRefProcEnabled
+          -XX:+UnlockExperimentalVMOptions
+          -XX:+DoEscapeAnalysis
+          -XX:ParallelGCThreads=32
+          -XX:ConcGCThreads=32
+          -XX:G1NewSizePercent=50
+          -XX:+DisableExplicitGC
+          -XX:-ResizePLAB
+          -XX:+ExitOnOutOfMemoryError
+    autorecovery:
+      resources:
+        requests:
+          memory: 512Mi
+          cpu: 1
+  #indexNode:
+  #  resources:
+  #    requests:
+  #      nvidia.com/gpu: "1"
+  #    limits:
+  #      nvidia.com/gpu: "1"
+  #queryNode:
+  #  resources:
+  #    requests:
+  #      nvidia.com/gpu: "1"
+  #    limits:
+  #      nvidia.com/gpu: "1"
diff --git a/data/aistack/aistack_values.yaml b/data/aistack/aistack_values.yaml
new file mode 100644
index 000000000000..f40683507cc6
--- /dev/null
+++ b/data/aistack/aistack_values.yaml
@@ -0,0 +1,192 @@
+global:
+  tls:
+    # options: suse-private-ai, letsEncrypt, secret
+    source: suse-private-ai
+    issuerName: suse-private-ai
+
+    # This section to be filled out when using letsEncrypt as the tls source
+    letsEncrypt:
+      environment: staging
+      email: yarunachalam@suse.com
+  ingress:
+    class: "nginx"
+ollama:
+  ollama:
+    gpu:
+      enabled: false
+  ingress:
+    enabled: false
+open-webui:
+  ollamaUrls:
+    - http://suse-private-ai-ollama.suse-private-ai.svc.cluster.local:11434
+  persistence:
+    enabled: true
+    storageClass: local-path
+  ollama:
+    enabled: false
+  pipelines:
+    enabled: false
+    persistence:
+      storageClass: local-path
+  ingress:
+    enabled: true
+    class: ""
+    annotations:
+      nginx.ingress.kubernetes.io/ssl-redirect: "true"
+      nginx.ingress.kubernetes.io/ssl-services: "open-webui"
+    host: suse-ollama-webui
+    tls: true
+    existingSecret: suse-private-ai-tls
+  extraEnvVars:
+    - name: DEFAULT_MODELS
+      value: "gemma:2b"
+    - name: DEFAULT_USER_ROLE
+      value: "user"
+    - name: WEBUI_NAME
+      value: "SUSE AI"
+    - name: GLOBAL_LOG_LEVEL
+      value: INFO
+    - name: VECTOR_DB
+      value: "milvus"
+    - name: MILVUS_URI
+      value: http://suse-private-ai-milvus.suse-private-ai.svc.cluster.local:19530
+milvus:
+  enabled: True
+  cluster:
+    enabled: True
+  standalone:
+    persistence:
+      persistentVolumeClaim:
+        storageClass: local-path
+  etcd:
+    replicaCount: 1
+    persistence:
+      storageClassName: local-path
+  minio:
+    mode: distributed
+    replicas: 4
+    rootUser: "admin"
+    rootPassword: "adminminio"
+    persistence:
+      storageClass: local-path
+    resources:
+      requests:
+        memory: 1024Mi
+  pulsar:
+    enabled: True
+    affinity:
+      anti_affinity: false
+    autorecovery:
+      resources:
+        requests:
+          cpu: 0.1
+          memory: 256Mi
+    proxy:
+      replicaCount: 1
+      resources:
+        requests:
+          cpu: 0.2
+          memory: 256Mi
+      configData:
+        PULSAR_MEM: >
+          -Xms256m -Xmx256m
+        PULSAR_GC: >
+          -XX:MaxDirectMemorySize=256m
+    bookkeeper:
+      replicaCount: 2
+      volumes:
+        journal:
+          local_storage: false
+          storageClassName: local-path
+        ledgers:
+          local_storage: false
+          storageClassName: local-path
+        common:
+          local_storage: false
+          storageClassName: local-path
+      resources:
+        requests:
+          cpu: 0.2
+          memory: 512Mi
+      configData:
+        PULSAR_MEM: >
+          -Xms512m -Xmx512m
+        PULSAR_GC: >
+          -XX:MaxDirectMemorySize=512m
+          -Dio.netty.leakDetectionLevel=disabled
+          -Dio.netty.recycler.linkCapacity=1024
+          -XX:+UseG1GC -XX:MaxGCPauseMillis=10
+          -XX:+ParallelRefProcEnabled
+          -XX:+UnlockExperimentalVMOptions
+          -XX:+DoEscapeAnalysis -XX:ParallelGCThreads=32
+          -XX:ConcGCThreads=32 -XX:G1NewSizePercent=50
+          -XX:+DisableExplicitGC
+          -XX:-ResizePLAB
+          -XX:+ExitOnOutOfMemoryError
+          -XX:+PerfDisableSharedMem
+          -XX:+PrintGCDetails
+    zookeeper:
+      replicaCount: 1
+      volumes:
+        data:
+          storageClassName: local-path
+      resources:
+        requests:
+          cpu: 0.1
+          memory: 256Mi
+      configData:
+        PULSAR_MEM: >
+          -Xms256m
+          -Xmx256m
+        PULSAR_GC: >
+          -Dcom.sun.management.jmxremote
+          -Djute.maxbuffer=10485760
+          -XX:+ParallelRefProcEnabled
+          -XX:+UnlockExperimentalVMOptions
+          -XX:+DoEscapeAnalysis -XX:+DisableExplicitGC
+          -XX:+PerfDisableSharedMem
+          -Dzookeeper.forceSync=no
+    broker:
+      replicaCount: 2
+      volumes:
+        data:
+          storageClassName: local-path
+      resources:
+        requests:
+          cpu: 0.2
+          memory: 512Mi
+      configData:
+        PULSAR_MEM: >
+          -Xms512m
+          -Xmx512m
+        PULSAR_GC: >
+          -XX:MaxDirectMemorySize=512m
+          -Dio.netty.leakDetectionLevel=disabled
+          -Dio.netty.recycler.linkCapacity=1024
+          -XX:+ParallelRefProcEnabled
+          -XX:+UnlockExperimentalVMOptions
+          -XX:+DoEscapeAnalysis
+          -XX:ParallelGCThreads=32
+          -XX:ConcGCThreads=32
+          -XX:G1NewSizePercent=50
+          -XX:+DisableExplicitGC
+          -XX:-ResizePLAB
+          -XX:+ExitOnOutOfMemoryError
+    autorecovery:
+      resources:
+        requests:
+          memory: 512Mi
+          cpu: 1
+  #indexNode:
+  #  resources:
+  #    requests:
+  #      nvidia.com/gpu: "1"
+  #    limits:
+  #      nvidia.com/gpu: "1"
+  #queryNode:
+  #  resources:
+  #    requests:
+  #      nvidia.com/gpu: "1"
+  #    limits:
+  #      nvidia.com/gpu: "1"
diff --git a/data/aistack/local_path_storage.yaml b/data/aistack/local_path_storage.yaml
new file mode 100644
index 000000000000..a66db5ff0bda
--- /dev/null
+++ b/data/aistack/local_path_storage.yaml
@@ -0,0 +1,161 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: local-path-storage
+
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: local-path-provisioner-service-account
+  namespace: local-path-storage
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: local-path-provisioner-role
+  namespace: local-path-storage
+rules:
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["get", "list", "watch", "create", "patch", "update", "delete"]
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: local-path-provisioner-role
+rules:
+  - apiGroups: [""]
+    resources: ["nodes", "persistentvolumeclaims", "configmaps", "pods", "pods/log"]
+    verbs: ["get", "list", "watch"]
+  - apiGroups: [""]
+    resources: ["persistentvolumes"]
+    verbs: ["get", "list", "watch", "create", "patch", "update", "delete"]
+  - apiGroups: [""]
+    resources: ["events"]
+    verbs: ["create", "patch"]
+  - apiGroups: ["storage.k8s.io"]
+    resources: ["storageclasses"]
+    verbs: ["get", "list", "watch"]
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: local-path-provisioner-bind
+  namespace: local-path-storage
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: local-path-provisioner-role
+subjects:
+  - kind: ServiceAccount
+    name: local-path-provisioner-service-account
+    namespace: local-path-storage
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: local-path-provisioner-bind
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: local-path-provisioner-role
+subjects:
+  - kind: ServiceAccount
+    name: local-path-provisioner-service-account
+    namespace: local-path-storage
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: local-path-provisioner
+  namespace: local-path-storage
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: local-path-provisioner
+  template:
+    metadata:
+      labels:
+        app: local-path-provisioner
+    spec:
+      serviceAccountName: local-path-provisioner-service-account
+      containers:
+        - name: local-path-provisioner
+          image: rancher/local-path-provisioner:v0.0.28
+          imagePullPolicy: IfNotPresent
+          command:
+            - local-path-provisioner
+            - --debug
+            - start
+            - --config
+            - /etc/config/config.json
+          volumeMounts:
+            - name: config-volume
+              mountPath: /etc/config/
+          env:
+            - name: POD_NAMESPACE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+            - name: CONFIG_MOUNT_PATH
+              value: /etc/config/
+      volumes:
+        - name: config-volume
+          configMap:
+            name: local-path-config
+
+---
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: local-path
+provisioner: rancher.io/local-path
+volumeBindingMode: WaitForFirstConsumer
+reclaimPolicy: Delete
+
+---
+kind: ConfigMap
+apiVersion: v1
+metadata:
+  name: local-path-config
+  namespace: local-path-storage
+data:
+  config.json: |-
+    {
+      "nodePathMap":[
+        {
+          "node":"DEFAULT_PATH_FOR_NON_LISTED_NODES",
+          "paths":["/opt/local-path-provisioner"]
+        }
+      ]
+    }
+  setup: |-
+    #!/bin/sh
+    set -eu
+    mkdir -m 0777 -p "$VOL_DIR"
+  teardown: |-
+    #!/bin/sh
+    set -eu
+    rm -rf "$VOL_DIR"
+  helperPod.yaml: |-
+    apiVersion: v1
+    kind: Pod
+    metadata:
+      name: helper-pod
+    spec:
+      priorityClassName: system-node-critical
+      tolerations:
+        - key: node.kubernetes.io/disk-pressure
+          operator: Exists
+          effect: NoSchedule
+      containers:
+        - name: helper-pod
+          image: busybox
+          imagePullPolicy: IfNotPresent
diff --git a/data/aistack/nvidia_gpu_values.yaml b/data/aistack/nvidia_gpu_values.yaml
new file mode 100644
index 000000000000..753050501c72
--- /dev/null
+++ b/data/aistack/nvidia_gpu_values.yaml
@@ -0,0 +1,10 @@
+toolkit:
+  env:
+    - name: CONTAINERD_CONFIG
+      value: /var/lib/rancher/rke2/agent/etc/containerd/config.toml.tmpl
+    - name: CONTAINERD_SOCKET
+      value: /run/k3s/containerd/containerd.sock
+    - name: CONTAINERD_RUNTIME_CLASS
+      value: nvidia
+    - name: CONTAINERD_SET_AS_DEFAULT
+      value: "true"
diff --git a/lib/main_micro_alp.pm b/lib/main_micro_alp.pm
index ed6dce7d6955..5aef18a82d9a 100644
--- a/lib/main_micro_alp.pm
+++ b/lib/main_micro_alp.pm
@@ -16,11 +16,13 @@ use main_ltp_loader 'load_kernel_tests';
 use main_containers qw(load_container_tests is_container_test load_container_engine_test);
 use main_publiccloud qw(load_publiccloud_download_repos);
 use main_security qw(load_security_tests is_security_test);
-use testapi qw(check_var get_required_var get_var set_var);
+use testapi qw(check_var get_required_var get_var set_var record_info);
 use version_utils;
 use utils;
 use Utils::Architectures;
 use Utils::Backends;
+use Data::Dumper;
+
 
 sub is_image {
     return get_required_var('FLAVOR') =~ /image|default|kvm|base/i;
@@ -311,6 +313,13 @@ sub load_slem_on_pc_tests {
     }
     if (get_var('PUBLIC_CLOUD_LTP', 0)) {
         loadtest("publiccloud/run_ltp", run_args => $args);
+    } elsif (get_var('PUBLIC_CLOUD_AISTACK')) {
+        # AISTACK test verification
+        loadtest("publiccloud/ssh_interactive_start", run_args => $args);
+        loadtest("publiccloud/create_aistack_env", run_args => $args);
+        loadtest("publiccloud/ssh_interactive_end", run_args => $args);
+        #loadtest("publiccloud/", run_args => $args);
+        #loadtest("publiccloud/", run_args => $args);
     } elsif (is_container_test) {
         loadtest("publiccloud/ssh_interactive_start", run_args => $args);
         loadtest("publiccloud/instance_overview", run_args => $args);
@@ -321,10 +330,8 @@ sub load_slem_on_pc_tests {
             $run_args->{runtime} = $_;
             load_container_engine_test($run_args);
         }
-        loadtest("publiccloud/ssh_interactive_end", run_args => $args);
-    }
-    else {
+    } else {
         loadtest "publiccloud/check_services", run_args => $args;
         loadtest("publiccloud/slem_basic", run_args => $args);
     }
diff --git a/tests/publiccloud/create_aistack_env.pm b/tests/publiccloud/create_aistack_env.pm
new file mode 100644
index 000000000000..fc32aa01f8fd
--- /dev/null
+++ b/tests/publiccloud/create_aistack_env.pm
@@ -0,0 +1,247 @@
+# SUSE's openQA tests
+#
+# Copyright 2024 SUSE LLC
+# SPDX-License-Identifier: FSFAP
+
+# Basic aistack test
+#
+# Summary: This test performs the following actions
+#  - Create a VM in EC2 using a SLE-Micro-BYOS image, version 6.0 or later
+#  - Install the required dependencies to install the aistack helm chart
+#  - Test access to OpenWebUI and run integration tests with Ollama and MilvusDB
+# Maintainer: Yogalakshmi Arunachalam
+
+use Mojo::Base 'publiccloud::basetest';
+use testapi;
+use serial_terminal;
+use publiccloud::utils;
+use publiccloud::ssh_interactive;
+use containers::k8s;
+use strict;
+use warnings;
+use utils;
+use transactional qw(process_reboot trup_call);
+use File::Basename;
+use version_utils;
+
+sub install_dependency_package {
+    my ($instance) = @_;
+    my $rke2_url = get_var('RKE2_URL');
+    my $kubectl_url = get_var('KUBECTL_URL');
+    my $helm_url = get_var('HELM_URL');
+
+    record_info('Dep pkg install');
+    trup_call("pkg install curl git docker");
+    process_reboot(trigger => 1);
+    systemctl("enable docker");
+    systemctl("start docker");
+    systemctl("status docker");
+    script_run("curl -sSL $rke2_url -o ./install_rke2.sh && chmod 775 ./install_rke2.sh");
+    script_run("sh ./install_rke2.sh");
+    script_run("echo 'export PATH=\$PATH:/opt/rke2/bin' >> ~/.bashrc");
+    systemctl("enable rke2-server.service");
+    systemctl("start rke2-server.service");
+    systemctl("status rke2-server.service");
+    script_run("rke2 --version");
+    script_run("curl -sSL $helm_url -o ./install_helm.sh && chmod 775 ./install_helm.sh");
+    script_run("sh ./install_helm.sh");
+    script_run("helm version");
+    script_run("curl -sSL $kubectl_url -o ./kubectl && chmod +x ./kubectl");
+    script_run("sudo mv ./kubectl /usr/local/bin/");
+    script_run("kubectl version --client");
+}
+
+sub install_dependency_components {
+    my ($instance) = @_;
+    my $cert_repo = get_var('HELM_CERTS');
+    my $ingress_repo = get_var('HELM_INGRESS');
+    my $ing_ver = get_var('ING_VERSION');
+    # Add the ingress controller serving the open-webui endpoint
+    assert_script_run("helm repo add $ingress_repo");
+    assert_script_run("helm repo update");
+    assert_script_run("helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx --namespace ingress-nginx --set controller.service.type=ClusterIP --version $ing_ver --create-namespace", timeout => 120);
+
+    # Add the cert-manager repo and install it
+    assert_script_run("helm repo add $cert_repo");
+    assert_script_run("helm repo update");
+    assert_script_run("helm install cert-manager jetstack/cert-manager --namespace cert-manager --create-namespace --version v1.15.2 --set crds.enabled=true", timeout => 120);
+}
+
+sub config_kubectl {
+    my ($instance) = @_;
+    # Configure kubectl to use the RKE2 kubeconfig
+    assert_script_run("mkdir -p ~/.kube");
+    assert_script_run("sudo cp /etc/rancher/rke2/rke2.yaml ~/.kube/config");
+    assert_script_run("kubectl config get-contexts");
+    assert_script_run("kubectl config use-context default");
+    assert_script_run("kubectl config view");
+}
+
+sub install_aistack_chart {
+    my ($instance, $ai_chart_repo, $namespace, $vf_name) = @_;
+    my $SECRET_application_collection = get_var('_SECRET_DOCKER');
+    my $docker_user_name = 'yarunachalam@suse.com';
+    #my $repo_url = 'github.com/SUSE/private-ai-charts';
+    my $repo_url = get_var('HELM_CHARTS');
+    my $git_token = get_var('_SECRET_SSH');
+    my $local_path_url = 'https://raw.githubusercontent.com/yarunachalam/os-autoinst-distri-opensuse/aistack_basic/data/aistack/local_path_storage.yaml';
+
+    record_info('AISTACK charts install');
+    assert_script_run("helm list --all-namespaces");
+    assert_script_run("kubectl get pods --all-namespaces");
+
+    # Access to the Application Collection registry needs the docker username and password
+    assert_script_run("kubectl create ns $namespace");
+    assert_script_run("kubectl create secret docker-registry application-collection --docker-server=dp.apps.rancher.io --docker-username='$docker_user_name' --docker-password='$SECRET_application_collection' -n $namespace", timeout => 120);
+
+    # Install private-ai-stack
+    my $gitlab_clone_url = 'https://git:' . $git_token . '@' . $repo_url;
+    assert_script_run("git clone $gitlab_clone_url");
+    assert_script_run("curl -o $vf_name $ai_chart_repo", timeout => 120);
+
+    # local_path_storage.yaml is a copy of https://raw.githubusercontent.com/rancher/local-path-provisioner/v0.0.28/deploy/local-path-storage.yaml
+    assert_script_run("kubectl apply -f $local_path_url", timeout => 120);
+    assert_script_run("helm upgrade --install suse-private-ai private-ai-charts --namespace $namespace --create-namespace --values $vf_name --set open-webui.ingress.class=nginx", timeout => 600);
+    assert_script_run("kubectl get all --namespace $namespace");
+    sleep 180;
+
+    # Check pod status and logs:
+    # loop through each pod; if its status is Running or a terminal failure state
+    # (Error, Failed, CrashLoopBackOff, ContainerStatusUnknown), record it and skip to the next pod.
+    # For any other status, check the log for Failure/Error/Exception, record the log and skip to the next pod;
+    # otherwise keep polling until max_retries is reached, then record the pods that never reached
+    # a running or failed state.
+    my $max_retries = 15;
+    my $sleep_interval = 10;
+    my @out = split(' ', script_output("kubectl get pods --namespace $namespace -o custom-columns=':metadata.name'"));
+    record_info("Pod names", join(" ", @out));
+  POD_LOOP: foreach my $pod (@out) {
+        my $counter = 0;
+        while ($counter++ < $max_retries) {
+            my $status = script_output("kubectl get pod $pod -n $namespace -o=jsonpath='{.status.phase}'", proceed_on_failure => 1);
+            my $logs = script_output("kubectl logs $pod -n $namespace", proceed_on_failure => 1);
+            if ($status eq 'Running') {
+                record_info("$pod is running");
+                next POD_LOOP;
+            } elsif ($status =~ /^(Error|Failed|CrashLoopBackOff|ContainerStatusUnknown)$/) {
+                record_info("$pod failed with status $status, log: $logs");
+                next POD_LOOP;
+            } else {
+                if ($logs =~ /ERROR|FAILURE|Exception|Failed/) {
+                    record_info("$pod failed due to error in log: $logs");
+                    next POD_LOOP;
+                }    # if log
+                sleep $sleep_interval;
+            }    # if status
+        }    # while loop
+        record_info("$pod is not running after $max_retries retries");
+    }    # pod loop
+
+    assert_script_run("kubectl get all --namespace $namespace");
+    record_info("Logs of the pods that are not in Running, Pending or Completed state");
+    foreach my $pod (@out) {
+        my $status = script_output("kubectl get pod $pod -n $namespace -o=jsonpath='{.status.phase}'", proceed_on_failure => 1);
+        if ($status !~ /^(Running|Pending|Completed)$/) {
+            my $logs = script_output("kubectl logs $pod -n $namespace", proceed_on_failure => 1);
+            record_info("$pod is in $status state. Logs:\n$logs\n");
+        }
+    }    # pod loop
+}
+
+sub test_openwebui_service {
+    my ($instance, $namespace) = @_;
+    my $host_name = 'suse-ollama-webui';
+    #my $host_name = get_var('OPENWEBUI_HOSTNAME');
+    record_info('OpenWebUI service');
+
+    # After successful installation, get the open-webui ingress IP address, add it to /etc/hosts and verify connectivity
+    assert_script_run("kubectl get ingress --namespace $namespace -o json");
+    my $ipaddr = script_output("kubectl get ingress -n $namespace -o jsonpath='{.items[0].status.loadBalancer.ingress[0].ip}'");
+    assert_script_run("echo \"$ipaddr $host_name\" | sudo tee -a /etc/hosts > /dev/null");
+    set_var('OPENWEBUI_IP', "$ipaddr");
+    record_info("Added $ipaddr to /etc/hosts with hostname $host_name");
+    my $curl_cmd = "curl -v -k https://$host_name";
+    my $curl_result = script_run($curl_cmd);
+    if ($curl_result == 0) {
+        record_info("Successfully connected to the open-webui service at $curl_cmd");
+    } else {
+        die "Unable to connect to the open-webui service at $curl_cmd\n";
+    }
+}
+
+sub install_nvidia_drivers {
+    my ($instance, $values_url, $file_name) = @_;
+    # Reference installer: https://gitlab.nue.suse.com/cloud-solutions-sys-eng/nvidia-drivers-easy-install/-/blob/main/nvidia_easy_install.sh
+    record_info('Install nvidia drivers');
+    my $driver_version = '550.54.14';
+    #my $driver_version = script_output("zypper se -s nvidia-open-driver | grep nvidia-open-driver- | sed 's/.* package | //g' | sed 's/\s.*//g' | sort | head -n 1 | sed 's/[-_].*//g'");
+    #my $gpu_op_url = get_var('GPU_OPERATOR');
+    my $gpu_op_url = 'nvidia https://nvidia.github.io/gpu-operator';
+
+    script_run("sudo zypper ar https://download.nvidia.com/suse/sle15sp6/ nvidia-sle15sp6-main");
+    script_run("sudo zypper --gpg-auto-import-keys refresh");
+    trup_call("pkg install -y --auto-agree-with-licenses nvidia-open-driver-G06-signed-kmp=$driver_version nvidia-compute-utils-G06=$driver_version");
+
+    record_info('Install nvidia gpu operator');
+    assert_script_run("curl -o $file_name $values_url", timeout => 120);
+    assert_script_run("helm repo add $gpu_op_url", timeout => 600);
+    assert_script_run("helm repo update", timeout => 600);
+    assert_script_run("helm repo list", timeout => 600);
+    assert_script_run("helm install gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator --set driver.enabled=false -f $file_name", timeout => 600);
+
+    # Reboot to activate the driver, then verify the driver and the gpu-operator pods
+    process_reboot(trigger => 1);
+    script_run("sudo nvidia-smi");
+    script_run("rpm -q nvidia-open-driver-G06-signed-kmp");
+    script_run("kubectl get pods -n gpu-operator");
+}
+
+sub run {
+    my ($self, $args) = @_;
+    #my $values_url = 'https://raw.githubusercontent.com/yarunachalam/os-autoinst-distri-opensuse/aistack_basic/data/aistack/';
+    my $values_url = get_var('HELM_VALUES');
+    my $ai_ns = 'suse-private-ai';
+    my $value_file_name = '';
+
+    my $instance = $self->{my_instance} = $args->{my_instance};
+    my $provider = $self->{provider} = $args->{my_provider};
+
+    # Install dependency packages and components
+    install_dependency_package($instance);
+    config_kubectl($instance);
+    install_dependency_components($instance);
+
+    if (check_var('PUBLIC_CLOUD_NVIDIA_GPU_AISTACK', 1)) {
+        my $gpu_url = $values_url;
+        my $gpu_values = 'nvidia_gpu_values.yaml';
+        $gpu_url .= "$gpu_values";
+        install_nvidia_drivers($instance, $gpu_url, $gpu_values);
+        $value_file_name = 'aistack_gpu_values.yaml';
+        $values_url .= "$value_file_name";
+    } else {
+        $value_file_name = 'aistack_values.yaml';
+        $values_url .= "$value_file_name";
+    }
+
+    # Install the private_ai_stack chart
+    install_aistack_chart($instance, $values_url, $ai_ns, $value_file_name);
+
+    # OpenWebUI service test
+    test_openwebui_service($instance, $ai_ns);
+    record_info('End of AISTACK_BASIC');
+}
+
+1;
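For reviewers trying this locally: `create_aistack_env.pm` takes all of its inputs from openQA settings via `get_var`. The sketch below only lists the variable names actually referenced by the new module and scheduler branch; every value shown is a placeholder or an assumption (the real installer URLs, chart repository location and secrets are not part of this change).

```yaml
# Illustrative openQA test-suite settings for the AISTACK flow (placeholder values only)
PUBLIC_CLOUD_AISTACK: 1                 # schedules publiccloud/create_aistack_env via main_micro_alp.pm
PUBLIC_CLOUD_NVIDIA_GPU_AISTACK: 0      # 1 = run install_nvidia_drivers and use aistack_gpu_values.yaml
RKE2_URL: https://get.rke2.io           # RKE2 install script
HELM_URL: https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
KUBECTL_URL: https://dl.k8s.io/release/v1.30.0/bin/linux/amd64/kubectl
HELM_INGRESS: ingress-nginx https://kubernetes.github.io/ingress-nginx   # "name url" pair passed to helm repo add
HELM_CERTS: jetstack https://charts.jetstack.io
ING_VERSION: 4.11.1
HELM_CHARTS: example.com/suse/private-ai-charts.git   # placeholder; cloned with the _SECRET_SSH token
HELM_VALUES: https://example.com/data/aistack/        # placeholder; base URL serving the values files above
_SECRET_DOCKER: placeholder-registry-password          # dp.apps.rancher.io credential
_SECRET_SSH: placeholder-git-token
```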