diff --git a/infrastructure/cluster/flux/rucio/rucio-daemons.yaml b/infrastructure/cluster/flux/rucio/rucio-daemons.yaml new file mode 100644 index 0000000..de3ec4c --- /dev/null +++ b/infrastructure/cluster/flux/rucio/rucio-daemons.yaml @@ -0,0 +1,521 @@ +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: daemons + namespace: rucio + +spec: + releaseName: daemons + interval: 30s + chart: + spec: + sourceRef: + kind: HelmRepository + name: rucio-charts + namespace: rucio + interval: 1m + chart: rucio-daemons + version: 34.0.4 + + valuesFrom: + - kind: Secret + name: rucio-db + valuesKey: dbfullstring + targetPath: config.database.default + # - kind: Secret + # name: hermes-secret + # valuesKey: values.yaml + + values: + additionalSecrets: + - volumeName: idpsecrets + secretName: idpsecrets + mountPath: /opt/rucio/etc/idpsecrets.json + subPath: idpsecrets.json + - volumeName: cafile + secretName: cafile + mountPath: /etc/grid-security/ca.pem + subPath: ca.pem + # - volumeName: rucio-x509up + # secretName: rucio-x509up + # mountPath: /tmp/x509up + # subPath: x509up + # - volumeName: rse-accounts + # secretName: rse-accounts + # mountPath: /opt/rucio/etc/rse-accounts.cfg + # subPath: rse-accounts.cfg + + # TO START WITH + + # https://rucio.cern.ch/documentation/operator/transfers/transfers-overview/#daemon-overview + + abacusAccountCount: 1 + abacusCollectionReplicaCount: 1 + abacusRseCount: 1 + automatixCount: 1 + cacheConsumerCount: 0 + conveyorTransferSubmitterCount: 2 + conveyorPollerCount: 2 + conveyorFinisherCount: 1 + conveyorReceiverCount: 0 + conveyorStagerCount: 0 + conveyorThrottlerCount: 0 + conveyorPreparerCount: 0 # for debugging, if it is not there the submitter will do the path computation adn source replica selection, and since the preparer needs a secret but I dont know of which kind, let's try without + darkReaperCount: 0 + hermesCount: 2 + hermes2Count: 0 + judgeCleanerCount: 1 + judgeEvaluatorCount: 1 + judgeInjectorCount: 1 + judgeRepairerCount: 1 + oauthManagerCount: 1 + undertakerCount: 1 + reaperCount: 2 + replicaRecovererCount: 0 + transmogrifierCount: 1 + tracerKronosCount: 0 + minosCount: 1 + minosTemporaryExpirationCount: 0 + necromancerCount: 1 + + image: + repository: rucio/rucio-daemons + tag: release-34.6.0 + pullPolicy: Always + + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + + podLabels: {} + podAnnotations: {} + + minReadySeconds: 5 + + # monitoring: + # enabled: true + # serviceMonitorEnabled: true + # exporterPort: 8080 + # targetPort: 8080 + # interval: 30s + # telemetryPath: /metrics + # namespace: monitoring + # labels: + # release: prometheus-operator + + abacusAccount: + threads: 1 + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "200Mi" + cpu: "700m" + + abacusCollectionReplica: + threads: 1 + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "200Mi" + cpu: "700m" + + abacusRse: + fillHistoryTable: 0 + threads: 1 + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "200Mi" + cpu: "700m" + + automatix: + threads: 1 + sleepTime: 30 + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "100Mi" + cpu: "100m" + + cacheConsumer: + threads: 1 + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "100Mi" + cpu: "100m" + + conveyorTransferSubmitter: + threads: 3 # atlas has 10 + podAnnotations: {} + activities: "'Analysis Input' 'Analysis Output' 'Data Brokering' 'Data Consolidation' 'Data rebalancing' 'Debug' 'Express' 'Functional Test' 'Group Subscriptions' 'Production Input' 'Production Output' 'Recovery' 'Staging' 'T0 Export' 'T0 Export' 'T0 Tape' 'User Subscriptions' 'default' 'DAC21'" + sleepTime: 10 + archiveTimeout: "" + bulk: 1000 + groupBulk: 200 + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "200Mi" + cpu: "700m" + + conveyorPoller: + activities: "'Analysis Input' 'Analysis Output' 'Data Brokering' 'Data Consolidation' 'Data rebalancing' 'Debug' 'Express' 'Functional Test' 'Group Subscriptions' 'Production Input' 'Production Output' 'Recovery' 'Staging' 'T0 Export' 'T0 Export' 'T0 Tape' 'User Subscriptions' 'default' 'DAC21'" + threads: 3 + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "200Mi" + cpu: "700m" + + conveyorFinisher: + threads: 2 + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "200Mi" + cpu: "700m" + + conveyorReceiver: + threads: 2 + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "200Mi" + cpu: "700m" + + conveyorThrottler: + threads: 1 + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "200Mi" + cpu: "700m" + + conveyorPreparer: + threads: 2 + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "200Mi" + cpu: "700m" + + darkReaper: + workers: 1 + chunkSize: "10" + scheme: "" + rses: "" + includeRses: "" + excludeRses: "" + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "200Mi" + cpu: "700m" + + hermes: + threads: 2 + podAnnotations: {} + bulk: 1000 + sleepTime: 10 + brokerTimeout: 3 + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "200Mi" + cpu: "700m" + + hermes2: + threads: 1 + podAnnotations: {} + bulk: 1000 + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "200Mi" + cpu: "700m" + + judgeCleaner: + threads: 2 + podAnnotations: {} + resources: + limits: + memory: "1200Mi" + cpu: "1000m" + requests: + memory: "400Mi" + cpu: "1000m" + + judgeEvaluator: + threads: 2 + podAnnotations: {} + resources: + limits: + memory: "3000Mi" + cpu: "2000m" + requests: + memory: "3000Mi" + cpu: "2000m" + + judgeInjector: + threads: 2 + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "200Mi" + cpu: "700m" + + judgeRepairer: + threads: 2 + podAnnotations: {} + resources: + limits: + memory: "6000Mi" + cpu: "1000m" + requests: + memory: "800Mi" + cpu: "1000m" + + oauthManager: + threads: 1 + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "200Mi" + cpu: "700m" + + undertaker: + threads: 1 + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "200Mi" + cpu: "700m" + + reaper: + greedy: 1 + scheme: "" + threads: 2 + chunkSize: 2 + includeRses: "" + podAnnotations: {} + resources: + limits: + memory: "1600Mi" + cpu: "2000m" + requests: + memory: "500Mi" + cpu: "400m" + + replicaRecoverer: + threads: 1 + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "200Mi" + cpu: "700m" + + transmogrifier: + threads: 1 + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "200Mi" + cpu: "700m" + + tracerKronos: + threads: 1 + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "200Mi" + cpu: "700m" + + minos: + threads: 1 + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "100Mi" + cpu: "100m" + + minosTemporaryExpiration: + threads: 1 + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "100Mi" + cpu: "100m" + + necromancer: + podAnnotations: {} + resources: + limits: + memory: "200Mi" + cpu: "700m" + requests: + memory: "100Mi" + cpu: "100m" + + ftsRenewal: + schedule: "12 */6 * * *" + enabled: 1 + image: + repository: rucio/fts-cron + tag: 34.0.0 + pullPolicy: Always + vos: + - vo: "escape" + voms: "escape" + servers: "https://fts3-devel.cern.ch:8446,https://fts3-pilot.cern.ch:8446" + script: "escape" + secretMounts: + - secretName: fts-cert + mountPath: /opt/rucio/certs + # subPath: usercert.pem + - secretName: fts-key + mountPath: /opt/rucio/certs + # subPath: userkey.pem + additionalEnvs: + - name: RUCIO_FTS_SECRETS + value: daemons-rucio-x509up + - name: USERCERT_NAME + value: usercert.pem + - name: USERKEY_NAME + value: userkey.pem + + resources: + limits: + cpu: 500m + memory: 256Mi + requests: + cpu: 100m + memory: 128Mi + + automaticRestart: + enabled: 1 + image: + repository: bitnami/kubectl + tag: latest + pullPolicy: IfNotPresent + # schedule: "7 1 * * *" + schedule: "10 */8 * * *" + selectorLabel: "app-group=rucio-daemons" + resources: + limits: + cpu: 500m + memory: 256Mi + requests: + cpu: 100m + memory: 128Mi + + config: + + oidc: + idpsecrets: "/opt/rucio/etc/idpsecrets.json" + admin_issuer: "escape" + expected_audience: "rucio" + # expected_scope: "openid profile" + + monitor: + enable_metrics: "True" + + policy: + permission: "escape" + schema: "escape" + + conveyor: + scheme: "srm,gsiftp,root,http,https" + transfertool: "fts3" + ftshosts: "https://fts3-pilot.cern.ch:8446" + cacert: "/etc/grid-security/ca.pem" + usercert: "/tmp/x509up" + allow_user_oidc_tokens: "True" + request_oidc_scope: "openid profile offline_access wlcg.groups email fts:submit-transfer" + request_oidc_audience: "fts" + + # this needs to be messaging_hermes and not messaging-hermes for some reason + + messaging_hermes: + port: "61113" + + nonssl_port: "61113" + use_ssl: "False" + destination: "/topic/eosc.rucio.events" #"/topic/escape.rucio.events" changed from escape + brokers: "dashb-mb.cern.ch" + voname: "escape" + + messaging_fts3: + port: "61123" + use_ssl: "False" + brokers: "dashb-mb.cern.ch" + voname: "escape" + + # credentials: + # gcs: "/opt/rucio/etc/rse-accounts.cfg" + # signature_lifetime: "3600" diff --git a/infrastructure/cluster/flux/rucio/rucio-servers.yaml b/infrastructure/cluster/flux/rucio/rucio-servers.yaml new file mode 100644 index 0000000..dddd5fc --- /dev/null +++ b/infrastructure/cluster/flux/rucio/rucio-servers.yaml @@ -0,0 +1,140 @@ +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: servers + namespace: rucio + annotations: + flux.weave.works/automated: "false" + +spec: + releaseName: servers + interval: 5m + chart: + spec: + sourceRef: + kind: HelmRepository + name: rucio-charts + namespace: rucio + interval: 1m + chart: rucio-server + version: 34.0.4 + valuesFrom: + - kind: Secret + name: rucio-db + valuesKey: dbfullstring + targetPath: config.database.default + + values: + secretMounts: + - secretName: server-gridca + mountPath: /etc/grid-security/certificates/ + # - secretName: server-hostcert + # mountPath: /etc/grid-security/hostcert.pem + # subPath: hostcert.pem + # - secretName: server-hostkey + # mountPath: /etc/grid-security/hostkey.pem + # subPath: hostkey.pem + # - secretName: server-cafile + # mountPath: /etc/grid-security/ca.pem + # subPath: ca.pem + # - secretName: rse-accounts + # mountPath: /opt/rucio/etc/rse-accounts.cfg + # subPath: rse-accounts.cfg + + additionalEnvs: + - name: RUCIO_HTTPD_GRID_SITE_ENABLED + value: "True" + - name: RUCIO_CA_PATH + value: "/etc/grid-security/certificates" + # - name: RUCIO_CA_PATH + # value: "/etc/grid-security/certificates" + - name: RUCIO_SSL_PROTOCOL + value: "-SSLv3 -TLSv1 -TLSv1.1 +TLSv1.2 +TLSv1.3" + # - name: RUCIO_HTTPD_PROXY_PROTOCOL_EXCEPTIONS + # valueFrom: + # fieldRef: + # fieldPath: status.hostIP + + replicaCount: 1 + useSSL: true + + image: + repository: rucio/rucio-server + tag: release-34.6.0 + pullPolicy: Always + imagePullSecrets: [] + # - name: docker-regcreds + + # The API server listens on port 6443 (by default). + # Therefore, expose the API server on port 443 and listen to 6443. + + + service: + type: LoadBalancer + port: 443 + targetPort: 443 + protocol: TCP + name: https + # # annotations: + # # # These annotations are only required for cluster templates <=1.18 + # # loadbalancer.openstack.org/network-id: "798d00f3-2af9-48a0-a7c3-a26d909a2d64" + # # service.beta.kubernetes.io/openstack-internal-load-balancer: "true" + # # loadbalancer.openstack.org/cascade-delete: "false" + + serverType: flask + + # automaticRestart: + # enabled: 1 + # image: + # repository: bitnami/kubectl + # tag: latest + # pullPolicy: IfNotPresent + # schedule: "01 0 * * *" + # selectorLabel: "'release in (servers-dev),app in (rucio-dev)'" + + # httpd_config: + # mpm_mode: "event" + # timeout: "300" + # enable_status: "True" + # legacy_dn: "True" + # keep_alive: "On" + # keep_alive_timeout: "5" + # max_keep_alive_requests: "128" + # server_limit: "10" + # start_servers: "4" + # thread_limit: "128" + # threads_per_child: "128" + # min_spare_threads: "256" + # max_spare_threads: "512" + # max_request_workers: "1280" + # max_connections_per_child: "2048" + # max_clients: "20" + # mpm_mode: "worker" + + ## values used to configure Rucio + config: + database: + pool_size: 10 + max_overflow: 20 + + # oidc: + # idpsecrets: "/opt/rucio/etc/idpsecrets.json" + # admin_issuer: "escape" + # expected_audience: "rucio" + # expected_scope: "openid profile" + + policy: + permission: "escape" + schema: "escape" + + # credentials: + # gcs: "/opt/rucio/etc/rse-accounts.cfg" + + serverResources: + limits: + cpu: "2000m" #"4000m" + memory: "1000Mi" + requests: + cpu: #"1000m" #"2000m" + memory: "500Mi" + diff --git a/infrastructure/cluster/flux/rucio/rucio-serversAuth.yaml b/infrastructure/cluster/flux/rucio/rucio-serversAuth.yaml new file mode 100644 index 0000000..1c64b7e --- /dev/null +++ b/infrastructure/cluster/flux/rucio/rucio-serversAuth.yaml @@ -0,0 +1,129 @@ +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: servers-auth + namespace: rucio + annotations: + flux.weave.works/automated: "false" + +spec: + releaseName: servers-auth + interval: 5m + chart: + spec: + sourceRef: + kind: HelmRepository + name: rucio-charts + namespace: rucio + interval: 1m + chart: rucio-server + version: 34.0.4 + valuesFrom: + - kind: Secret + name: rucio-db + valuesKey: dbfullstring + targetPath: config.database.default + + values: + secretMounts: + # - secretName: server-hostcert + # mountPath: /etc/grid-security/hostcert.pem + # subPath: hostcert.pem + # - secretName: server-hostkey + # mountPath: /etc/grid-security/hostkey.pem + # subPath: hostkey.pem + # - secretName: server-cafile + # mountPath: /etc/grid-security/ca.pem + # subPath: ca.pem + - secretName: idpsecrets + mountPath: /opt/rucio/etc/idpsecrets.json + subPath: idpsecrets.json + + additionalEnvs: + # - name: RUCIO_CA_PATH + # value: "/etc/grid-security/certificates" + - name: RUCIO_SSL_PROTOCOL + value: "-SSLv3 -TLSv1 -TLSv1.1 +TLSv1.2 +TLSv1.3" + # - name: RUCIO_HTTPD_PROXY_PROTOCOL_EXCEPTIONS + # valueFrom: + # fieldRef: + # fieldPath: status.hostIP + + + replicaCount: 1 + useSSL: true + + image: + repository: rucio/rucio-server + tag: release-34.6.0 + pullPolicy: Always + + service: + type: LoadBalancer + port: 443 + targetPort: 443 + protocol: TCP + name: https + # allocateLoadBalancerNodePorts: true + # externalTrafficPolicy: Cluster + # annotations: + # service.beta.kubernetes.io/openstack-internal-load-balancer: "true" # this just tells openstack that the loadbalanced services are internal to the cluster. + # loadbalancer.openstack.org/cascade-delete: "false" + + + serverType: flask + + # automaticRestart: + # enabled: 1 + # image: + # repository: bitnami/kubectl + # tag: latest + # pullPolicy: IfNotPresent + # schedule: "01 0 * * *" + # selectorLabel: "'release in (servers-auth-dev),app in (rucio-dev)'" + + # httpd_config: + # mpm_mode: "event" + # timeout: "300" + # enable_status: "True" + # legacy_dn: "True" + # keep_alive: "On" + # keep_alive_timeout: "5" + # max_keep_alive_requests: "128" + # server_limit: "10" + # start_servers: "4" + # thread_limit: "128" + # threads_per_child: "128" + # min_spare_threads: "256" + # max_spare_threads: "512" + # max_request_workers: "1280" + # max_connections_per_child: "2048" + # max_clients: "20" + # # mpm_mode: "worker" + + ## values used to configure Rucio + config: + database: + pool_size: 10 + max_overflow: 20 + + api: + endpoints: "ping, auth" + + # oidc: + # idpsecrets: "/opt/rucio/etc/idpsecrets.json" + # admin_issuer: "escape" + # expected_audience: "rucio" + # expected_scope: "openid profile" + + # policy: + # permission: "escape" + # schema: "escape" + + serverResources: + limits: + cpu: "2000m" #"4000m" + memory: "1000Mi" + requests: + cpu: #"1000m" #"2000m" + memory: "500Mi"