diff --git a/slinky-azimuth/.helmignore b/slinky-azimuth/.helmignore new file mode 100644 index 00000000..0e8a0eb3 --- /dev/null +++ b/slinky-azimuth/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/slinky-azimuth/Chart.yaml b/slinky-azimuth/Chart.yaml new file mode 100644 index 00000000..241f3015 --- /dev/null +++ b/slinky-azimuth/Chart.yaml @@ -0,0 +1,27 @@ +apiVersion: v2 +name: slinky-azimuth +description: A containerised Slurm cluster. +icon: https://raw.githubusercontent.com/SlinkyProject/slurm-operator/main/docs/_static/images/slinky.svg +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "0.4.0" + +annotations: + azimuth.stackhpc.com/label: Slinky diff --git a/slinky-azimuth/README.md b/slinky-azimuth/README.md new file mode 100644 index 00000000..807ea5d4 --- /dev/null +++ b/slinky-azimuth/README.md @@ -0,0 +1,59 @@ +# slinky-azimuth + +Installs cert-manager, the Slurm Operator and a Slurm control plane into the +target Kubernetes cluster. + +Only available when installed with the Azimuth Apps Operator. Requires the +`managementInstall` field in its AppTemplate spec to be set to `true`, e.g.: + +```yaml +apiVersion: apps.azimuth-cloud.io/v1alpha1 +kind: AppTemplate +metadata: + name: slinky + # Access-control annotations, if required + # https://azimuth-config.readthedocs.io/en/latest/configuration/13-access-control/#annotations + # annotations: + # acl.azimuth.stackhpc.com/allow-list: "" + # acl.azimuth.stackhpc.com/deny-list: "" + # acl.azimuth.stackhpc.com/allow-regex: "" + # acl.azimuth.stackhpc.com/deny-regex: "" +spec: + # The chart and versions to use + chart: + repo: https://azimuth-cloud.github.io/azimuth-charts + name: slinky-azimuth + # The range of versions to consider + # Here, we consider all stable versions (the default) + versionRange: ">=0.0.0" + + managementInstall: true + + # Synchronisation options + # The number of versions to make available + keepVersions: 5 + # The frequency at which to check for new versions + syncFrequency: 86400 + + # Default values for the deployment, applied on top of the chart defaults + defaultValues: {} +``` + +## Current Limitations + +- There are currently no services exposed to access the cluster. Accessing + the cluster requires access to the tenancy's kubeconfig; the Slurm controller + can then be accessed with + `kubectl --namespace=slurm exec -it statefulsets/slurm-controller -- bash --login` +- Slinky currently doesn't clean up the PVC created for its database on uninstall. + Future deployments don't have the credentials to access it, causing the + installation to fail.
This can be worked around by manually deleting the + `slurm-mariadb` and `slurm-controller` PVCs created by the chart. + Future releases will require + [patching a post-delete hook](https://github.com/azimuth-cloud/azimuth-charts/blob/bed545c4c2d14a4c3f70f2896adf44cb3878b6a2/slinky-azimuth/templates/cleanup-pvcs.yml) + into a wrapper chart or fork of the upstream Slurm control plane chart. +- cert-manager is currently installed as a dependency of this app. Running multiple + cert-manager instances may lead to conflicts, so only one instance of this app + per cluster is recommended. Future updates could address this by removing the + cert-manager dependency and instead ensuring its existence per cluster + via patches to the Azimuth Apps Operator. diff --git a/slinky-azimuth/azimuth-ui.schema.yaml b/slinky-azimuth/azimuth-ui.schema.yaml new file mode 100644 index 00000000..a72c85c3 --- /dev/null +++ b/slinky-azimuth/azimuth-ui.schema.yaml @@ -0,0 +1 @@ +controls: {} \ No newline at end of file diff --git a/slinky-azimuth/templates/NOTES.txt b/slinky-azimuth/templates/NOTES.txt new file mode 100644 index 00000000..c2e18974 --- /dev/null +++ b/slinky-azimuth/templates/NOTES.txt @@ -0,0 +1,10 @@ +Slinky (https://slurm.schedmd.com/slinky.html) is a bridge between Slurm and Kubernetes, +allowing Slurm nodes and job scheduling to be run on a Kubernetes cluster. + +This chart is designed to work with Azimuth (Standalone mode), +a self-service portal for applications on Kubernetes. + +This chart installs the Flux HelmRepository sources and HelmReleases into the release namespace, +which then deploy cert-manager, slurm-operator and slurm into targetNamespace ("default" by default). + +CURRENTLY IN ALPHA AND UNSTABLE.
\ No newline at end of file diff --git a/slinky-azimuth/templates/cert-helm.yaml b/slinky-azimuth/templates/cert-helm.yaml new file mode 100644 index 00000000..49cc1b3a --- /dev/null +++ b/slinky-azimuth/templates/cert-helm.yaml @@ -0,0 +1,32 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: cert-manager +spec: + targetNamespace: {{ .Values.targetNamespace | quote }} + storageNamespace: {{ .Values.targetNamespace | quote }} + kubeConfig: + secretRef: + name: {{ .Values.kubeconfig.name | quote }} + key: {{ .Values.kubeconfig.key | quote }} + interval: 10m + timeout: 5m + chart: + spec: + chart: cert-manager + sourceRef: + kind: HelmRepository + name: jetstack-charts + interval: 5m + install: + remediation: + retries: 3 + createNamespace: true + upgrade: + remediation: + retries: 3 + driftDetection: + mode: enabled + values: + installCRDs: true diff --git a/slinky-azimuth/templates/jetstack-source.yaml b/slinky-azimuth/templates/jetstack-source.yaml new file mode 100644 index 00000000..061a2ad8 --- /dev/null +++ b/slinky-azimuth/templates/jetstack-source.yaml @@ -0,0 +1,8 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: jetstack-charts +spec: + interval: 10m0s + url: https://charts.jetstack.io diff --git a/slinky-azimuth/templates/slurm-operator-release.yaml b/slinky-azimuth/templates/slurm-operator-release.yaml new file mode 100644 index 00000000..4c411213 --- /dev/null +++ b/slinky-azimuth/templates/slurm-operator-release.yaml @@ -0,0 +1,37 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: slurm-operator +spec: + targetNamespace: {{ .Values.targetNamespace | quote }} + storageNamespace: {{ .Values.targetNamespace | quote }} + kubeConfig: + secretRef: + name: {{ .Values.kubeconfig.name | quote }} + key: {{ .Values.kubeconfig.key | quote }} + chart: + spec: + chart: slurm-operator + sourceRef: + kind: HelmRepository + name: slurm-operator-repo + version: ">0.1.0-dev.0.stackhpc-0-3-0.0" + dependsOn: + - name: cert-manager
+ install: + createNamespace: true + remediation: + retries: -1 + upgrade: + remediation: + retries: -1 + interval: 11m + timeout: 10m + values: + operator: + image: + tag: "0.3.0" + webhook: + image: + tag: "0.3.0" diff --git a/slinky-azimuth/templates/slurm-release.yaml b/slinky-azimuth/templates/slurm-release.yaml new file mode 100644 index 00000000..2dd1f4b5 --- /dev/null +++ b/slinky-azimuth/templates/slurm-release.yaml @@ -0,0 +1,93 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: slurm +spec: + targetNamespace: {{ .Values.targetNamespace | quote }} + storageNamespace: {{ .Values.targetNamespace | quote }} + kubeConfig: + secretRef: + name: {{ .Values.kubeconfig.name | quote }} + key: {{ .Values.kubeconfig.key | quote }} + chart: + spec: + chart: slurm + sourceRef: + kind: HelmRepository + name: slurm-operator-repo + version: ">0.1.0-dev.0.stackhpc-0-3-0.0" + dependsOn: + - name: slurm-operator + install: + createNamespace: true + remediation: + retries: -1 + upgrade: + remediation: + retries: -1 + interval: 11m + timeout: 10m + driftDetection: + mode: disabled + values: + global: + security: + allowInsecureImages: true # TODO: remove after upgrade + mariadb: + image: + repository: bitnamilegacy/mariadb # TODO: remove after upgrade + slurm: + extraSlurmConf: + MCSPlugin: mcs/label + MCSParameters: ondemand,ondemandselect + compute: + nodesets: + - name: compute + replicas: {{ .Values.nodes }} + enabled: true + imagePullPolicy: IfNotPresent + image: + repository: "" + tag: "" + priorityClassName: "" + nodeSelector: + kubernetes.io/os: linux + affinity: {} + tolerations: + - effect: "NoExecute" + operator: "Exists" + key: slinky.slurm.net/managed-node + resources: {} + useResourceLimits: true + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 20% + persistentVolumeClaimRetentionPolicy: + whenDeleted: Retain + whenScaled: Retain + volumeClaimTemplates: [] + extraVolumeMounts: [] + extraVolumes: [] + nodeConfig: {} + partition:
+ enabled: false + partitions: + - name: batch + nodesets: + - compute + enabled: true + config: + State: UP + MaxTime: UNLIMITED + Default: "YES" + PriorityTier: 1 + - name: apps + nodesets: + - compute + enabled: true + config: + State: UP + MaxTime: UNLIMITED + PriorityTier: 65533 diff --git a/slinky-azimuth/templates/slurm-source.yaml b/slinky-azimuth/templates/slurm-source.yaml new file mode 100644 index 00000000..f77e1017 --- /dev/null +++ b/slinky-azimuth/templates/slurm-source.yaml @@ -0,0 +1,8 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: slurm-operator-repo +spec: + interval: 10m0s + url: https://stackhpc.github.io/slurm-operator diff --git a/slinky-azimuth/values.schema.json b/slinky-azimuth/values.schema.json new file mode 100644 index 00000000..11102e5d --- /dev/null +++ b/slinky-azimuth/values.schema.json @@ -0,0 +1,14 @@ +{ + "$schema": "http://json-schema.org/schema#", + "type": "object", + "properties": { + "nodes": { + "type": "integer", + "title": "Compute nodes", + "description": "The number of nodes to spawn slurmd pods on", + "minimum": 1, + "default": 1 + } + }, + "required": ["nodes"] +} \ No newline at end of file diff --git a/slinky-azimuth/values.yaml b/slinky-azimuth/values.yaml new file mode 100644 index 00000000..9e16987b --- /dev/null +++ b/slinky-azimuth/values.yaml @@ -0,0 +1,22 @@ +# Default values for slinky-azimuth. + +targetNamespace: default + +kubeconfig: + # name: + # key: + +nodes: 1 + +# This section is for setting up autoscaling. More information can be found here: https://kubernetes.io/docs/concepts/workloads/autoscaling/ +# autoscaling: +# enabled: false +# minReplicas: 1 +# maxReplicas: 10 +# targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + +zenithClient: + iconUrl: https://raw.githubusercontent.com/SlinkyProject/slurm-operator/main/docs/_static/images/slinky.svg + description: "" + label: "Slinky"