Make the post-restore cleanup opt-in and guard node joins against stale secrets.

- defaults: add rke2_cleanup_on_restore (default false).
- first_server.yml: drop the inline "Restore etcd" cleanup block.
- first_server_restore.yml: delete the node-password secret of every registered
  node except the current one, then delete nodes missing from the cluster group.
- main.yml: include first_server_restore.yml only when rke2_cleanup_on_restore
  is truthy.
- remaining_nodes.yml: before starting RKE2, fail when a node-password secret
  exists for rke2_node_name but /etc/rancher/node/password does not.

Review changes relative to the submitted patch:
- the secret-cleanup task now sets changed_when: false, like its sibling task;
- the include condition uses "| bool" so string values from -e are handled;
- Jinja spacing fixed in the failure message; blank lines between tasks evened.

NOTE(review): this patch was reflowed from a whitespace-collapsed copy; the YAML
indentation inside the hunks is reconstructed, so confirm it applies cleanly.
The post-image blob ids on the "index" lines predate the review changes.

diff --git a/defaults/main.yml b/defaults/main.yml
index ba84090..8744f72 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -228,6 +228,9 @@ rke2_etcd_snapshot_destination_dir: "{{ rke2_data_path }}/server/db/snapshots"
 rke2_snapshotter: "{{ rke2_snapshooter }}"
 rke2_snapshooter: overlayfs # legacy variable that only exists to keep backward compatibility with previous configurations
 
+# After an etcd restore, clean up node-password secrets and remove nodes that are no longer in the inventory
+rke2_cleanup_on_restore: false
+
 # Deploy RKE2 with default CNI canal
 rke2_cni: [canal]
 
diff --git a/tasks/first_server.yml b/tasks/first_server.yml
index 27b4947..2b94778 100644
--- a/tasks/first_server.yml
+++ b/tasks/first_server.yml
@@ -152,44 +152,6 @@
     - not ansible_check_mode
     - rke2_cni != 'none'
 
-- name: Restore etcd
-  when: do_etcd_restore is defined or do_etcd_restore_from_s3 is defined
-  block:
-    - name: Get registered nodes
-      ansible.builtin.shell:
-        cmd: |
-          set -o pipefail
-          {{ rke2_data_path }}/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml \
-          get nodes --no-headers | awk '{print $1}'
-      args:
-        executable: /bin/bash
-      changed_when: false
-      register: registered_node_names
-
-    - name: Get all node names
-      ansible.builtin.set_fact:
-        node_names: "{{ hostvars | dict2items | map(attribute='value.rke2_node_name') }}"
-      run_once: true
-      register: node_names
-
-    - name: Remove old .node-password.rke2 secrets
-      ansible.builtin.shell: |
-        {{ rke2_data_path }}/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml \
-        delete secret {{ item }}.node-password.rke2 -n kube-system 2>&1 || true
-      args:
-        executable: /bin/bash
-      with_items: "{{ registered_node_names.stdout_lines | difference(node_names) }}"
-      changed_when: false
-
-    - name: Remove old nodes
-      ansible.builtin.shell: |
-        {{ rke2_data_path }}/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml \
-        delete node {{ item }} 2>&1 || true
-      args:
-        executable: /bin/bash
-      with_items: "{{ registered_node_names.stdout_lines | difference(node_names) }}"
-      changed_when: false
-
 - name: Set an Active Server variable
   ansible.builtin.set_fact:
     active_server: "{{ inventory_hostname }}"
diff --git a/tasks/first_server_restore.yml b/tasks/first_server_restore.yml
index 5b4ab0f..2b0ed94 100644
--- a/tasks/first_server_restore.yml
+++ b/tasks/first_server_restore.yml
@@ -5,13 +5,23 @@
   args:
     executable: /bin/bash
   changed_when: false
-  register: node_names
+  register: registered_node_names
 
-- name: Restore etcd - remove old nodes
+- name: Restore etcd - cleanup .node-password.rke2 secrets
+  ansible.builtin.shell: |
+    {{ rke2_data_path }}/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml \
+    delete secret {{ item }}.node-password.rke2 -n kube-system 2>&1 || true
+  args:
+    executable: /bin/bash
+  with_items: "{{ registered_node_names.stdout_lines }}"
+  when: item != rke2_node_name
+  changed_when: false
+
+- name: Restore etcd - remove old (not existing) nodes
   ansible.builtin.shell: |
     {{ rke2_data_path }}/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml \
     delete node {{ item }} 2>&1 || true
   args:
     executable: /bin/bash
-  with_items: "{{ node_names.stdout_lines | difference(groups[rke2_cluster_group_name]) }}"
+  with_items: "{{ registered_node_names.stdout_lines | difference(groups[rke2_cluster_group_name]) }}"
   changed_when: false
diff --git a/tasks/main.yml b/tasks/main.yml
index 0858ec9..0a6a1f3 100644
--- a/tasks/main.yml
+++ b/tasks/main.yml
@@ -51,12 +51,13 @@
     - inventory_hostname == groups[rke2_servers_group_name].0
     - active_server is not defined or groups[rke2_cluster_group_name] | length | int == 1
 
-- name: Restore etcd specific tasks
+- name: Run cleanup tasks after etcd restore
   ansible.builtin.include_tasks: first_server_restore.yml
   when:
     - active_server is defined
     - inventory_hostname == active_server or inventory_hostname == groups[rke2_servers_group_name].0
     - do_etcd_restore is defined or do_etcd_restore_from_s3 is defined
+    - rke2_cleanup_on_restore | bool
 
 - name: Download kubeconfig to ansible localhost
   ansible.builtin.include_tasks: download_kubeconfig.yaml
diff --git a/tasks/remaining_nodes.yml b/tasks/remaining_nodes.yml
index e6cc0a7..3f74ca9 100644
--- a/tasks/remaining_nodes.yml
+++ b/tasks/remaining_nodes.yml
@@ -46,6 +46,46 @@
   when: (rke2_custom_registry_mirrors | length > 0 or rke2_custom_registry_configs | length > 0)
   notify: "Config file changed"
 
+- name: Get current nodes secrets
+  delegate_to: "{{ active_server }}"
+  run_once: true
+  block:
+    - name: Get list of existing node secrets
+      ansible.builtin.shell: |
+        set -o pipefail
+        "{{ rke2_data_path }}/bin/kubectl" --kubeconfig /etc/rancher/rke2/rke2.yaml \
+        get secrets -n kube-system -o jsonpath="{.items[*].metadata.name}" | tr ' ' '\n' | grep -E 'node-password\.rke2$' | sed 's/\.node-password\.rke2//g'
+      args:
+        executable: /bin/bash
+      register: nodes_with_passwords # A node name on each line
+      changed_when: false
+
+    - name: Set fact for existing node passwords
+      ansible.builtin.set_fact:
+        nodes_with_existing_passwords: "{{ nodes_with_passwords.stdout_lines }}"
+
+- name: Validate presence of node password file when secret exists
+  block:
+    - name: Register if rke2 password file exists
+      ansible.builtin.stat:
+        path: /etc/rancher/node/password
+      register: node_password_file
+
+    - name: Fail if the cluster already has a .node-password.rke2 secret and the node doesn't have a password file
+      ansible.builtin.fail:
+        msg: |
+          The node password secret already exists for node name {{ rke2_node_name }}, but no password file exists in /etc/rancher/node/password!
+          The node will not be able to join the cluster with this node name without a password file matching the secret.
+          This can happen for a few reasons:
+            - The node was previously part of the cluster and RKE2 was removed without running `kubectl delete node {{ rke2_node_name }}`.
+            - The cluster etcd was restored from a backup from before the node was correctly removed from the cluster.
+
+          To join this node, please recreate the file with the password, use a different node name (rke2_node_name), or remove the secret from the cluster using:
+          kubectl delete secret {{ rke2_node_name }}.node-password.rke2 -n kube-system
+      when:
+        - rke2_node_name in nodes_with_existing_passwords
+        - not node_password_file.stat.exists
+
 - name: Start RKE2 service on the rest of the nodes
   ansible.builtin.systemd:
     name: "{{ rke2_service_name }}"