From 3dc105b5e2ddeb68c0780f7fdfce16c2f6623726 Mon Sep 17 00:00:00 2001 From: Simon Ungar Felding <45149055+simonfelding@users.noreply.github.com> Date: Thu, 10 Jul 2025 17:48:14 +0200 Subject: [PATCH 1/7] add the fail condition --- tasks/first_server.yml | 38 ---------------------------------- tasks/first_server_restore.yml | 17 --------------- tasks/remaining_nodes.yml | 26 +++++++++++++++++++++++ 3 files changed, 26 insertions(+), 55 deletions(-) delete mode 100644 tasks/first_server_restore.yml diff --git a/tasks/first_server.yml b/tasks/first_server.yml index d65efad5..917faf07 100644 --- a/tasks/first_server.yml +++ b/tasks/first_server.yml @@ -152,44 +152,6 @@ - not ansible_check_mode - rke2_cni != 'none' -- name: Restore etcd - when: do_etcd_restore is defined or do_etcd_restore_from_s3 is defined - block: - - name: Get registered nodes - ansible.builtin.shell: - cmd: | - set -o pipefail - {{ rke2_data_path }}/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml \ - get nodes --no-headers | awk '{print $1}' - args: - executable: /bin/bash - changed_when: false - register: registered_node_names - - - name: Get all node names - ansible.builtin.set_fact: - node_names: "{{ hostvars | dict2items | map(attribute='value.rke2_node_name') }}" - run_once: true - register: node_names - - - name: Remove old .node-password.rke2 secrets - ansible.builtin.shell: | - {{ rke2_data_path }}/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml \ - delete secret {{ item }}.node-password.rke2 -n kube-system 2>&1 || true - args: - executable: /bin/bash - with_items: "{{ registered_node_names.stdout_lines | difference(node_names) }}" - changed_when: false - - - name: Remove old nodes - ansible.builtin.shell: | - {{ rke2_data_path }}/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml \ - delete node {{ item }} 2>&1 || true - args: - executable: /bin/bash - with_items: "{{ registered_node_names.stdout_lines | difference(node_names) }}" - changed_when: false - - name: Set an Active Server variable ansible.builtin.set_fact: active_server: "{{ inventory_hostname }}" diff --git a/tasks/first_server_restore.yml b/tasks/first_server_restore.yml deleted file mode 100644 index 5b4ab0f3..00000000 --- a/tasks/first_server_restore.yml +++ /dev/null @@ -1,17 +0,0 @@ -- name: Restore etcd - get all nodes - ansible.builtin.shell: | - {{ rke2_data_path }}/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml \ - get nodes --no-headers -o custom-columns=":metadata.name" - args: - executable: /bin/bash - changed_when: false - register: node_names - -- name: Restore etcd - remove old nodes - ansible.builtin.shell: | - {{ rke2_data_path }}/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml \ - delete node {{ item }} 2>&1 || true - args: - executable: /bin/bash - with_items: "{{ node_names.stdout_lines | difference(groups[rke2_cluster_group_name]) }}" - changed_when: false diff --git a/tasks/remaining_nodes.yml b/tasks/remaining_nodes.yml index e6cc0a7a..7fbb23be 100644 --- a/tasks/remaining_nodes.yml +++ b/tasks/remaining_nodes.yml @@ -46,6 +46,32 @@ when: (rke2_custom_registry_mirrors | length > 0 or rke2_custom_registry_configs | length > 0) notify: "Config file changed" +- name: Fail if the cluster already has a .node-password.rke2 secret and the node doesn't have a password file + block: + - name: Get list of existing node secrets + when: inventory_hostname is active_server + ansible.builtin.shell: | + {{ rke2_data_path }}/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml \ + get secrets -n kube-system -o jsonpath='{.items[*].metadata.name}' | tr ' ' '\n' | grep -E 'node-password\.rke2$' | sed s/.node-password.rke2//g + args: + executable: /bin/bash + register: nodes_with_existing_passwords # A node name on each line + - name: Set fact for existing node passwords + ansible.builtin.set_fact: + existing_node_passwords: "{{ nodes_with_existing_passwords.stdout_lines }}" + when: inventory_hostname is active_server + - name: Warn if the node password already exists and /etc/rancher/node/password does not exist on the node + ansible.builtin.fail: + msg: "The node password secret already exists for node name {{ rke2_node_name }}, but no password file exists in /etc/rancher/node/password!\n\ + The node will not be able to join the cluster with this node name without a password file matching the secret.\n\n\ + This can happen for a few reasons:\n\ + - The node was previously part of the cluster and RKE2 was removed without running `kubectl delete node {{ rke2_node_name }}`.\n\ + - The cluster etcd was restored from a backup from before the node was correctly removed from the cluster.\n\n\ + To join this node, please recreate the file with the password, use a different node name (rke2_node_name), or remove the secret from the cluster using:\n\ + kubectl delete secret {{ rke2_node_name}}.node-password.rke2 -n kube-system" + when: rke2_node_name in nodes_with_existing_passwords.stdout_lines + and not ansible.builtin.stat(path="/etc/rancher/node/password").stat.exists + - name: Start RKE2 service on the rest of the nodes ansible.builtin.systemd: name: "{{ rke2_service_name }}" From 8917178aa1d076b2f2f25143bbd28cc392d32f6d Mon Sep 17 00:00:00 2001 From: Simon Ungar Felding <45149055+simonfelding@users.noreply.github.com> Date: Thu, 10 Jul 2025 17:54:08 +0200 Subject: [PATCH 2/7] remove first_server_restore.yml from main.yml --- tasks/main.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tasks/main.yml b/tasks/main.yml index 5c42807f..72686c2e 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -51,13 +51,6 @@ - inventory_hostname == groups[rke2_servers_group_name].0 - active_server is not defined -- name: Restore etcd specific tasks - ansible.builtin.include_tasks: first_server_restore.yml - when: - - active_server is defined - - inventory_hostname == active_server or inventory_hostname == groups[rke2_servers_group_name].0 - - do_etcd_restore is defined or do_etcd_restore_from_s3 is defined - - name: Prepare and join remaining nodes of the cluster ansible.builtin.include_tasks: remaining_nodes.yml when: From 41bb39b75e7e3f11331553810aa3b482375df570 Mon Sep 17 00:00:00 2001 From: Simon Ungar Felding <45149055+simonfelding@users.noreply.github.com> Date: Thu, 10 Jul 2025 17:59:05 +0200 Subject: [PATCH 3/7] better variables --- tasks/remaining_nodes.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tasks/remaining_nodes.yml b/tasks/remaining_nodes.yml index 7fbb23be..25a90367 100644 --- a/tasks/remaining_nodes.yml +++ b/tasks/remaining_nodes.yml @@ -55,10 +55,10 @@ get secrets -n kube-system -o jsonpath='{.items[*].metadata.name}' | tr ' ' '\n' | grep -E 'node-password\.rke2$' | sed s/.node-password.rke2//g args: executable: /bin/bash - register: nodes_with_existing_passwords # A node name on each line + register: nodes_with_passwords # A node name on each line - name: Set fact for existing node passwords ansible.builtin.set_fact: - existing_node_passwords: "{{ nodes_with_existing_passwords.stdout_lines }}" + nodes_with_existing_passwords: "{{ nodes_with_passwords.stdout_lines }}" when: inventory_hostname is active_server - name: Warn if the node password already exists and /etc/rancher/node/password does not exist on the node ansible.builtin.fail: @@ -69,7 +69,7 @@ - The cluster etcd was restored from a backup from before the node was correctly removed from the cluster.\n\n\ To join this node, please recreate the file with the password, use a different node name (rke2_node_name), or remove the secret from the cluster using:\n\ kubectl delete secret {{ rke2_node_name}}.node-password.rke2 -n kube-system" - when: rke2_node_name in nodes_with_existing_passwords.stdout_lines + when: rke2_node_name in nodes_with_existing_passwords and not ansible.builtin.stat(path="/etc/rancher/node/password").stat.exists - name: Start RKE2 service on the rest of the nodes From ae322a760ca4dbd76cc6dfad6eb581e26e94e8a7 Mon Sep 17 00:00:00 2001 From: Simon Ungar Felding <45149055+simonfelding@users.noreply.github.com> Date: Thu, 10 Jul 2025 18:01:03 +0200 Subject: [PATCH 4/7] set -o pipefail --- tasks/remaining_nodes.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/tasks/remaining_nodes.yml b/tasks/remaining_nodes.yml index 25a90367..31cef519 100644 --- a/tasks/remaining_nodes.yml +++ b/tasks/remaining_nodes.yml @@ -51,6 +51,7 @@ - name: Get list of existing node secrets when: inventory_hostname is active_server ansible.builtin.shell: | + set -o pipefail {{ rke2_data_path }}/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml \ get secrets -n kube-system -o jsonpath='{.items[*].metadata.name}' | tr ' ' '\n' | grep -E 'node-password\.rke2$' | sed s/.node-password.rke2//g args: From f519b6d1299ead735aa833b077cbf540f7f988b0 Mon Sep 17 00:00:00 2001 From: Michal Gawrys <63045346+michalg91@users.noreply.github.com> Date: Thu, 10 Jul 2025 19:08:37 +0200 Subject: [PATCH 5/7] Revert "remove first_server_restore.yml from main.yml" This reverts commit 8917178aa1d076b2f2f25143bbd28cc392d32f6d. --- tasks/main.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tasks/main.yml b/tasks/main.yml index 72686c2e..5c42807f 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -51,6 +51,13 @@ - inventory_hostname == groups[rke2_servers_group_name].0 - active_server is not defined +- name: Restore etcd specific tasks + ansible.builtin.include_tasks: first_server_restore.yml + when: + - active_server is defined + - inventory_hostname == active_server or inventory_hostname == groups[rke2_servers_group_name].0 + - do_etcd_restore is defined or do_etcd_restore_from_s3 is defined + - name: Prepare and join remaining nodes of the cluster ansible.builtin.include_tasks: remaining_nodes.yml when: From e0615958e01999abd2e8e727e36dfeecad50e57d Mon Sep 17 00:00:00 2001 From: Michal Gawrys <63045346+michalg91@users.noreply.github.com> Date: Thu, 10 Jul 2025 19:27:18 +0200 Subject: [PATCH 6/7] allow some dangerous parts to be executed --- defaults/main.yml | 3 +++ tasks/first_server_restore.yml | 26 +++++++++++++++++++++ tasks/main.yml | 3 ++- tasks/remaining_nodes.yml | 42 ++++++++++++++++++++++------------ 4 files changed, 58 insertions(+), 16 deletions(-) create mode 100644 tasks/first_server_restore.yml diff --git a/defaults/main.yml b/defaults/main.yml index 0f285206..fcd46aea 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -224,6 +224,9 @@ rke2_etcd_snapshot_destination_dir: "{{ rke2_data_path }}/server/db/snapshots" rke2_snapshotter: "{{ rke2_snapshooter }}" rke2_snapshooter: overlayfs # legacy variable that only exists to keep backward compatibility with previous configurations +# when doing restore allow cleanup of old nodes secrets and remove not existing nodes +rke2_cleanup_on_restore: false + # Deploy RKE2 with default CNI canal rke2_cni: [canal] diff --git a/tasks/first_server_restore.yml b/tasks/first_server_restore.yml new file mode 100644 index 00000000..2b0ed940 --- /dev/null +++ b/tasks/first_server_restore.yml @@ -0,0 +1,26 @@ +- name: Restore etcd - get all nodes + ansible.builtin.shell: | + {{ rke2_data_path }}/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml \ + get nodes --no-headers -o custom-columns=":metadata.name" + args: + executable: /bin/bash + changed_when: false + register: registered_node_names + +- name: Restore etcd - cleanup .node-password.rke2 secrets + ansible.builtin.shell: | + {{ rke2_data_path }}/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml \ + delete secret {{ item }}.node-password.rke2 -n kube-system 2>&1 || true + args: + executable: /bin/bash + with_items: "{{ registered_node_names.stdout_lines }}" + when: item != rke2_node_name + +- name: Restore etcd - remove old (not existing) nodes + ansible.builtin.shell: | + {{ rke2_data_path }}/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml \ + delete node {{ item }} 2>&1 || true + args: + executable: /bin/bash + with_items: "{{ registered_node_names.stdout_lines | difference(groups[rke2_cluster_group_name]) }}" + changed_when: false diff --git a/tasks/main.yml b/tasks/main.yml index 5c42807f..50f5dae8 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -51,12 +51,13 @@ - inventory_hostname == groups[rke2_servers_group_name].0 - active_server is not defined -- name: Restore etcd specific tasks +- name: Run cleanup tasks after etcd restore ansible.builtin.include_tasks: first_server_restore.yml when: - active_server is defined - inventory_hostname == active_server or inventory_hostname == groups[rke2_servers_group_name].0 - do_etcd_restore is defined or do_etcd_restore_from_s3 is defined + - rke2_cleanup_on_restore - name: Prepare and join remaining nodes of the cluster ansible.builtin.include_tasks: remaining_nodes.yml diff --git a/tasks/remaining_nodes.yml b/tasks/remaining_nodes.yml index 31cef519..dbf0c553 100644 --- a/tasks/remaining_nodes.yml +++ b/tasks/remaining_nodes.yml @@ -46,32 +46,44 @@ when: (rke2_custom_registry_mirrors | length > 0 or rke2_custom_registry_configs | length > 0) notify: "Config file changed" -- name: Fail if the cluster already has a .node-password.rke2 secret and the node doesn't have a password file +- name: Get current nodes secrets + delegate_to: "{{ active_server }}" + run_once: true block: - name: Get list of existing node secrets - when: inventory_hostname is active_server ansible.builtin.shell: | set -o pipefail - {{ rke2_data_path }}/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml \ - get secrets -n kube-system -o jsonpath='{.items[*].metadata.name}' | tr ' ' '\n' | grep -E 'node-password\.rke2$' | sed s/.node-password.rke2//g + "{{ rke2_data_path }}/bin/kubectl" --kubeconfig /etc/rancher/rke2/rke2.yaml \ + get secrets -n kube-system -o jsonpath="{.items[*].metadata.name}" | tr ' ' '\n' | grep -E 'node-password\.rke2$' | sed 's/\.node-password\.rke2//g' args: executable: /bin/bash register: nodes_with_passwords # A node name on each line - name: Set fact for existing node passwords ansible.builtin.set_fact: nodes_with_existing_passwords: "{{ nodes_with_passwords.stdout_lines }}" - when: inventory_hostname is active_server - - name: Warn if the node password already exists and /etc/rancher/node/password does not exist on the node + + +- name: Validate presence of node password file when secret exists + block: + - name: Register if rke2 password file exists + ansible.builtin.stat: + path: /etc/rancher/node/password + register: node_password_file + + - name: Fail if the cluster already has a .node-password.rke2 secret and the node doesn't have a password file ansible.builtin.fail: - msg: "The node password secret already exists for node name {{ rke2_node_name }}, but no password file exists in /etc/rancher/node/password!\n\ - The node will not be able to join the cluster with this node name without a password file matching the secret.\n\n\ - This can happen for a few reasons:\n\ - - The node was previously part of the cluster and RKE2 was removed without running `kubectl delete node {{ rke2_node_name }}`.\n\ - - The cluster etcd was restored from a backup from before the node was correctly removed from the cluster.\n\n\ - To join this node, please recreate the file with the password, use a different node name (rke2_node_name), or remove the secret from the cluster using:\n\ - kubectl delete secret {{ rke2_node_name}}.node-password.rke2 -n kube-system" - when: rke2_node_name in nodes_with_existing_passwords - and not ansible.builtin.stat(path="/etc/rancher/node/password").stat.exists + msg: | + The node password secret already exists for node name {{ rke2_node_name }}, but no password file exists in /etc/rancher/node/password! + The node will not be able to join the cluster with this node name without a password file matching the secret. + This can happen for a few reasons: + - The node was previously part of the cluster and RKE2 was removed without running `kubectl delete node {{ rke2_node_name }}`. + - The cluster etcd was restored from a backup from before the node was correctly removed from the cluster. + + To join this node, please recreate the file with the password, use a different node name (rke2_node_name), or remove the secret from the cluster using: + kubectl delete secret {{ rke2_node_name}}.node-password.rke2 -n kube-system + when: + - rke2_node_name in nodes_with_existing_passwords + - not node_password_file.stat.exists - name: Start RKE2 service on the rest of the nodes ansible.builtin.systemd: From b619d096c5175766a6f2f93b1f24a6d4e3d1ca24 Mon Sep 17 00:00:00 2001 From: Michal Gawrys <63045346+michalg91@users.noreply.github.com> Date: Tue, 15 Jul 2025 12:05:52 +0200 Subject: [PATCH 7/7] add changed_when:false to get current nodes secrets --- tasks/remaining_nodes.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/tasks/remaining_nodes.yml b/tasks/remaining_nodes.yml index dbf0c553..3f74ca92 100644 --- a/tasks/remaining_nodes.yml +++ b/tasks/remaining_nodes.yml @@ -58,6 +58,7 @@ args: executable: /bin/bash register: nodes_with_passwords # A node name on each line + changed_when: false - name: Set fact for existing node passwords ansible.builtin.set_fact: nodes_with_existing_passwords: "{{ nodes_with_passwords.stdout_lines }}"