Skip to content

Commit 7b64e99

Browse files
Ensure that end-of-private-recovery service open transaction is written at-most-once (#6926)
Co-authored-by: Amaury Chamayou <[email protected]>
1 parent 1d701c0 commit 7b64e99

File tree

5 files changed

+168
-3
lines changed

5 files changed

+168
-3
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
77

88
## [6.0.0]
99

10-
[6.0.0]: https://github.com/microsoft/CCF/releases/tag/6.0.0
10+
[6.0.0]: https://github.com/microsoft/CCF/releases/tag/ccf-6.0.0
1111

1212
### Important
1313

@@ -134,6 +134,7 @@ It also adds COSE_Sign1 ledger signatures, to support the generation of COSE rec
134134
- Fixed thread-safety issues when CCF nodes attempted to contact non-TLS servers. This previously could cause errors when running SNP builds with multiple worker threads (#6836).
135135
- SNP nodes will no longer crash when run on firmware returning v3 attestations (#6841).
136136
- Fixed potential races in indexing interfaces and `LazyStrategy` (#6886).
137+
- Fixed a bug which could produce an invalid secret chain (containing duplicate ledger secret sealing entries) in the ledger if an election occurred during private recovery (#6926).
137138

138139
### Removed
139140

Diff for: scripts/setup-ci.sh

+1
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ chmod +x /opt/pebble/pebble_linux-amd64 /opt/pebble/pebble-challtestsrv_linux-am
5959

6060
# partitions test
6161
tdnf -y install iptables
62+
tdnf -y install strace
6263

6364
# For packaging
6465
tdnf -y install rpm-build

Diff for: src/node/node_state.h

+27-1
Original file line numberDiff line numberDiff line change
@@ -1216,7 +1216,10 @@ namespace ccf
12161216
{
12171217
auto entry = ::consensus::LedgerEnclave::get_entry(data, size);
12181218

1219-
LOG_INFO_FMT("Deserialising private ledger entry [{}]", entry.size());
1219+
LOG_INFO_FMT(
1220+
"Deserialising private ledger entry {} [{}]",
1221+
last_recovered_idx + 1,
1222+
entry.size());
12201223

12211224
// When reading the private ledger, deserialise in the recovery store
12221225
ccf::kv::ApplyResult result = ccf::kv::ApplyResult::FAIL;
@@ -1309,6 +1312,29 @@ namespace ccf
13091312

13101313
auto tx = network.tables->create_tx();
13111314

1315+
{
1316+
// Ensure this transition happens at-most-once, by checking that no
1317+
// other node has already advanced the state
1318+
auto service = tx.ro<ccf::Service>(Tables::SERVICE);
1319+
auto active_service = service->get();
1320+
1321+
if (!active_service.has_value())
1322+
{
1323+
throw std::logic_error(fmt::format(
1324+
"Error in {}: no value in {}", __func__, Tables::SERVICE));
1325+
}
1326+
1327+
if (
1328+
active_service->status !=
1329+
ServiceStatus::WAITING_FOR_RECOVERY_SHARES)
1330+
{
1331+
throw std::logic_error(fmt::format(
1332+
"Error in {}: current service status is {}",
1333+
__func__,
1334+
active_service->status));
1335+
}
1336+
}
1337+
13121338
// Clear recovery shares that were submitted to initiate the recovery
13131339
// procedure
13141340
ShareManager::clear_submitted_recovery_shares(tx);

Diff for: tests/infra/network.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1698,7 +1698,7 @@ def save_service_identity(self, args):
16981698
with open(previous_identity, "w", encoding="utf-8") as f:
16991699
f.write(current_ident)
17001700
args.previous_service_identity_file = previous_identity
1701-
return args
1701+
return current_ident
17021702

17031703
def identity(self, name=None):
17041704
if name is not None:

Diff for: tests/partitions_test.py

+137
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import contextlib
1818
import ccf.ledger
1919
from reconfiguration import test_ledger_invariants
20+
import subprocess
2021

2122
from loguru import logger as LOG
2223

@@ -711,6 +712,141 @@ def wait_for_new_view(node, original_view, timeout_multiplier):
711712
return network
712713

713714

715+
@reqs.supports_methods("/app/log/public")
716+
def test_recovery_elections(orig_network, args):
717+
# Ensure we have 3 nodes
718+
original_size = orig_network.resize(3, args)
719+
720+
old_primary, _ = orig_network.find_nodes()
721+
with old_primary.client("user0") as c:
722+
LOG.warning("Writing some initial state")
723+
for _ in range(300):
724+
r = c.post(
725+
"/app/log/public",
726+
{
727+
"id": 42,
728+
"msg": "Uninteresting recoverable transactions",
729+
},
730+
)
731+
assert r.status_code == 200, r
732+
733+
r = c.get("/node/network")
734+
assert r.status_code == 200, r
735+
previous_identity = orig_network.save_service_identity(args)
736+
c.wait_for_commit(
737+
orig_network.consortium.set_recovery_threshold(old_primary, 1)
738+
)
739+
orig_network.stop_all_nodes(skip_verification=True)
740+
current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()
741+
742+
# Create a recovery network, where we will manually take the recovery steps (transition to open and submit share)
743+
network = infra.network.Network(
744+
args.nodes,
745+
args.binary_dir,
746+
args.debug_nodes,
747+
args.perf_nodes,
748+
existing_network=orig_network,
749+
)
750+
network.start_in_recovery(
751+
args,
752+
ledger_dir=current_ledger_dir,
753+
committed_ledger_dirs=committed_ledger_dirs,
754+
)
755+
new_primary, new_backups = network.find_nodes()
756+
network.consortium.transition_service_to_open(
757+
new_primary, previous_service_identity=previous_identity
758+
)
759+
760+
with new_primary.client("user0") as c:
761+
previous_identity = network.save_service_identity(args)
762+
763+
member = network.consortium.get_active_recovery_participants()[0]
764+
765+
# We need to delay a backup's private recovery process until:
766+
# - The primary has completed its private recovery, and fully opened the network
767+
# - The backup has called and won an election
768+
# So that the backup node _is primary_ at the point it completes private recovery.
769+
# We force the delay by injecting a delay into the file operations of the backup,
770+
# and force an election (after the primary has completed its recovery) by killing
771+
# the original primary node.
772+
backup = new_backups[0]
773+
LOG.info(f"Using strace to inject delays in file IO of {backup}")
774+
assert not backup.remote.check_done()
775+
776+
strace_command = [
777+
"strace",
778+
f"--attach={backup.remote.remote.proc.pid}",
779+
"--inject=lseek:delay_exit=10s",
780+
"-tt",
781+
"--trace=lseek,read,open,openat",
782+
"--decode-fds=all",
783+
"--output=strace_output.txt",
784+
]
785+
LOG.warning(f"About to run strace: {strace_command}")
786+
strace_process = subprocess.Popen(
787+
strace_command,
788+
stdout=subprocess.PIPE,
789+
stderr=subprocess.PIPE,
790+
)
791+
792+
member.get_and_submit_recovery_share(new_primary)
793+
network.recovery_count += 1
794+
795+
LOG.info("Confirming that primary completes private recovery")
796+
network.wait_for_state(
797+
new_primary,
798+
infra.node.State.PART_OF_NETWORK.value,
799+
timeout=30,
800+
)
801+
802+
election_s = args.election_timeout_ms / 1000
803+
LOG.info(
804+
f"Holding backup stalled via strace for {election_s}, to trigger an election"
805+
)
806+
time.sleep(election_s)
807+
808+
LOG.info("Ending strace, and terminating primary node")
809+
strace_process.terminate()
810+
strace_process.communicate()
811+
812+
new_primary.stop()
813+
814+
LOG.info(
815+
f"Give {backup} time to finish its recovery (including becoming primary), and confirm that it dies in the process"
816+
)
817+
time.sleep(election_s)
818+
# The result of all of that is that this node, which had become primary while it
819+
# completed its private recovery, crashed at the end of recovery (rather than)
820+
# producing an invalid ledger)
821+
assert backup.remote.check_done()
822+
823+
network.ignore_errors_on_shutdown()
824+
network.stop_all_nodes(skip_verification=True)
825+
current_ledger_dir, committed_ledger_dirs = backup.get_ledger()
826+
827+
LOG.info(
828+
"Trying a further recovery, to confirm that the ledger is in a recoverable state"
829+
)
830+
recovery_network = infra.network.Network(
831+
args.nodes,
832+
args.binary_dir,
833+
args.debug_nodes,
834+
args.perf_nodes,
835+
existing_network=network,
836+
)
837+
recovery_network.start_in_recovery(
838+
args,
839+
ledger_dir=current_ledger_dir,
840+
committed_ledger_dirs=committed_ledger_dirs,
841+
)
842+
recovery_network.recover(args)
843+
844+
# Restore original network size
845+
recovery_network.resize(original_size, args)
846+
847+
return recovery_network
848+
849+
714850
def run(args):
715851
txs = app.LoggingTxs("user0")
716852

@@ -737,6 +873,7 @@ def run(args):
737873
# HTTP2 doesn't support forwarding
738874
if not args.http2:
739875
test_session_consistency(network, args)
876+
network = test_recovery_elections(network, args)
740877
test_ledger_invariants(network, args)
741878

742879

0 commit comments

Comments
 (0)