diff --git a/HA_Design/create_backup.sh b/HA_Design/create_backup.sh
new file mode 100644
index 0000000..388080e
--- /dev/null
+++ b/HA_Design/create_backup.sh
@@ -0,0 +1,169 @@
+#!/bin/sh
+#
+# Create a leap + eos-evm backup on the backup VM.
+#
+# Flow: verify eos-evm-rpc and nodeos respond, stop eos-evm-node/rpc, request
+# a nodeos snapshot, stop nodeos, archive the EVM chain-data and the leap data
+# directories, restart nodeos and eos-evm, then publish the archives and the
+# snapshot under /home/ubuntu.
+#
+# Exit codes:
+#   1  eos-evm-rpc not reachable, or its response contains "error"
+#   2  "cleos get info" failed before the backup, or rpc still answers after pkill
+#   3  nodeos still answers after pkill
+#   4  the snapshots directory is empty after the backup
+#   5  could not cd into /home/ubuntu
+#   6  tar failed, the temporary archives are removed and nothing is published
+
+PATH=/usr/bin:/sbin:; export PATH
+
+date=$(date '+%Y_%m_%d_%H_%M_%S')
+
+echo "=== starting backup: now is $date ==="
+
+name="backup_$date.tgz"
+tmp_name="backup_$date.tmp.tgz"
+name2="backup_$date.evm.tgz"
+tmp_name2="backup_$date.evm.tmp.tgz"
+
+LEAP_BASE=/home/ubuntu/leap
+rpc_url="127.0.0.1:8881/"
+rpc_probe='{"method":"eth_blockNumber","params":["0x1",false],"id":0}'
+
+curl --location --request POST "$rpc_url" --header "Content-Type: application/json" --data-raw "$rpc_probe"
+r=$?
+
+if [ "$r" -eq 0 ]
+then
+  echo "rpc seems responding"
+else
+  echo "OH! rpc not working!!!"
+  exit 1
+fi
+
+echo "=== try to grep error from rpc response ==="
+r=$(curl --location --request POST "$rpc_url" --header "Content-Type: application/json" --data-raw "$rpc_probe" | grep -c error)
+
+if [ "$r" -eq 0 ]
+then
+  echo "rpc seems working fine without error"
+else
+  echo "OH! rpc response has error!!!"
+  exit 1
+fi
+
+"${LEAP_BASE}/cleos" get info
+r=$?
+
+if [ "$r" -eq 0 ]
+then
+  echo "cleos get info seems working fine"
+else
+  echo "OH! cleos get info not working!!!"
+  exit 2
+fi
+
+echo "stop eos-evm node & rpc"
+pkill eos-evm-node
+pkill eos-evm-rpc
+sleep 2
+
+echo "create leap snapshot..."
+# start from an empty directory so the backup holds only the new snapshot
+rm -rf "${LEAP_BASE:?}"/data-dir/snapshots/*
+curl http://127.0.0.1:8888/v1/producer/create_snapshot
+
+echo "snapshot created.. stop nodeos"
+pkill nodeos
+sleep 30
+
+# both services must be down before their data directories are archived
+curl --location --request POST "$rpc_url" --header "Content-Type: application/json" --data-raw "$rpc_probe"
+r=$?
+
+if [ "$r" -eq 0 ]
+then
+  echo "rpc not killed!!!"
+  exit 2
+fi
+
+"${LEAP_BASE}/cleos" get info
+r=$?
+if [ "$r" -eq 0 ]
+then
+  echo "nodeos not killed!!!"
+  exit 3
+fi
+
+cd /home/ubuntu || exit 5
+# create the directory before listing it so the first run does not error out
+mkdir -p backups
+# "ls -l" prints a leading "total" line, so this count is entries + 1
+r=$(ls -l ./backups/ | wc -l)
+if [ "$r" -gt 6 ]
+then
+  # remove old backups first to save space
+  find ./backups/ -mtime +1 -type f -delete
+  echo "removed old backups"
+fi
+rm -f "${LEAP_BASE}/nodeos.log"
+rm -f /home/ubuntu/node/eos-evm/node.log
+
+# remember a tar failure instead of stopping here: the services must be
+# restarted either way, and a broken archive must not be published afterwards
+backup_ok=1
+tar zcvf "backups/$tmp_name2" ./node/eos-evm/chain-data || backup_ok=0
+tar zcvf "backups/$tmp_name" ./leap/data-dir/state-history ./leap/data-dir/blocks ./leap/data-dir/snapshots ./leap/data-dir/protocol_features || backup_ok=0
+
+# each service is started from its own directory in a background subshell,
+# so this script itself stays in /home/ubuntu
+echo "now bring back nodeos"
+cd "${LEAP_BASE}" && ./start.sh > nodeos.log 2>&1 &
+sleep 30
+
+echo "now bring back eos-evm"
+cd /home/ubuntu/node/eos-evm && ./start_evm_node.sh > node.log 2>&1 &
+cd /home/ubuntu/node/eos-evm && ./start_rpc.sh > rpc.log 2>&1 &
+
+if [ "$backup_ok" -ne 1 ]
+then
+  echo "OH! tar failed, incomplete backup files removed!!!"
+  rm -f "backups/$tmp_name" "backups/$tmp_name2"
+  exit 6
+fi
+
+mv "backups/$tmp_name" "backups/$name"
+mv "backups/$tmp_name2" "backups/$name2"
+
+ln -sf "backups/$name" ./last_full_backup.tgz
+ln -sf "backups/$name2" ./last_evm_backup.tgz
+
+echo "backup files $name & $name2 created successfully!"
+
+r=$(ls -l ./backups/ | wc -l)
+if [ "$r" -gt 6 ]
+then
+  find ./backups/ -mtime +1 -type f -delete
+  echo "removed old backups"
+fi
+
+if [ -z "$(ls -A "${LEAP_BASE}/data-dir/snapshots")" ]
+then
+  echo "snapshot dir ${LEAP_BASE}/data-dir/snapshots contain no files, please fix snapshot"
+  exit 4
+else
+  rm -rf /home/ubuntu/snapshots.old
+  # nothing to rotate on the first run
+  if [ -d /home/ubuntu/snapshots ]
+  then
+    mv /home/ubuntu/snapshots /home/ubuntu/snapshots.old
+  fi
+  mv "${LEAP_BASE}/data-dir/snapshots" /home/ubuntu/snapshots
+  # the move took the directory out of the data-dir, so put an empty one back
+  mkdir -p "${LEAP_BASE}/data-dir/snapshots"
+  # expects exactly one snapshot file, since the directory was emptied before
+  # the snapshot was requested
+  ln -sf ./snapshots/* last_snapshot.bin
+  echo "snapshot created at /home/ubuntu/snapshots"
+fi
+exit 0
+
diff --git a/HA_Design/readme.md b/HA_Design/readme.md
new file mode 100644
index 0000000..6a15cb9
--- /dev/null
+++ b/HA_Design/readme.md
@@ -0,0 +1,180 @@
+# High availability design for EOS EVM Infrastructure
+
+This document will describe how to setup an EOS EVM infrastructure with high availability.
+
+## Prerequisite: setup a minimum EOS EVM service infrastructure
+This is a minimum EOS EVM infrastructure setup without high availability support.
Follow the steps from https://github.com/eosnetworkfoundation/evm-public-docs/tree/main/deployments_plan_for_CEX#RMS to setup this infrasture step by step. +``` +Real-time service: + +--VM1 (leap VM) -------------------+ +-- VM2 (EVM node VM) -------+ + | leap node running in head mode | <-- | eos-evm-node & eos-evm-rpc | <-- read requests + | with state_history_plugin enabled | +----------------------------+ \ +------ VM2 -------+ + +-----------------------------------+ ---- | proxy | + ^ / | web-socket-proxy | + | +-- VM2 (EVM node VM) ----+ / +------------------+ + \-- push EOS ---| eos-evm-miner (wrapper) | <-- write requests + transactions +-------------------------+ + +Periodic Backup service (no need to scale): + +--VM3 (Backup VM) ------------------------+ +--VM3 (Backup VM) ----------+ + | leap node running in irreversible mode | <----- | eos-evm-node & eos-evm-rpc | + | with state_history_plugin enabled | +----------------------------+ + +------------------------------------------+ +``` + +## High availability design step 1: deploy leap nodes to multiple VMs +We first scale the leap node (in the real-time service) from 1 leap VM instance into 2 or more leap VM instance in the same region. +``` +Real-time service: + +-- VM11 ----------+ + | leap node | <--- + +------------------+ \ +---- VM2 -----------------+ + select the available leap to connect --| eos-evm-node, rpc, miner | + +-- VM12 ----------+ / | proxy, web-socket-proxy | + | leap node | <--- +--------------------------+ + +------------------+ +``` +We can use the `get_info` request via a script to find out the available leap node to connect/reconnect. + + +## High availability design step 2: deploy eos-evm-node, rpc, miner, proxy and other services to multiple VMs +We then scale up the number of eos-evm-node VM instances from 1 to 2 or even more. Each of them will independently detect and select the available leap node to connect with. 
+``` +Real-time service: + +----- VM21 -------------+ + /| eos-evm-node, rpc, ... | + +-- VM11 ----------+ / +------------------------+ + | leap node | <------\ / + +------------------+ \ VM21, VM22, VM23 independently / +----- VM22 -------------+ + select the available leap to connect ---| eos-evm-node, rpc, ... | + +-- VM12 ----------+ / \ +------------------------+ + | leap node | <------/ \ + +------------------+ \ +----- VM23 -------------+ + \ | eos-evm-node, rpc, ... | + +------------------------+ +``` + + +## High availability design step 3: Using script & pm2 service to manage leap node process +In order to make sure all leap node will be running all the time, we need some auto restart & recover script so that it will: + +- 1. detect if there's already a running leap (nodeos) process +- 2. try start leap process normally. +- 3. if leap start fails, clean up the state, recover the state via snapshot generated from backup VM, and restart leap process with snapshot + +[This is the template for leap's start.sh script](start_leap.sh)
+ +We also need to use the pm2 service to run the above script as a service.
+ + +## High availability design step 4: Using script & pm2 service to manage eos-evm-node, rpc, miner, proxy.. +We also need to use a script `start_evm_node.sh` to auto restart & recover eos-evm-node. The script will do:
+ +- 1. detect which leap node is available +- 2. try to start eos-evm-node normally, connecting to the state-history-plugin endpoint of the available leap node +- 3. if eos-evm-node start fails, clean up evm-node database, download the evm backup from backup VM, try step 2 one more time. + +[This is the template for start_evm_node.sh script](start_evm_node.sh)
+ + +we also need to use pm2 service to run the script as a service. for example:
+`cd eos-evm && pm2 start start_evm_node.sh -l node.log --name evm_node --kill-timeout 10000` + +use pm2 to run eos-evm-rpc. for example:
+`cd eos-evm && pm2 start start_rpc.sh -l rpc.log --name evm_rpc1 --kill-timeout 10000`
+in which start_rpc.sh is:
+`./eos-evm-rpc --api-spec=eth,debug,net,trace --chain-id=17777 --http-port=0.0.0.0:8881 --eos-evm-node=127.0.0.1:8080 --chaindata=./chain-data` + +use pm2 to run miner. for example:
+`cd tx_wrapper && pm2 start index.js -l wrapper.log --name tx_wrapper --kill-timeout 10000` + +use docker to run proxy when VM starts. +``` +cd tx_proxy +sudo mkdir -p logs +sudo mkdir -p logs/error +sudo docker run --add-host=host.docker.internal:host-gateway -p 80:80 -v ${PWD}/logs:/var/log/nginx -d --restart=always --name=tx_proxy evm/tx_proxy +sudo docker restart tx_proxy +``` +see https://github.com/eosnetworkfoundation/eos-evm-node/tree/main/peripherals/proxy for more details.
+ + +use pm2 to run web-socket-proxy. for example:
+``` +cd eos-evm-ws-proxy +WS_LISTENING_HOST=0.0.0.0 pm2 start main.js -l ws_proxy.log --name ws_proxy --kill-timeout 10000 --update-env +``` +see https://github.com/eosnetworkfoundation/eos-evm-node/tree/main/peripherals/eos-evm-ws-proxy for more details.
+ + +
+ +## High availability design step 5 (Optional): scale up multiple miners in each EVM VM: +To further scale up transactions per second, we may also consider scale up multiple miners in each evm machine: +``` +Real-time service: + +--VM1 (leap VM) -------------------+ +-- VM2 (EVM node VM) -------+ + | leap node running in head mode | <-- | eos-evm-node & eos-evm-rpc | <-- read requests + | with state_history_plugin enabled | +----------------------------+ \ +------ VM2 -------+ + +-----------------------------------+ ---- | proxy | + ^ / | web-socket-proxy | + | +-- VM2 (EVM node VM) ----+ / +------------------+ + \-- push EOS ---| eos-evm-miner1 | <-- write requests + transactions | eos-evm-miner2 | + | eos-evm-miner3 | + | eos-evm-miner4 | + +-------------------------+ +``` +This can be easily done by appending `-i 4` into the pm2 command: +``` +pm2 start ./dist/index.js --name evm-miner -l miner.log --name evm-miner --kill-timeout 10000 -i 4 +``` + +## High availability design step 6: setup the same infrastructure on a second region +We further scale the infrastructure from 1 region to 2 or multiple regions.
+In each region, we can set up a target group for load balancing the traffic between multiple evm-nodes.
+And finally, set up a global DNS load balancer to split the traffic between different regions according to their geographical locations.
+ +However, the backup service is not required to be scaled. +``` + +---- Region 1 (Real-time service) -------------------------- + + | VM11 (leap) | + | VM12 (leap) | + | VM21 (evm-node, rpc, miners, proxy,...)-\ | + | - target group1 | <----\ + | VM22 (evm-node, rpc, miners, proxy,...)-/ | \ + +-------------------------------------------------------------+ \--- Global DNS Load balancer + / + +---- Region 2 (Real-time service) ---------------------------+ / + | VM11 (leap) | <----/ + | VM12 (leap) | + | VM21 (evm-node, rpc, miners, proxy,...)-\ | + | - target group1 | + | VM22 (evm-node, rpc, miners, proxy,...)-/ | + +-------------------------------------------------------------+ + + +--- Backup VM in region 1 --------+ + | leap (backup, irreversible mode) | + | evm-node, rpc | + +----------------------------------+ +``` + + + +## Generate leap & evm backup periodically +Here are the steps to generate leap backup and EVM backup:
+
+- 1. ensure leap & eos-evm-node are up
+- 2. gracefully stop eos-evm-node & eos-evm-rpc
+- 3. create leap snapshot
+- 4. gracefully stop leap
+- 5. backup evm chain-data folder
+- 6. backup leap's snapshot, state_history, block logs
+- 7. bring up leap
+- 8. bring up eos-evm
+- 9. remove old backups
+
+[This sample script can be used in backup VM to create leap & evm backup](create_backup.sh) Each time a backup is generated, both the leap node and the evm node need to be gracefully shut down.
+
+
+
diff --git a/HA_Design/start_evm_node.sh b/HA_Design/start_evm_node.sh
new file mode 100644
index 0000000..889c242
--- /dev/null
+++ b/HA_Design/start_evm_node.sh
@@ -0,0 +1,73 @@
+#!/bin/sh
+#
+# Start eos-evm-node against the first leap node that answers get_info.
+# If eos-evm-node exits within 5 seconds, download the latest EVM backup from
+# the backup VM, replace ./chain-data with it and start eos-evm-node again.
+#
+# Exits with 1 when no leap node is reachable or the backup download fails.
+PATH=/usr/bin:/sbin:; export PATH
+
+curpwd=${PWD}
+
+# fill in before use: the two leap hosts, the backup VM and the ssh key for it
+leap1=
+leap2=
+backupip=
+backup_sshkeyfile=
+
+date=$(date '+%Y_%m_%d_%H_%M_%S')
+
+# bounded probes, so an unreachable leap VM cannot stall the failover
+curl --connect-timeout 5 --max-time 10 "http://$leap1:8888/v1/chain/get_info"
+r1=$?
+curl --connect-timeout 5 --max-time 10 "http://$leap2:8888/v1/chain/get_info"
+r2=$?
+
+if [ "$r1" -eq 0 ]
+then
+  url=$leap1
+elif [ "$r2" -eq 0 ]
+then
+  url=$leap2
+else
+  echo "no leap connection available!!!"
+  exit 1
+fi
+
+sleep 5
+
+# keep the eos-evm-node options in the positional parameters so that both
+# start attempts use exactly the same command line
+set -- --ship-endpoint="$url:8999" --ship-core-account eosio.evm --chain-data ./chain-data --plugin block_conversion_plugin --plugin blockchain_plugin --nocolor 1 --verbosity=4 --genesis-json=./genesis.json
+
+sec1=$(date +%s)
+./eos-evm-node "$@"
+sec2=$(date +%s)
+diff=$((sec2-sec1))
+
+# a run shorter than 5 seconds is treated as a failed start
+if [ "$diff" -lt 5 ]
+then
+  echo "=== failed to start eos-evm-node, try to restore from backup from $backupip==="
+  # wait for port 8080 released
+  sleep 60
+  scp -i "$backup_sshkeyfile" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "ubuntu@$backupip:~/last_evm_backup.tgz" ~/
+  r=$?
+  if [ "$r" -ne 0 ]
+  then
+    # without a fresh archive the current chain-data is left untouched
+    echo "=== failed to download evm backup from $backupip ==="
+    exit 1
+  fi
+  mv ./chain-data "./chain-data.$date"
+  mkdir ./chain-data
+  # create_backup.sh archives ./node/eos-evm/chain-data relative to
+  # /home/ubuntu, so the archive is unpacked from the home directory
+  cd ~/ || exit 1
+  tar zxvf ~/last_evm_backup.tgz
+  echo "=== killing eos-evm-rpc process & start eos-evm-node again ==="
+  pkill eos-evm-rpc
+  cd "$curpwd" || exit 1
+  ./eos-evm-node "$@"
+fi
+
diff --git a/HA_Design/start_leap.sh b/HA_Design/start_leap.sh
new file mode 100644
index 0000000..d97a054
--- /dev/null
+++ b/HA_Design/start_leap.sh
@@ -0,0 +1,57 @@
+#!/bin/sh
+#
+# Start nodeos unless one is already answering "cleos get info". If nodeos
+# exits within 60 seconds, download the latest snapshot from the backup VM,
+# wipe ./data-dir/state and start nodeos again from that snapshot.
+# Arguments given to this script are appended to the nodeos command line.
+#
+# Exits with 1 when nodeos is already running, the leap directory is missing
+# or the snapshot download fails.
+
+# modify the following parameters if necessary
+backupip=
+backup_sshkeyfile=
+
+sysctl -w kernel.core_pattern=core.leap
+
+cd /home/ubuntu/leap/ || exit 1
+
+./cleos get info
+r=$?
+if [ "$r" -eq 0 ]
+then
+  echo "=== seems nodeos is running already ==="
+  # avoid this script frequently being called
+  sleep 30
+  exit 1
+fi
+
+# Command run by "sudo sh -c". It is single-quoted and ends in "$@" so the
+# inner shell receives extra options as real positional parameters. Expanding
+# $@ inside a double-quoted command string would put only the first argument
+# into the command and hand the remaining ones to sh as $0, $1, ...
+nodeos_cmd='ulimit -c unlimited && ulimit -n 30000 && ulimit -s 64000 && ./nodeos --p2p-accept-transactions=0 --database-map-mode=locked --data-dir=./data-dir --config-dir=./data-dir --http-max-response-time-ms=1000 --disable-replay-opts --max-body-size=10000000 "$@"'
+
+sec1=$(date +%s)
+sudo sh -c "$nodeos_cmd" sh "$@"
+sec2=$(date +%s)
+diff=$((sec2-sec1))
+
+# a run shorter than 60 seconds is treated as a failed start
+if [ "$diff" -lt 60 ]
+then
+  echo "=== failed to start nodeos, try to recover from snapshot file ==="
+  sudo pkill -9 nodeos
+  sleep 10
+  scp -i "$backup_sshkeyfile" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "ubuntu@$backupip:~/last_snapshot.bin" /home/ubuntu/last_snapshot.bin
+  r=$?
+  if [ "$r" -ne 0 ]
+  then
+    # without a fresh snapshot the existing state is left untouched
+    echo "=== failed to download snapshot from $backupip ==="
+    exit 1
+  fi
+  sudo rm -rf ./data-dir/state/*
+  sudo sh -c "$nodeos_cmd" sh --snapshot /home/ubuntu/last_snapshot.bin "$@"
+fi
+