Commit 5d8927a

feature(scale_test): new scale test
1 parent: 8ded9b5

9 files changed (+331, -0 lines)

data_dir/templated_100_table.yaml

Lines changed: 62 additions & 0 deletions
### DML ###

# Keyspace Name
keyspace: testing_keyspaces

# The CQL for creating a keyspace (optional if it already exists)
keyspace_definition: |
  CREATE KEYSPACE testing_keyspaces WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3} AND durable_writes = true;

# Table name
table: ${table_name}

# The CQL for creating a table you wish to stress (optional if it already exists)
table_definition: |
  CREATE TABLE testing_keyspaces.${table_name} (
      key1 bigint,
      key2 text,
      clustering1 bigint,
      clustering2 timeuuid,
      column1 text,
      column2 int,
      PRIMARY KEY ((key1, key2), clustering1, clustering2)
  ) WITH bloom_filter_fp_chance = 0.01
      AND caching = {'keys': 'ALL', 'rows_per_partition': 'ALL'}
      AND comment = ''
      AND compaction = {'class': 'SizeTieredCompactionStrategy'}
      AND compression = {}
      AND crc_check_chance = 1.0
      AND dclocal_read_repair_chance = 0.1
      AND default_time_to_live = 0
      AND gc_grace_seconds = 864000
      AND max_index_interval = 2048
      AND memtable_flush_period_in_ms = 0
      AND min_index_interval = 128
      AND read_repair_chance = 0.0
      AND speculative_retry = '99.0PERCENTILE';

# extra_definitions:
#   - CREATE INDEX IF NOT EXISTS ${table_name}_field4_${table_name} ON feeds.${table_name} (field4);

# ### Column Distribution Specifications ###

# ### Batch Ratio Distribution Specifications ###
# insert:
#   partitions: fixed(1)
#   select: fixed(1)/1000
#   batchtype: UNLOGGED

# #
# # A list of queries you wish to run against the schema
# #
# queries:
#   read1:
#     cql: SELECT * FROM feeds.${table_name} WHERE field1 = ?
#     fields: samerow

# Run stress:
# cassandra-stress user profile={} cl=QUORUM 'ops(insert=1, read1=5)' duration={} -rate threads=2 -errors ignore

# Customer wish (different from what we are using!):
# "INSERT INTO short (k,time,data) values (?,?,?) USING TTL ?"
# "SELECT * FROM short WHERE name = ? AND time >= ? AND time < ?"

defaults/test_default.yaml

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 db_type: "scylla"

 test_duration: 60
+idle_duration: 0
 prepare_stress_duration: 300 # 5 hours
 stress_duration: 0

docs/configuration_options.md

Lines changed: 9 additions & 0 deletions
@@ -42,6 +42,15 @@
 **type:** int


+## **idle_duration** / SCT_IDLE_DURATION
+
+Idle duration (min). Parameter used to run the test without any workload
+
+**default:** N/A
+
+**type:** int
+
+
 ## **prepare_stress_duration** / SCT_PREPARE_STRESS_DURATION

 Time in minutes, which is required to run prepare stress commands<br>defined in prepare_*_cmd for dataset generation, and is used in<br>test duration calculation
Lines changed: 12 additions & 0 deletions
#!groovy

// trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43
def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)

longevityPipeline(
    backend: 'aws',
    region: '''["eu-west-1","eu-west-2"]''',
    availability_zone: 'a,b,c',
    test_name: 'scale_cluster_test.ScaleClusterTest.test_resize_cluster',
    test_config: 'test-cases/scale/scale-multi-dc-100-empty-tables-cluster-resize.yaml',
)
Lines changed: 11 additions & 0 deletions
#!groovy

// trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43
def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)

longevityPipeline(
    backend: 'aws',
    region: 'eu-west-1',
    test_name: 'scale_cluster_test.ScaleClusterTest.test_grow_shrink_cluster',
    test_config: 'test-cases/scale/scale-20-200-20-cluster-resize.yaml',
)

scale_cluster_test.py

Lines changed: 158 additions & 0 deletions
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See LICENSE for more details.
#
# Copyright (c) 2016 ScyllaDB

import time

from longevity_test import LongevityTest
from sdcm.utils.adaptive_timeouts import adaptive_timeout, Operations
from sdcm.utils.cluster_tools import group_nodes_by_dc_idx
from sdcm.sct_events.system import InfoEvent
from sdcm.cluster import MAX_TIME_WAIT_FOR_NEW_NODE_UP


class ScaleClusterTest(LongevityTest):
    @staticmethod
    def is_target_reached(current: list[int], target: list[int]) -> bool:
        return all(x >= y for x, y in zip(current, target))

    def grow_to_cluster_target_size(self, current_cluster_size: list[int], target_cluster_size: list[int]):
        InfoEvent(
            message=f"Starting to grow cluster from {current_cluster_size} to {target_cluster_size}").publish()

        add_node_cnt = self.params.get('add_node_cnt')
        try:
            while not self.is_target_reached(current_cluster_size, target_cluster_size):
                for dcx, target in enumerate(target_cluster_size):
                    if current_cluster_size[dcx] >= target:
                        continue
                    # Add at most add_node_cnt nodes per pass, without overshooting the target
                    add_nodes_num = min(add_node_cnt, target - current_cluster_size[dcx])

                    for rack in range(self.db_cluster.racks_count):
                        added_nodes = []
                        InfoEvent(
                            message=f"Adding {add_nodes_num} node(s) to dc_idx {dcx}, rack {rack}").publish()
                        added_nodes.extend(self.db_cluster.add_nodes(
                            count=add_nodes_num, enable_auto_bootstrap=True, dc_idx=dcx, rack=rack))
                        self.monitors.reconfigure_scylla_monitoring()
                        up_timeout = MAX_TIME_WAIT_FOR_NEW_NODE_UP
                        with adaptive_timeout(Operations.NEW_NODE, node=self.db_cluster.data_nodes[0], timeout=up_timeout):
                            self.db_cluster.wait_for_init(
                                node_list=added_nodes, timeout=up_timeout, check_node_health=False)
                        self.db_cluster.wait_for_nodes_up_and_normal(nodes=added_nodes)
                        InfoEvent(f"New nodes up and normal: {[node.name for node in added_nodes]}").publish()
                nodes_by_dcx = group_nodes_by_dc_idx(self.db_cluster.data_nodes)
                current_cluster_size = [len(nodes_by_dcx[dcx]) for dcx in sorted(nodes_by_dcx)]
        finally:
            nodes_by_dcx = group_nodes_by_dc_idx(self.db_cluster.data_nodes)
            current_cluster_size = [len(nodes_by_dcx[dcx]) for dcx in sorted(nodes_by_dcx)]
            InfoEvent(message=f"Grow cluster finished, cluster size is {current_cluster_size}").publish()

    def shrink_to_cluster_target_size(self, current_cluster_size: list[int], target_cluster_size: list[int]):
        InfoEvent(
            message=f"Starting to shrink cluster from {current_cluster_size} to {target_cluster_size}").publish()
        try:
            nodes_by_dcx = group_nodes_by_dc_idx(self.db_cluster.data_nodes)
            # Arguments are swapped here: the target is reached once it is >= the current size in every DC
            while not self.is_target_reached(target_cluster_size, current_cluster_size):
                for dcx, dc_target in enumerate(target_cluster_size):
                    if current_cluster_size[dcx] <= dc_target:
                        continue  # this DC is already at (or below) its target size
                    nodes_by_racks = self.db_cluster.get_nodes_per_datacenter_and_rack_idx(nodes_by_dcx[dcx])
                    for nodes in nodes_by_racks.values():
                        decommissioning_node = nodes[-1]
                        decommissioning_node.running_nemesis = "Decommissioning node"
                        self.db_cluster.decommission(node=decommissioning_node, timeout=7200)
                nodes_by_dcx = group_nodes_by_dc_idx(self.db_cluster.data_nodes)
                current_cluster_size = [len(nodes_by_dcx[dcx]) for dcx in sorted(nodes_by_dcx)]
        finally:
            nodes_by_dcx = group_nodes_by_dc_idx(self.db_cluster.data_nodes)
            current_cluster_size = [len(nodes_by_dcx[dcx]) for dcx in sorted(nodes_by_dcx)]
            InfoEvent(message=f"Reached cluster size {current_cluster_size}").publish()

    def create_schema(self):
        number_of_table = self.params.get('user_profile_table_count') or 0
        cs_user_profiles = self.params.get('cs_user_profiles')
        keyspace_num = self.params.get('keyspace_num')
        if not number_of_table and not cs_user_profiles:
            self.log.debug("User schema will not be created")
            return
        if not cs_user_profiles:
            region_dc_names = self.db_cluster.get_datacenter_name_per_region(self.db_cluster.nodes)
            replication_factor = self.db_cluster.racks_count
            InfoEvent(f"Creating {keyspace_num} keyspace(s) with {number_of_table} empty tables each").publish()
            for i in range(1, keyspace_num + 1):
                self.create_keyspace(keyspace_name=f"testing_keyspace_{i}", replication_factor={
                    dc_name: replication_factor for dc_name in region_dc_names.values()})
                for j in range(1, number_of_table + 1):
                    self.create_table(name=f"table_{j}", keyspace_name=f"testing_keyspace_{i}")
            InfoEvent(f"{keyspace_num} keyspace(s) and {number_of_table} tables were created").publish()
        else:
            self._pre_create_templated_user_schema()

    def test_grow_target_size_of_empty_cluster(self):
        self.create_schema()
        cluster_target_size = self.params.get('cluster_target_size')
        InfoEvent(f"Start growing cluster up to {cluster_target_size}").publish()
        if not cluster_target_size:
            self.log.error("cluster_target_size param is not set, cannot grow cluster")
            raise ValueError("cluster_target_size param is not set, cannot grow cluster")
        cluster_target_size = list(map(int, cluster_target_size.split())) if isinstance(
            cluster_target_size, str) else [cluster_target_size]
        nodes_by_dcx = group_nodes_by_dc_idx(self.db_cluster.data_nodes)
        current_cluster_size = [len(nodes_by_dcx[dcx]) for dcx in sorted(nodes_by_dcx)]

        self.grow_to_cluster_target_size(current_cluster_size, cluster_target_size)

    def test_shrink_target_size_of_empty_cluster(self):
        self.create_schema()
        cluster_target_size = self.params.get('cluster_target_size')
        InfoEvent(f"Start shrinking cluster to {cluster_target_size}").publish()
        if not cluster_target_size:
            self.log.error("cluster_target_size param is not set, cannot shrink cluster")
            raise ValueError("cluster_target_size param is not set, cannot shrink cluster")
        cluster_target_size = list(map(int, cluster_target_size.split())) if isinstance(
            cluster_target_size, str) else [cluster_target_size]
        nodes_by_dcx = group_nodes_by_dc_idx(self.db_cluster.data_nodes)
        current_cluster_size = [len(nodes_by_dcx[dcx]) for dcx in sorted(nodes_by_dcx)]

        self.shrink_to_cluster_target_size(current_cluster_size, cluster_target_size)

    def test_grow_shrink_cluster(self):
        self.create_schema()
        cluster_target_size = self.params.get('cluster_target_size')
        InfoEvent(f"Start growing cluster up to {cluster_target_size}").publish()
        if not cluster_target_size:
            self.log.error("cluster_target_size param is not set, cannot grow cluster")
            raise ValueError("cluster_target_size param is not set, cannot grow cluster")
        cluster_target_size = list(map(int, cluster_target_size.split())) if isinstance(
            cluster_target_size, str) else [cluster_target_size]
        nodes_by_dcx = group_nodes_by_dc_idx(self.db_cluster.data_nodes)
        init_cluster_size = [len(nodes_by_dcx[dcx]) for dcx in sorted(nodes_by_dcx)]

        try:
            self.grow_to_cluster_target_size(init_cluster_size, cluster_target_size)
        except Exception as ex:  # noqa: BLE001
            self.log.error(f"Failed to grow cluster: {ex}")
        nodes_by_dcx = group_nodes_by_dc_idx(self.db_cluster.data_nodes)
        current_cluster_size = [len(nodes_by_dcx[dcx]) for dcx in sorted(nodes_by_dcx)]
        try:
            InfoEvent(message=f"Cluster size is {current_cluster_size}").publish()
            self.shrink_to_cluster_target_size(current_cluster_size, init_cluster_size)
        except Exception as ex:  # noqa: BLE001
            self.log.error(f"Failed to shrink cluster: {ex}")

    def test_resize_cluster(self):
        self.create_schema()

        self.db_cluster.add_nemesis(nemesis=self.get_nemesis_class(), tester_obj=self)
        self.db_cluster.start_nemesis()
        duration = int(self.params.get('idle_duration'))
        InfoEvent(f"Waiting {duration} minutes while the nemesis resizes the cluster").publish()
        time.sleep(duration * 60)
        InfoEvent("Test done").publish()

sdcm/sct_config.py

Lines changed: 2 additions & 0 deletions
@@ -257,6 +257,8 @@ class SCTConfiguration(dict):
         Test duration (min). Parameter used to keep instances produced by tests
         and for jenkins pipeline timeout and TimoutThread.
         """),
+    dict(name="idle_duration", env="SCT_IDLE_DURATION", type=int,
+         help="""Idle duration (min). Parameter used to run the test without any workload"""),
     dict(name="prepare_stress_duration", env="SCT_PREPARE_STRESS_DURATION", type=int,
          help="""
          Time in minutes, which is required to run prepare stress commands
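
Per the option definition above, the value can also arrive through the SCT_IDLE_DURATION environment variable and is coerced with the declared type. A hypothetical minimal sketch of that env-to-typed-value step (SCT's real SCTConfiguration also merges the defaults file and test-case YAML, and the real option dict carries no 'default' key):

import os

# Hypothetical resolution of a single option dict; the actual SCT lookup
# chain (defaults -> test-case YAML -> environment) is richer than this.
opt = dict(name="idle_duration", env="SCT_IDLE_DURATION", type=int, default=0)

def resolve(option: dict) -> int:
    raw = os.environ.get(option["env"])
    return option["type"](raw) if raw is not None else option["default"]

os.environ["SCT_IDLE_DURATION"] = "180"
print(resolve(opt))  # 180 (minutes)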
test-cases/scale/scale-20-200-20-cluster-resize.yaml

Lines changed: 34 additions & 0 deletions
test_duration: 6000

keyspace_num: 0
user_profile_table_count: 0
add_cs_user_profiles_extra_tables: true

n_loaders: 0
n_db_nodes: 20
add_node_cnt: 1
cluster_target_size: 200

instance_type_db: 'i4i.2xlarge'
instance_type_monitor: 'm6i.xlarge'
root_disk_size_monitor: 120

nemesis_class_name: 'NoOpMonkey'

# This is in order to start the basic cluster faster
use_legacy_cluster_init: false
parallel_node_operations: true
seeds_num: 2
# Takes too long on big clusters
cluster_health_check: false

backtrace_decoding: false

append_scylla_yaml:
  enable_repair_based_node_ops: true

run_fullscan: []

simulated_racks: 0
instance_type_runner: 'c7i.16xlarge'
root_disk_size_runner: 200
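
With n_db_nodes: 20, cluster_target_size: 200, and add_node_cnt: 1, each pass of the grow loop above adds one node per rack per datacenter, so the grow phase takes on the order of 180 add-node operations before the shrink phase walks the cluster back down. A quick sanity check of that arithmetic (one DC and one rack assumed here):

# Back-of-the-envelope pass count for growing one DC from 20 to 200 nodes,
# with add_node_cnt nodes added per rack per pass (1 rack assumed).
import math

n_db_nodes, cluster_target_size, add_node_cnt, racks = 20, 200, 1, 1
passes = math.ceil((cluster_target_size - n_db_nodes) / (add_node_cnt * racks))
print(passes)  # 180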
test-cases/scale/scale-multi-dc-100-empty-tables-cluster-resize.yaml

Lines changed: 42 additions & 0 deletions
test_duration: 4000
idle_duration: 180

# cs_user_profiles:
#   - data_dir/templated_100_table.yaml
user_profile_table_count: 100
add_cs_user_profiles_extra_tables: true
keyspace_num: 1

n_loaders: 0
n_db_nodes: "60 60"
add_node_cnt: 1
round_robin: true

instance_type_db: 'i4i.2xlarge'
instance_type_loader: 'c7i.4xlarge'
instance_type_monitor: 'm6i.xlarge'
root_disk_size_monitor: 120


# decommission 'add_node_cnt' nodes and add the same number back
nemesis_class_name: 'DecommissionMonkey'
# as fast as possible, including health checks
nemesis_interval: 1

# This is in order to start the basic cluster faster
use_legacy_cluster_init: false
parallel_node_operations: true
seeds_num: 5
# Takes too long on big clusters
cluster_health_check: false

backtrace_decoding: true

append_scylla_yaml:
  enable_repair_based_node_ops: true

run_fullscan: []

simulated_racks: 0
instance_type_runner: 'c7i.16xlarge'
root_disk_size_runner: 200
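
Multi-DC sizes in these configs are space-separated strings: n_db_nodes: "60 60" describes two datacenters of 60 nodes each. The scale test above parses cluster_target_size the same way; a minimal sketch mirroring its split() logic:

# Mirrors the parsing in ScaleClusterTest: space-separated strings become
# per-DC integer lists, bare ints become single-DC lists.
def parse_cluster_size(value: int | str) -> list[int]:
    return list(map(int, value.split())) if isinstance(value, str) else [value]

print(parse_cluster_size("60 60"))  # [60, 60]  (one entry per DC)
print(parse_cluster_size(200))      # [200]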
