
Commit 7d0b1d3

Merge pull request #5907 from yeya24/release-1.17.0-rc1
Cherrypick commits to release-1.17 for new RC
2 parents cd3f7c6 + e9704d1

File tree

13 files changed (+130, -89 lines)


CHANGELOG.md

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@
 * [FEATURE] Ruler: Add `ruler.concurrent-evals-enabled` flag to enable concurrent evaluation within a single rule group for independent rules. Maximum concurrency can be configured via `ruler.max-concurrent-evals`. #5766
 * [FEATURE] Distributor Queryable: Experimental: Add config `zone_results_quorum_metadata`. When querying ingesters using metadata APIs such as label names and values, only results from quorum number of zones will be included and merged. #5779
 * [FEATURE] Storage Cache Clients: Add config `set_async_circuit_breaker_config` to utilize the circuit breaker pattern for dynamically thresholding asynchronous set operations. Implemented in both memcached and redis cache clients. #5789
-* [FEATURE] Ruler: Add experimental `experimental.ruler.api-deduplicate-rules` flag to remove duplicate rule groups from the Prometheus compatible rules API endpoint. Add experimental `ruler.ring.replication-factor` and `ruler.ring.zone-awareness-enabled` flags to configure rule group replication, but only the first ruler in the replicaset evaluates the rule group, the rest will just hold a copy as backup. Add experimental `experimental.ruler.api-enable-rules-backup` flag to configure rulers to send the rule group backups stored in the replicaset to handle events when a ruler is down during an API request to list rules. #5782
+* [FEATURE] Ruler: Add experimental `experimental.ruler.api-deduplicate-rules` flag to remove duplicate rule groups from the Prometheus-compatible rules API endpoint. Add experimental `ruler.ring.replication-factor` and `ruler.ring.zone-awareness-enabled` flags to configure rule group replication; only the first ruler in the replica set evaluates the rule group, while the rest serve as backups to handle the case where a ruler is down during an API request to list rules. #5782 #5901
 * [FEATURE] Ring: Add experimental `-ingester.tokens-generator-strategy=minimize-spread` flag to enable the new minimize spread token generator strategy. #5855
 * [FEATURE] Ring Status Page: Add `Ownership Diff From Expected` column in the ring table to indicate the extent to which the ownership of a specific ingester differs from the expected ownership. #5889
 * [ENHANCEMENT] Ingester: Add per-tenant new metric `cortex_ingester_tsdb_data_replay_duration_seconds`. #5477

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-1.17.0-rc.0
+1.17.0-rc.1

docs/configuration/config-file-reference.md

Lines changed: 0 additions & 9 deletions
@@ -4271,15 +4271,6 @@ ring:
 # CLI flag: -experimental.ruler.enable-api
 [enable_api: <boolean> | default = false]

-# EXPERIMENTAL: Enable rulers to store a copy of rules owned by other rulers
-# with default state (state before any evaluation) and send this copy in list
-# API requests as backup in case the ruler who owns the rule fails to send its
-# rules. This allows the rules API to handle ruler outage by returning rules
-# with default state. Ring replication-factor needs to be set to 2 or more for
-# this to be useful.
-# CLI flag: -experimental.ruler.api-enable-rules-backup
-[api_enable_rules_backup: <boolean> | default = false]
-
 # EXPERIMENTAL: Remove duplicate rules in the prometheus rules and alerts API
 # response. If there are duplicate rules the rule with the latest evaluation
 # timestamp will be kept.

integration/ruler_test.go

Lines changed: 4 additions & 5 deletions
@@ -402,7 +402,7 @@ func TestRulerAPIShardingWithAPIRulesBackupEnabled(t *testing.T) {
 	testRulerAPIWithSharding(t, true)
 }

-func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) {
+func testRulerAPIWithSharding(t *testing.T, enableRulesBackup bool) {
 	const numRulesGroups = 100

 	random := rand.New(rand.NewSource(time.Now().UnixNano()))
@@ -459,9 +459,8 @@ func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) {
 		// Enable the bucket index so we can skip the initial bucket scan.
 		"-blocks-storage.bucket-store.bucket-index.enabled": "true",
 	}
-	if enableAPIRulesBackup {
+	if enableRulesBackup {
 		overrides["-ruler.ring.replication-factor"] = "3"
-		overrides["-experimental.ruler.api-enable-rules-backup"] = "true"
 	}
 	rulerFlags := mergeFlags(
 		BlocksStorageFlags(),
@@ -556,8 +555,8 @@ func testRulerAPIWithSharding(t *testing.T, enableAPIRulesBackup bool) {
 		},
 	}
 	// For each test case, fetch the rules with configured filters, and ensure the results match.
-	if enableAPIRulesBackup {
-		err := ruler2.Kill() // if api-enable-rules-backup is enabled the APIs should be able to handle a ruler going down
+	if enableRulesBackup {
+		err := ruler2.Kill() // if rules backup is enabled the APIs should be able to handle a ruler going down
 		require.NoError(t, err)
 	}
 	for name, tc := range testCases {

pkg/distributor/distributor_test.go

Lines changed: 10 additions & 2 deletions
@@ -57,6 +57,14 @@ var (
 	emptyResponse = &cortexpb.WriteResponse{}
 )

+var (
+	randomStrings = []string{}
+)
+
+func init() {
+	randomStrings = util.GenerateRandomStrings()
+}
+
 func TestConfig_Validate(t *testing.T) {
 	t.Parallel()
 	tests := map[string]struct {
@@ -2466,8 +2474,8 @@ func prepare(tb testing.TB, cfg prepConfig) ([]*Distributor, []*mockIngester, []
 	// Strings to be used for get labels values/Names
 	var unusedStrings []string
 	if cfg.lblValuesPerIngester > 0 {
-		unusedStrings = make([]string, min(len(util.RandomStrings), cfg.numIngesters*cfg.lblValuesPerIngester))
-		copy(unusedStrings, util.RandomStrings)
+		unusedStrings = make([]string, min(len(randomStrings), cfg.numIngesters*cfg.lblValuesPerIngester))
+		copy(unusedStrings, randomStrings)
 	}
 	s := &prepState{
 		unusedStrings: unusedStrings,
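
For context, the test file now builds its random-string fixture once in an init() function instead of reading the package-level util.RandomStrings slice. A self-contained sketch of that pattern follows; the generator is a stand-in, and the count and string length used by Cortex's util.GenerateRandomStrings may differ:

```go
package main

import (
	"fmt"
	"math/rand"
)

// randomStrings is populated once, before anything else in the package runs,
// mirroring the init()-based setup added to distributor_test.go.
var randomStrings []string

// generateRandomStrings stands in for util.GenerateRandomStrings; the count
// and length below are arbitrary choices for this sketch.
func generateRandomStrings() []string {
	const letters = "abcdefghijklmnopqrstuvwxyz"
	out := make([]string, 1000)
	for i := range out {
		b := make([]byte, 10)
		for j := range b {
			b[j] = letters[rand.Intn(len(letters))]
		}
		out[i] = string(b)
	}
	return out
}

func init() {
	randomStrings = generateRandomStrings()
}

func main() {
	fmt.Println(len(randomStrings), randomStrings[0])
}
```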

pkg/ruler/manager.go

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, eva
 		registry: reg,
 		logger:   logger,
 	}
-	if cfg.APIEnableRulesBackup {
+	if cfg.RulesBackupEnabled() {
 		m.rulesBackupManager = newRulesBackupManager(cfg, logger, reg)
 	}
 	return m, nil

pkg/ruler/manager_test.go

Lines changed: 3 additions & 1 deletion
@@ -262,7 +262,9 @@ func TestBackupRules(t *testing.T) {
 		1 * time.Millisecond,
 	}
 	ruleManagerFactory := RuleManagerFactory(nil, waitDurations)
-	m, err := NewDefaultMultiTenantManager(Config{RulePath: dir, APIEnableRulesBackup: true}, ruleManagerFactory, evalMetrics, reg, log.NewNopLogger())
+	config := Config{RulePath: dir}
+	config.Ring.ReplicationFactor = 3
+	m, err := NewDefaultMultiTenantManager(config, ruleManagerFactory, evalMetrics, reg, log.NewNopLogger())
 	require.NoError(t, err)

 	const user1 = "testUser"

pkg/ruler/ruler.go

Lines changed: 17 additions & 13 deletions
@@ -130,9 +130,8 @@ type Config struct {
 	Ring             RingConfig    `yaml:"ring"`
 	FlushCheckPeriod time.Duration `yaml:"flush_period"`

-	EnableAPI            bool `yaml:"enable_api"`
-	APIEnableRulesBackup bool `yaml:"api_enable_rules_backup"`
-	APIDeduplicateRules  bool `yaml:"api_deduplicate_rules"`
+	EnableAPI           bool `yaml:"enable_api"`
+	APIDeduplicateRules bool `yaml:"api_deduplicate_rules"`

 	EnabledTenants  flagext.StringSliceCSV `yaml:"enabled_tenants"`
 	DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"`
@@ -200,7 +199,6 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
 	f.DurationVar(&cfg.FlushCheckPeriod, "ruler.flush-period", 1*time.Minute, "Period with which to attempt to flush rule groups.")
 	f.StringVar(&cfg.RulePath, "ruler.rule-path", "/rules", "file path to store temporary rule files for the prometheus rule managers")
 	f.BoolVar(&cfg.EnableAPI, "experimental.ruler.enable-api", false, "Enable the ruler api")
-	f.BoolVar(&cfg.APIEnableRulesBackup, "experimental.ruler.api-enable-rules-backup", false, "EXPERIMENTAL: Enable rulers to store a copy of rules owned by other rulers with default state (state before any evaluation) and send this copy in list API requests as backup in case the ruler who owns the rule fails to send its rules. This allows the rules API to handle ruler outage by returning rules with default state. Ring replication-factor needs to be set to 2 or more for this to be useful.")
 	f.BoolVar(&cfg.APIDeduplicateRules, "experimental.ruler.api-deduplicate-rules", false, "EXPERIMENTAL: Remove duplicate rules in the prometheus rules and alerts API response. If there are duplicate rules the rule with the latest evaluation timestamp will be kept.")
 	f.DurationVar(&cfg.OutageTolerance, "ruler.for-outage-tolerance", time.Hour, `Max time to tolerate outage for restoring "for" state of alert.`)
 	f.DurationVar(&cfg.ForGracePeriod, "ruler.for-grace-period", 10*time.Minute, `Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period.`)
@@ -217,6 +215,12 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
 	cfg.RingCheckPeriod = 5 * time.Second
 }

+func (cfg *Config) RulesBackupEnabled() bool {
+	// If the replication factor is greater than 1, only the first replica is responsible for evaluating the rules;
+	// the rest of the replicas store the rule groups only as a backup for API HA.
+	return cfg.Ring.ReplicationFactor > 1
+}
+
 // MultiTenantManager is the interface of interaction with a Manager that is tenant aware.
 type MultiTenantManager interface {
 	// SyncRuleGroups is used to sync the Manager with rules from the RuleStore.
@@ -581,7 +585,7 @@ func (r *Ruler) syncRules(ctx context.Context, reason string) {
 	// This will also delete local group files for users that are no longer in 'configs' map.
 	r.manager.SyncRuleGroups(ctx, loadedConfigs)

-	if r.cfg.APIEnableRulesBackup {
+	if r.cfg.RulesBackupEnabled() {
 		r.manager.BackUpRuleGroups(ctx, backupConfigs)
 	}
 }
@@ -604,7 +608,7 @@ func (r *Ruler) loadRuleGroups(ctx context.Context) (map[string]rulespb.RuleGrou
 	if err != nil {
 		level.Warn(r.logger).Log("msg", "failed to load some rules owned by this ruler", "count", len(ownedConfigs)-len(loadedOwnedConfigs), "err", err)
 	}
-	if r.cfg.APIEnableRulesBackup {
+	if r.cfg.RulesBackupEnabled() {
 		loadedBackupConfigs, err := r.store.LoadRuleGroups(ctx, backupConfigs)
 		if err != nil {
 			level.Warn(r.logger).Log("msg", "failed to load some rules backed up by this ruler", "count", len(backupConfigs)-len(loadedBackupConfigs), "err", err)
@@ -685,7 +689,7 @@ func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulesp
 		if len(owned) > 0 {
 			ownedConfigs[userID] = owned
 		}
-		if r.cfg.APIEnableRulesBackup {
+		if r.cfg.RulesBackupEnabled() {
 			backup := filterBackupRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
 			if len(backup) > 0 {
 				backedUpConfigs[userID] = backup
@@ -748,7 +752,7 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp

 		filterOwned := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
 		var filterBackup []*rulespb.RuleGroupDesc
-		if r.cfg.APIEnableRulesBackup {
+		if r.cfg.RulesBackupEnabled() {
 			filterBackup = filterBackupRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
 		}
 		if len(filterOwned) == 0 && len(filterBackup) == 0 {
@@ -1121,7 +1125,7 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest
 	)

 	zoneByAddress := make(map[string]string)
-	if r.cfg.APIEnableRulesBackup {
+	if r.cfg.RulesBackupEnabled() {
 		for _, ruler := range rulers.Instances {
 			zoneByAddress[ruler.Addr] = ruler.Zone
 		}
@@ -1146,9 +1150,9 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest
 		if err != nil {
 			level.Error(r.logger).Log("msg", "unable to retrieve rules from ruler", "addr", addr, "err", err)
 			r.rulerGetRulesFailures.WithLabelValues(addr).Inc()
-			// If APIEnableRulesBackup is enabled and there are enough rulers replicating the rules, we should
+			// If rules backup is enabled and there are enough rulers replicating the rules, we should
 			// be able to handle failures.
-			if r.cfg.APIEnableRulesBackup && len(jobs) >= r.cfg.Ring.ReplicationFactor {
+			if r.cfg.RulesBackupEnabled() && len(jobs) >= r.cfg.Ring.ReplicationFactor {
 				mtx.Lock()
 				failedZones[zoneByAddress[addr]] = struct{}{}
 				errCount += 1
@@ -1168,7 +1172,7 @@ func (r *Ruler) getShardedRules(ctx context.Context, userID string, rulesRequest
 		return nil
 	})

-	if err == nil && (r.cfg.APIEnableRulesBackup || r.cfg.APIDeduplicateRules) {
+	if err == nil && (r.cfg.RulesBackupEnabled() || r.cfg.APIDeduplicateRules) {
 		merged = mergeGroupStateDesc(merged)
 	}

@@ -1183,7 +1187,7 @@ func (r *Ruler) Rules(ctx context.Context, in *RulesRequest) (*RulesResponse, er
 		return nil, fmt.Errorf("no user id found in context")
 	}

-	groupDescs, err := r.getLocalRules(userID, *in, r.cfg.APIEnableRulesBackup)
+	groupDescs, err := r.getLocalRules(userID, *in, r.cfg.RulesBackupEnabled())
 	if err != nil {
 		return nil, err
 	}
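
A small runnable sketch of the behavioural change in this file: the explicit api-enable-rules-backup flag is removed, and backup handling is now derived from the ring replication factor via the new RulesBackupEnabled helper. The Config struct below is pared down to the relevant fields for illustration; it is not the full Cortex config:

```go
package main

import "fmt"

// RingConfig and Config keep only the fields relevant to this change.
type RingConfig struct {
	ReplicationFactor int
}

type Config struct {
	Ring RingConfig
}

// RulesBackupEnabled mirrors the helper added above: with a replication factor
// greater than 1, only the first replica evaluates a rule group, and the
// remaining replicas keep backup copies for API high availability.
func (cfg *Config) RulesBackupEnabled() bool {
	return cfg.Ring.ReplicationFactor > 1
}

func main() {
	single := Config{Ring: RingConfig{ReplicationFactor: 1}}
	replicated := Config{Ring: RingConfig{ReplicationFactor: 3}}
	fmt.Println(single.RulesBackupEnabled())     // false: no extra replicas to hold backups
	fmt.Println(replicated.RulesBackupEnabled()) // true: replicas 2..RF hold backup copies
}
```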

pkg/ruler/ruler_ring.go

Lines changed: 4 additions & 1 deletion
@@ -144,7 +144,10 @@ func GetReplicationSetForListRule(r ring.ReadRing, cfg *RingConfig) (ring.Replic
 	// to 0, and then we update them whether zone-awareness is enabled or not.
 	maxErrors := 0
 	maxUnavailableZones := 0
-	if cfg.ZoneAwarenessEnabled {
+	// Because the ring's Get method returns a number of rulers equal to the replication factor even if there is
+	// only 1 zone and zone-awareness is enabled, we can treat zone-awareness as disabled when there is only 1 zone,
+	// since rules are still replicated to rulers within that zone.
+	if cfg.ZoneAwarenessEnabled && len(ringZones) > 1 {
 		numReplicatedZones := min(len(ringZones), r.ReplicationFactor())
 		// Given that quorum is not required, we only need at least one of the zone to be healthy to succeed. But we
 		// also need to handle case when RF < number of zones.
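
A minimal sketch of just the guard this hunk changes. The surrounding error-budget math in GetReplicationSetForListRule is not part of this diff, so the sketch only shows when the zone-aware branch is taken; the helper name is illustrative:

```go
package main

import "fmt"

// useZoneAwareErrorBudget mirrors the updated condition: zone-aware accounting
// only applies when the ring actually spans more than one zone. With a single
// zone, all replicas live together, so the ruler falls back to plain
// per-instance error accounting (maxErrors) instead of maxUnavailableZones.
func useZoneAwareErrorBudget(zoneAwarenessEnabled bool, ringZones []string) bool {
	return zoneAwarenessEnabled && len(ringZones) > 1
}

func main() {
	fmt.Println(useZoneAwareErrorBudget(true, []string{"z1", "z2", "z3"})) // true
	fmt.Println(useZoneAwareErrorBudget(true, []string{"z1"}))             // false: single zone
	fmt.Println(useZoneAwareErrorBudget(false, []string{"z1", "z2"}))      // false: disabled
}
```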

pkg/ruler/ruler_ring_test.go

Lines changed: 17 additions & 0 deletions
@@ -180,6 +180,23 @@ func TestGetReplicationSetForListRule(t *testing.T) {
 				"z2": {},
 			},
 		},
+		"should succeed on 1 unhealthy instance in RF=3 with zone replication enabled but only 1 zone": {
+			ringInstances: map[string]ring.InstanceDesc{
+				"instance-1": {Addr: "127.0.0.1", State: ring.ACTIVE, Timestamp: now.Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-1", "z1", 128, true), Zone: "z1"},
+				"instance-2": {Addr: "127.0.0.2", State: ring.ACTIVE, Timestamp: now.Add(-10 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-2", "z1", 128, true), Zone: "z1"},
+				"instance-3": {Addr: "127.0.0.3", State: ring.ACTIVE, Timestamp: now.Add(-20 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-3", "z1", 128, true), Zone: "z1"},
+				"instance-4": {Addr: "127.0.0.4", State: ring.PENDING, Timestamp: now.Add(-30 * time.Second).Unix(), Tokens: g.GenerateTokens(ring.NewDesc(), "instance-4", "z1", 128, true), Zone: "z1"},
+			},
+			ringHeartbeatTimeout:  time.Minute,
+			ringReplicationFactor: 3,
+			expectedSet:           []string{"127.0.0.1", "127.0.0.2", "127.0.0.3"},
+			enableAZReplication:   true,
+			expectedFailedZones: map[string]struct{}{
+				"z1": {},
+			},
+			expectedMaxUnavailableZones: 0,
+			expectedMaxError:            1,
+		},
 	}

 	for testName, testData := range tests {
