From 1e300fc137b2c6bcbbed981f4a57d40827d45445 Mon Sep 17 00:00:00 2001
From: Felipe Imanishi
Date: Fri, 21 Jun 2024 10:11:50 -0700
Subject: [PATCH] Persist failover history in DomainInfo data (#6139)

**What changed?**
Added functionality to persist recent failover event data in the `DomainInfo` whenever a valid failover is executed.
`FailoverEvent` contains the failover `timestamp`, `fromCluster`, `toCluster`, and `FailoverType` ("Force"/"Grace") information.
`FailoverHistory` is the key in the `DomainInfo` `data` map. Its value is a slice of `FailoverEvent`s stored as a JSON string, with a maximum size defined by `dynamicconfig.FrontendFailoverHistoryMaxSize` (default value 5, domain filter allowed).
`FailoverHistory` always keeps the n most recent `FailoverEvent`s, sorted by descending timestamp.

**Why?**
Persisting failover information improves failover visibility for clients and the Cadence team.

**How did you test it?**
Unit tests and integration tests.
Tested locally by triggering failovers in a multi-cluster Cadence environment with replication enabled.

**Potential risks**
The change does not affect the logic of UpdateDomain; it only adds the failover info to the DomainInfo data.
The main risk is introducing something that could cause the code to panic. Errors while adding the FailoverHistory are logged as warnings and do not interrupt the UpdateDomain action.

**Release notes**

**Documentation Changes**
---
 common/constants.go                           |  23 ++
 common/domain/handler.go                      |  57 ++++
 common/domain/handler_MasterCluster_test.go   |  31 +-
 .../domain/handler_NotMasterCluster_test.go   |  31 +-
 common/domain/handler_integration_test.go     |  23 +-
 common/domain/handler_test.go                 | 290 ++++++++++++------
 common/dynamicconfig/constants.go             |  12 +
 service/frontend/config/config.go             |   1 +
 service/frontend/config/config_test.go        |   1 +
 9 files changed, 347 insertions(+), 122 deletions(-)

diff --git a/common/constants.go b/common/constants.go
index db79264d543..3f5f97f2357 100644
--- a/common/constants.go
+++ b/common/constants.go
@@ -166,6 +166,8 @@ const (
 	DomainDataKeyForManagedFailover = "IsManagedByCadence"
 	// DomainDataKeyForPreferredCluster is the key of DomainData for domain rebalance
 	DomainDataKeyForPreferredCluster = "PreferredCluster"
+	// DomainDataKeyForFailoverHistory is the key of DomainData for failover history
+	DomainDataKeyForFailoverHistory = "FailoverHistory"
 	// DomainDataKeyForReadGroups stores which groups have read permission of the domain API
 	DomainDataKeyForReadGroups = "READ_GROUPS"
 	// DomainDataKeyForWriteGroups stores which groups have write permission of the domain API
@@ -269,3 +271,24 @@ const (
 	// WorkflowIDRateLimitReason is the reason set in ServiceBusyError when workflow ID rate limit is exceeded
 	WorkflowIDRateLimitReason = "external-workflow-id-rate-limit"
 )
+
+type (
+	// FailoverType is the enum for representing different failover types
+	FailoverType int
+)
+
+const (
+	FailoverTypeForce = iota + 1
+	FailoverTypeGrace
+)
+
+func (v FailoverType) String() string {
+	switch v {
+	case FailoverTypeForce:
+		return "Force"
+	case FailoverTypeGrace:
+		return "Grace"
+	default:
+		return "Unknown"
+	}
+}
diff --git a/common/domain/handler.go b/common/domain/handler.go
index fca2ae6773b..4eb71b3daf3 100644
--- a/common/domain/handler.go
+++ b/common/domain/handler.go
@@ -24,6 +24,7 @@ package domain
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"regexp"
 	"time"
@@ -101,6 +102,20 @@ type (
 		RequiredDomainDataKeys dynamicconfig.MapPropertyFn
 		MaxBadBinaryCount 
dynamicconfig.IntPropertyFnWithDomainFilter FailoverCoolDown dynamicconfig.DurationPropertyFnWithDomainFilter + FailoverHistoryMaxSize dynamicconfig.IntPropertyFnWithDomainFilter + } + + // FailoverEvent is the failover information to be stored for each failover event in domain data + FailoverEvent struct { + EventTime time.Time `json:"eventTime"` + FromCluster string `json:"fromCluster"` + ToCluster string `json:"toCluster"` + FailoverType string `json:"failoverType"` + } + + // FailoverHistory is the history of failovers for a domain limited by the FailoverHistoryMaxSize config + FailoverHistory struct { + FailoverEvents []FailoverEvent } ) @@ -496,9 +511,14 @@ func (d *handlerImpl) UpdateDomain( if configurationChanged { configVersion++ } + if activeClusterChanged && isGlobalDomain { + var failoverType common.FailoverType = common.FailoverTypeGrace + // Force failover cleans graceful failover state if updateRequest.FailoverTimeoutInSeconds == nil { + failoverType = common.FailoverTypeForce + // force failover cleanup graceful failover state gracefulFailoverEndTime = nil previousFailoverVersion = common.InitialPreviousFailoverVersion @@ -508,6 +528,11 @@ func (d *handlerImpl) UpdateDomain( failoverVersion, updateRequest.Name, ) + err = updateFailoverHistory(info, d.config, now, currentActiveCluster, *updateRequest.ActiveClusterName, failoverType) + if err != nil { + d.logger.Warn("failed to update failover history", tag.Error(err)) + } + failoverNotificationVersion = notificationVersion } lastUpdatedTime = now @@ -1262,3 +1287,35 @@ func createUpdateRequest( NotificationVersion: notificationVersion, } } + +func updateFailoverHistory( + info *persistence.DomainInfo, + config Config, + eventTime time.Time, + fromCluster string, + toCluster string, + failoverType common.FailoverType, +) error { + data := info.Data + if info.Data == nil { + data = make(map[string]string) + } + + newFailoverEvent := FailoverEvent{EventTime: eventTime, FromCluster: fromCluster, ToCluster: toCluster, FailoverType: failoverType.String()} + + var failoverHistory []FailoverEvent + _ = json.Unmarshal([]byte(data[common.DomainDataKeyForFailoverHistory]), &failoverHistory) + + failoverHistory = append([]FailoverEvent{newFailoverEvent}, failoverHistory...) 
+ + // Truncate the history to the max size + failoverHistoryJSON, err := json.Marshal(failoverHistory[:common.MinInt(config.FailoverHistoryMaxSize(info.Name), len(failoverHistory))]) + if err != nil { + return err + } + data[common.DomainDataKeyForFailoverHistory] = string(failoverHistoryJSON) + + info.Data = data + + return nil +} diff --git a/common/domain/handler_MasterCluster_test.go b/common/domain/handler_MasterCluster_test.go index ddd00c55152..883452f2506 100644 --- a/common/domain/handler_MasterCluster_test.go +++ b/common/domain/handler_MasterCluster_test.go @@ -22,6 +22,7 @@ package domain import ( "context" + "encoding/json" "log" "os" "testing" @@ -50,13 +51,14 @@ type ( domainHandlerGlobalDomainEnabledPrimaryClusterSuite struct { *persistencetests.TestBase - minRetentionDays int - maxBadBinaryCount int - domainManager persistence.DomainManager - mockProducer *mocks.KafkaProducer - mockDomainReplicator Replicator - archivalMetadata archiver.ArchivalMetadata - mockArchiverProvider *provider.MockArchiverProvider + minRetentionDays int + maxBadBinaryCount int + failoverHistoryMaxSize int + domainManager persistence.DomainManager + mockProducer *mocks.KafkaProducer + mockDomainReplicator Replicator + archivalMetadata archiver.ArchivalMetadata + mockArchiverProvider *provider.MockArchiverProvider handler *handlerImpl } @@ -89,6 +91,7 @@ func (s *domainHandlerGlobalDomainEnabledPrimaryClusterSuite) SetupTest() { dcCollection := dc.NewCollection(dc.NewNopClient(), logger) s.minRetentionDays = 1 s.maxBadBinaryCount = 10 + s.failoverHistoryMaxSize = 5 s.domainManager = s.TestBase.DomainManager s.mockProducer = &mocks.KafkaProducer{} s.mockDomainReplicator = NewDomainReplicator(s.mockProducer, logger) @@ -102,9 +105,10 @@ func (s *domainHandlerGlobalDomainEnabledPrimaryClusterSuite) SetupTest() { ) s.mockArchiverProvider = &provider.MockArchiverProvider{} domainConfig := Config{ - MinRetentionDays: dc.GetIntPropertyFn(s.minRetentionDays), - MaxBadBinaryCount: dc.GetIntPropertyFilteredByDomain(s.maxBadBinaryCount), - FailoverCoolDown: dc.GetDurationPropertyFnFilteredByDomain(0 * time.Second), + MinRetentionDays: dc.GetIntPropertyFn(s.minRetentionDays), + MaxBadBinaryCount: dc.GetIntPropertyFilteredByDomain(s.maxBadBinaryCount), + FailoverCoolDown: dc.GetDurationPropertyFnFilteredByDomain(0 * time.Second), + FailoverHistoryMaxSize: dc.GetIntPropertyFilteredByDomain(s.failoverHistoryMaxSize), } s.handler = NewHandler( domainConfig, @@ -114,7 +118,7 @@ func (s *domainHandlerGlobalDomainEnabledPrimaryClusterSuite) SetupTest() { s.mockDomainReplicator, s.archivalMetadata, s.mockArchiverProvider, - clock.NewRealTimeSource(), + clock.NewMockedTimeSource(), ).(*handlerImpl) } @@ -820,6 +824,11 @@ func (s *domainHandlerGlobalDomainEnabledPrimaryClusterSuite) TestUpdateGetDomai }) s.Nil(err) + var failoverHistory []FailoverEvent + failoverHistory = append(failoverHistory, FailoverEvent{EventTime: s.handler.timeSource.Now(), FromCluster: prevActiveClusterName, ToCluster: nextActiveClusterName, FailoverType: common.FailoverType(common.FailoverTypeForce).String()}) + failoverHistoryJSON, _ := json.Marshal(failoverHistory) + data[common.DomainDataKeyForFailoverHistory] = string(failoverHistoryJSON) + fnTest := func(info *types.DomainInfo, config *types.DomainConfiguration, replicationConfig *types.DomainReplicationConfiguration, isGlobalDomain bool, failoverVersion int64) { s.NotEmpty(info.GetUUID()) diff --git a/common/domain/handler_NotMasterCluster_test.go 
b/common/domain/handler_NotMasterCluster_test.go index 2e043185236..82b44c6d753 100644 --- a/common/domain/handler_NotMasterCluster_test.go +++ b/common/domain/handler_NotMasterCluster_test.go @@ -22,6 +22,7 @@ package domain import ( "context" + "encoding/json" "log" "os" "testing" @@ -50,13 +51,14 @@ type ( domainHandlerGlobalDomainEnabledNotPrimaryClusterSuite struct { *persistencetests.TestBase - minRetentionDays int - maxBadBinaryCount int - domainManager persistence.DomainManager - mockProducer *mocks.KafkaProducer - mockDomainReplicator Replicator - archivalMetadata archiver.ArchivalMetadata - mockArchiverProvider *provider.MockArchiverProvider + minRetentionDays int + maxBadBinaryCount int + failoverHistoryMaxSize int + domainManager persistence.DomainManager + mockProducer *mocks.KafkaProducer + mockDomainReplicator Replicator + archivalMetadata archiver.ArchivalMetadata + mockArchiverProvider *provider.MockArchiverProvider handler *handlerImpl } @@ -89,6 +91,7 @@ func (s *domainHandlerGlobalDomainEnabledNotPrimaryClusterSuite) SetupTest() { dcCollection := dc.NewCollection(dc.NewNopClient(), logger) s.minRetentionDays = 1 s.maxBadBinaryCount = 10 + s.failoverHistoryMaxSize = 5 s.domainManager = s.TestBase.DomainManager s.mockProducer = &mocks.KafkaProducer{} s.mockDomainReplicator = NewDomainReplicator(s.mockProducer, logger) @@ -102,9 +105,10 @@ func (s *domainHandlerGlobalDomainEnabledNotPrimaryClusterSuite) SetupTest() { ) s.mockArchiverProvider = &provider.MockArchiverProvider{} domainConfig := Config{ - MinRetentionDays: dc.GetIntPropertyFn(s.minRetentionDays), - MaxBadBinaryCount: dc.GetIntPropertyFilteredByDomain(s.maxBadBinaryCount), - FailoverCoolDown: dc.GetDurationPropertyFnFilteredByDomain(0 * time.Second), + MinRetentionDays: dc.GetIntPropertyFn(s.minRetentionDays), + MaxBadBinaryCount: dc.GetIntPropertyFilteredByDomain(s.maxBadBinaryCount), + FailoverCoolDown: dc.GetDurationPropertyFnFilteredByDomain(0 * time.Second), + FailoverHistoryMaxSize: dc.GetIntPropertyFilteredByDomain(s.failoverHistoryMaxSize), } s.handler = NewHandler( domainConfig, @@ -114,7 +118,7 @@ func (s *domainHandlerGlobalDomainEnabledNotPrimaryClusterSuite) SetupTest() { s.mockDomainReplicator, s.archivalMetadata, s.mockArchiverProvider, - clock.NewRealTimeSource(), + clock.NewMockedTimeSource(), ).(*handlerImpl) } @@ -683,6 +687,11 @@ func (s *domainHandlerGlobalDomainEnabledNotPrimaryClusterSuite) TestUpdateGetDo }) s.Nil(err) + var failoverHistory []FailoverEvent + failoverHistory = append(failoverHistory, FailoverEvent{EventTime: s.handler.timeSource.Now(), FromCluster: prevActiveClusterName, ToCluster: nextActiveClusterName, FailoverType: common.FailoverType(common.FailoverTypeForce).String()}) + failoverHistoryJSON, _ := json.Marshal(failoverHistory) + data[common.DomainDataKeyForFailoverHistory] = string(failoverHistoryJSON) + fnTest := func(info *types.DomainInfo, config *types.DomainConfiguration, replicationConfig *types.DomainReplicationConfiguration, isGlobalDomain bool, failoverVersion int64) { s.NotEmpty(info.GetUUID()) diff --git a/common/domain/handler_integration_test.go b/common/domain/handler_integration_test.go index a907117ccf0..25cfcfae82a 100644 --- a/common/domain/handler_integration_test.go +++ b/common/domain/handler_integration_test.go @@ -54,13 +54,14 @@ type ( domainHandlerCommonSuite struct { *persistencetests.TestBase - minRetentionDays int - maxBadBinaryCount int - domainManager persistence.DomainManager - mockProducer *mocks.KafkaProducer - mockDomainReplicator 
Replicator - archivalMetadata archiver.ArchivalMetadata - mockArchiverProvider *provider.MockArchiverProvider + minRetentionDays int + maxBadBinaryCount int + failoverHistoryMaxSize int + domainManager persistence.DomainManager + mockProducer *mocks.KafkaProducer + mockDomainReplicator Replicator + archivalMetadata archiver.ArchivalMetadata + mockArchiverProvider *provider.MockArchiverProvider handler *handlerImpl } @@ -95,6 +96,7 @@ func (s *domainHandlerCommonSuite) SetupTest() { dcCollection := dc.NewCollection(dc.NewNopClient(), logger) s.minRetentionDays = 1 s.maxBadBinaryCount = 10 + s.failoverHistoryMaxSize = 5 s.domainManager = s.TestBase.DomainManager s.mockProducer = &mocks.KafkaProducer{} s.mockDomainReplicator = NewDomainReplicator(s.mockProducer, logger) @@ -108,9 +110,10 @@ func (s *domainHandlerCommonSuite) SetupTest() { ) s.mockArchiverProvider = &provider.MockArchiverProvider{} domainConfig := Config{ - MinRetentionDays: dc.GetIntPropertyFn(s.minRetentionDays), - MaxBadBinaryCount: dc.GetIntPropertyFilteredByDomain(s.maxBadBinaryCount), - FailoverCoolDown: dc.GetDurationPropertyFnFilteredByDomain(0 * time.Second), + MinRetentionDays: dc.GetIntPropertyFn(s.minRetentionDays), + MaxBadBinaryCount: dc.GetIntPropertyFilteredByDomain(s.maxBadBinaryCount), + FailoverCoolDown: dc.GetDurationPropertyFnFilteredByDomain(0 * time.Second), + FailoverHistoryMaxSize: dc.GetIntPropertyFilteredByDomain(s.failoverHistoryMaxSize), } s.handler = NewHandler( domainConfig, diff --git a/common/domain/handler_test.go b/common/domain/handler_test.go index 3dcceab29df..f8de74e3f99 100644 --- a/common/domain/handler_test.go +++ b/common/domain/handler_test.go @@ -22,10 +22,12 @@ package domain import ( "context" + "encoding/json" "errors" "fmt" "net/url" "reflect" + "strconv" "testing" "time" @@ -1277,7 +1279,7 @@ func TestHandler_UpdateDomain(t *testing.T) { domainReplicator *MockReplicator, ) request *types.UpdateDomainRequest - response *types.UpdateDomainResponse + response func(timeSource clock.MockedTimeSource) *types.UpdateDomainResponse err error }{ { @@ -1346,29 +1348,33 @@ func TestHandler_UpdateDomain(t *testing.T) { Name: constants.TestDomainName, ActiveClusterName: common.Ptr(cluster.TestAlternativeClusterName), }, - response: &types.UpdateDomainResponse{ - IsGlobalDomain: true, - FailoverVersion: cluster.TestCurrentClusterInitialFailoverVersion/cluster.TestFailoverVersionIncrement*cluster.TestFailoverVersionIncrement + cluster.TestAlternativeClusterInitialFailoverVersion, - DomainInfo: &types.DomainInfo{ - Name: constants.TestDomainName, - UUID: constants.TestDomainID, - Status: common.Ptr(types.DomainStatusRegistered), - }, - Configuration: &types.DomainConfiguration{ - WorkflowExecutionRetentionPeriodInDays: 1, - EmitMetric: true, - HistoryArchivalStatus: common.Ptr(types.ArchivalStatusDisabled), - VisibilityArchivalStatus: common.Ptr(types.ArchivalStatusDisabled), - BadBinaries: &types.BadBinaries{Binaries: map[string]*types.BadBinaryInfo{}}, - IsolationGroups: &types.IsolationGroupConfiguration{}, - AsyncWorkflowConfig: &types.AsyncWorkflowConfiguration{Enabled: true}, - }, - ReplicationConfiguration: &types.DomainReplicationConfiguration{ - ActiveClusterName: cluster.TestAlternativeClusterName, - Clusters: []*types.ClusterReplicationConfiguration{ - {ClusterName: cluster.TestCurrentClusterName}, {ClusterName: cluster.TestAlternativeClusterName}, + response: func(timeSource clock.MockedTimeSource) *types.UpdateDomainResponse { + data, _ := json.Marshal([]FailoverEvent{{EventTime: 
timeSource.Now(), FromCluster: cluster.TestCurrentClusterName, ToCluster: cluster.TestAlternativeClusterName, FailoverType: common.FailoverType(common.FailoverTypeForce).String()}}) + return &types.UpdateDomainResponse{ + IsGlobalDomain: true, + FailoverVersion: cluster.TestCurrentClusterInitialFailoverVersion/cluster.TestFailoverVersionIncrement*cluster.TestFailoverVersionIncrement + cluster.TestAlternativeClusterInitialFailoverVersion, + DomainInfo: &types.DomainInfo{ + Name: constants.TestDomainName, + UUID: constants.TestDomainID, + Data: map[string]string{common.DomainDataKeyForFailoverHistory: string(data)}, + Status: common.Ptr(types.DomainStatusRegistered), }, - }, + Configuration: &types.DomainConfiguration{ + WorkflowExecutionRetentionPeriodInDays: 1, + EmitMetric: true, + HistoryArchivalStatus: common.Ptr(types.ArchivalStatusDisabled), + VisibilityArchivalStatus: common.Ptr(types.ArchivalStatusDisabled), + BadBinaries: &types.BadBinaries{Binaries: map[string]*types.BadBinaryInfo{}}, + IsolationGroups: &types.IsolationGroupConfiguration{}, + AsyncWorkflowConfig: &types.AsyncWorkflowConfiguration{Enabled: true}, + }, + ReplicationConfiguration: &types.DomainReplicationConfiguration{ + ActiveClusterName: cluster.TestAlternativeClusterName, + Clusters: []*types.ClusterReplicationConfiguration{ + {ClusterName: cluster.TestCurrentClusterName}, {ClusterName: cluster.TestAlternativeClusterName}, + }, + }, + } }, }, { @@ -1439,29 +1445,33 @@ func TestHandler_UpdateDomain(t *testing.T) { ActiveClusterName: common.Ptr(cluster.TestCurrentClusterName), FailoverTimeoutInSeconds: common.Int32Ptr(10), }, - response: &types.UpdateDomainResponse{ - IsGlobalDomain: true, - FailoverVersion: cluster.TestFailoverVersionIncrement, - DomainInfo: &types.DomainInfo{ - Name: constants.TestDomainName, - UUID: constants.TestDomainID, - Status: common.Ptr(types.DomainStatusRegistered), - }, - Configuration: &types.DomainConfiguration{ - WorkflowExecutionRetentionPeriodInDays: 1, - EmitMetric: true, - HistoryArchivalStatus: common.Ptr(types.ArchivalStatusDisabled), - VisibilityArchivalStatus: common.Ptr(types.ArchivalStatusDisabled), - BadBinaries: &types.BadBinaries{Binaries: map[string]*types.BadBinaryInfo{}}, - IsolationGroups: &types.IsolationGroupConfiguration{}, - AsyncWorkflowConfig: &types.AsyncWorkflowConfiguration{Enabled: true}, - }, - ReplicationConfiguration: &types.DomainReplicationConfiguration{ - ActiveClusterName: cluster.TestCurrentClusterName, - Clusters: []*types.ClusterReplicationConfiguration{ - {ClusterName: cluster.TestCurrentClusterName}, {ClusterName: cluster.TestAlternativeClusterName}, + response: func(timeSource clock.MockedTimeSource) *types.UpdateDomainResponse { + data, _ := json.Marshal([]FailoverEvent{{EventTime: timeSource.Now(), FromCluster: cluster.TestAlternativeClusterName, ToCluster: cluster.TestCurrentClusterName, FailoverType: common.FailoverType(common.FailoverTypeGrace).String()}}) + return &types.UpdateDomainResponse{ + IsGlobalDomain: true, + FailoverVersion: cluster.TestFailoverVersionIncrement, + DomainInfo: &types.DomainInfo{ + Name: constants.TestDomainName, + UUID: constants.TestDomainID, + Data: map[string]string{common.DomainDataKeyForFailoverHistory: string(data)}, + Status: common.Ptr(types.DomainStatusRegistered), }, - }, + Configuration: &types.DomainConfiguration{ + WorkflowExecutionRetentionPeriodInDays: 1, + EmitMetric: true, + HistoryArchivalStatus: common.Ptr(types.ArchivalStatusDisabled), + VisibilityArchivalStatus: 
common.Ptr(types.ArchivalStatusDisabled), + BadBinaries: &types.BadBinaries{Binaries: map[string]*types.BadBinaryInfo{}}, + IsolationGroups: &types.IsolationGroupConfiguration{}, + AsyncWorkflowConfig: &types.AsyncWorkflowConfiguration{Enabled: true}, + }, + ReplicationConfiguration: &types.DomainReplicationConfiguration{ + ActiveClusterName: cluster.TestCurrentClusterName, + Clusters: []*types.ClusterReplicationConfiguration{ + {ClusterName: cluster.TestCurrentClusterName}, {ClusterName: cluster.TestAlternativeClusterName}, + }, + }, + } }, }, { @@ -1530,29 +1540,31 @@ func TestHandler_UpdateDomain(t *testing.T) { Name: constants.TestDomainName, EmitMetric: common.Ptr(false), }, - response: &types.UpdateDomainResponse{ - IsGlobalDomain: true, - FailoverVersion: cluster.TestCurrentClusterInitialFailoverVersion, - DomainInfo: &types.DomainInfo{ - Name: constants.TestDomainName, - UUID: constants.TestDomainID, - Status: common.Ptr(types.DomainStatusRegistered), - }, - Configuration: &types.DomainConfiguration{ - WorkflowExecutionRetentionPeriodInDays: 1, - EmitMetric: false, - HistoryArchivalStatus: common.Ptr(types.ArchivalStatusDisabled), - VisibilityArchivalStatus: common.Ptr(types.ArchivalStatusDisabled), - BadBinaries: &types.BadBinaries{Binaries: map[string]*types.BadBinaryInfo{}}, - IsolationGroups: &types.IsolationGroupConfiguration{}, - AsyncWorkflowConfig: &types.AsyncWorkflowConfiguration{Enabled: true}, - }, - ReplicationConfiguration: &types.DomainReplicationConfiguration{ - ActiveClusterName: cluster.TestCurrentClusterName, - Clusters: []*types.ClusterReplicationConfiguration{ - {ClusterName: cluster.TestCurrentClusterName}, {ClusterName: cluster.TestAlternativeClusterName}, + response: func(_ clock.MockedTimeSource) *types.UpdateDomainResponse { + return &types.UpdateDomainResponse{ + IsGlobalDomain: true, + FailoverVersion: cluster.TestCurrentClusterInitialFailoverVersion, + DomainInfo: &types.DomainInfo{ + Name: constants.TestDomainName, + UUID: constants.TestDomainID, + Status: common.Ptr(types.DomainStatusRegistered), }, - }, + Configuration: &types.DomainConfiguration{ + WorkflowExecutionRetentionPeriodInDays: 1, + EmitMetric: false, + HistoryArchivalStatus: common.Ptr(types.ArchivalStatusDisabled), + VisibilityArchivalStatus: common.Ptr(types.ArchivalStatusDisabled), + BadBinaries: &types.BadBinaries{Binaries: map[string]*types.BadBinaryInfo{}}, + IsolationGroups: &types.IsolationGroupConfiguration{}, + AsyncWorkflowConfig: &types.AsyncWorkflowConfiguration{Enabled: true}, + }, + ReplicationConfiguration: &types.DomainReplicationConfiguration{ + ActiveClusterName: cluster.TestCurrentClusterName, + Clusters: []*types.ClusterReplicationConfiguration{ + {ClusterName: cluster.TestCurrentClusterName}, {ClusterName: cluster.TestAlternativeClusterName}, + }, + }, + } }, }, { @@ -1601,29 +1613,32 @@ func TestHandler_UpdateDomain(t *testing.T) { Name: constants.TestDomainName, ActiveClusterName: common.Ptr(cluster.TestCurrentClusterName), }, - response: &types.UpdateDomainResponse{ - IsGlobalDomain: false, - FailoverVersion: cluster.TestCurrentClusterInitialFailoverVersion/cluster.TestFailoverVersionIncrement*cluster.TestFailoverVersionIncrement + cluster.TestCurrentClusterInitialFailoverVersion, - DomainInfo: &types.DomainInfo{ - Name: constants.TestDomainName, - UUID: constants.TestDomainID, - Status: common.Ptr(types.DomainStatusRegistered), - }, - Configuration: &types.DomainConfiguration{ - WorkflowExecutionRetentionPeriodInDays: 1, - EmitMetric: true, - 
HistoryArchivalStatus: common.Ptr(types.ArchivalStatusDisabled), - VisibilityArchivalStatus: common.Ptr(types.ArchivalStatusDisabled), - BadBinaries: &types.BadBinaries{Binaries: map[string]*types.BadBinaryInfo{}}, - IsolationGroups: &types.IsolationGroupConfiguration{}, - AsyncWorkflowConfig: &types.AsyncWorkflowConfiguration{Enabled: true}, - }, - ReplicationConfiguration: &types.DomainReplicationConfiguration{ - ActiveClusterName: cluster.TestCurrentClusterName, - Clusters: []*types.ClusterReplicationConfiguration{ - {ClusterName: cluster.TestCurrentClusterName}, + response: func(_ clock.MockedTimeSource) *types.UpdateDomainResponse { + return &types.UpdateDomainResponse{ + IsGlobalDomain: false, + FailoverVersion: cluster.TestCurrentClusterInitialFailoverVersion/cluster.TestFailoverVersionIncrement*cluster.TestFailoverVersionIncrement + cluster.TestCurrentClusterInitialFailoverVersion, + DomainInfo: &types.DomainInfo{ + Name: constants.TestDomainName, + UUID: constants.TestDomainID, + + Status: common.Ptr(types.DomainStatusRegistered), }, - }, + Configuration: &types.DomainConfiguration{ + WorkflowExecutionRetentionPeriodInDays: 1, + EmitMetric: true, + HistoryArchivalStatus: common.Ptr(types.ArchivalStatusDisabled), + VisibilityArchivalStatus: common.Ptr(types.ArchivalStatusDisabled), + BadBinaries: &types.BadBinaries{Binaries: map[string]*types.BadBinaryInfo{}}, + IsolationGroups: &types.IsolationGroupConfiguration{}, + AsyncWorkflowConfig: &types.AsyncWorkflowConfiguration{Enabled: true}, + }, + ReplicationConfiguration: &types.DomainReplicationConfiguration{ + ActiveClusterName: cluster.TestCurrentClusterName, + Clusters: []*types.ClusterReplicationConfiguration{ + {ClusterName: cluster.TestCurrentClusterName}, + }, + }, + } }, }, { @@ -1971,6 +1986,7 @@ func TestHandler_UpdateDomain(t *testing.T) { RequiredDomainDataKeys: nil, MaxBadBinaryCount: dynamicconfig.GetIntPropertyFilteredByDomain(maxLength), FailoverCoolDown: func(string) time.Duration { return time.Second }, + FailoverHistoryMaxSize: dynamicconfig.GetIntPropertyFilteredByDomain(5), } clusterMetadata := cluster.GetTestClusterMetadata(true) @@ -1998,7 +2014,7 @@ func TestHandler_UpdateDomain(t *testing.T) { } else { assert.NoError(t, err) assert.NotNil(t, response) - assert.Equal(t, tc.response, response) + assert.Equal(t, tc.response(mockTimeSource), response) } }) } @@ -2480,3 +2496,97 @@ func TestHandleGracefulFailover(t *testing.T) { }) } } + +func TestUpdateFailoverHistory(t *testing.T) { + now := time.Now() + fromCluster := "fromCluster" + toCluster := "toCluster" + failoverType := common.FailoverType(common.FailoverTypeForce) + failoverHistoryMaxSize := 5 + + testCases := []struct { + name string + domainInfo func() *persistence.DomainInfo + response func() string + err error + }{ + { + name: "Success case - DomainInfo data is nil", + domainInfo: func() *persistence.DomainInfo { return &persistence.DomainInfo{} }, + response: func() string { + failoverHistory := []FailoverEvent{{EventTime: now, FromCluster: fromCluster, ToCluster: toCluster, FailoverType: failoverType.String()}} + jsonResp, _ := json.Marshal(failoverHistory) + return string(jsonResp) + }, + }, + { + name: "Success case - FailoverHistory is nil", + domainInfo: func() *persistence.DomainInfo { return &persistence.DomainInfo{Data: map[string]string{}} }, + response: func() string { + failoverHistory := []FailoverEvent{{EventTime: now, FromCluster: fromCluster, ToCluster: toCluster, FailoverType: failoverType.String()}} + jsonResp, _ := 
json.Marshal(failoverHistory) + return string(jsonResp) + }, + }, + { + name: "Success case - FailoverHistory is not nil", + domainInfo: func() *persistence.DomainInfo { + eventTime := time.Date(2021, 1, 1, 1, 1, 1, 1, time.UTC) + failoverHistory := []FailoverEvent{{EventTime: eventTime, FromCluster: "fromCluster1", ToCluster: "toCluster1", FailoverType: common.FailoverType(common.FailoverTypeGrace).String()}} + failoverHistoryJSON, _ := json.Marshal(failoverHistory) + return &persistence.DomainInfo{Data: map[string]string{common.DomainDataKeyForFailoverHistory: string(failoverHistoryJSON)}} + }, + response: func() string { + eventTime := time.Date(2021, 1, 1, 1, 1, 1, 1, time.UTC) + failoverHistory := []FailoverEvent{{EventTime: eventTime, FromCluster: "fromCluster1", ToCluster: "toCluster1", FailoverType: common.FailoverType(common.FailoverTypeGrace).String()}} + failoverHistory = append([]FailoverEvent{{EventTime: now, FromCluster: fromCluster, ToCluster: toCluster, FailoverType: failoverType.String()}}, failoverHistory...) + jsonResp, _ := json.Marshal(failoverHistory) + return string(jsonResp) + }, + }, + { + name: "Success case - FailoverHistory is at max size", + domainInfo: func() *persistence.DomainInfo { + var failoverHistory []FailoverEvent + for i := 0; i < failoverHistoryMaxSize; i++ { + eventTime := time.Date(2021, 1, i, 1, 1, 1, 1, time.UTC) + failoverHistory = append(failoverHistory, FailoverEvent{EventTime: eventTime, FromCluster: "fromCluster" + strconv.Itoa(i), ToCluster: "toCluster" + strconv.Itoa(i), FailoverType: common.FailoverType(common.FailoverTypeGrace).String()}) + } + failoverHistoryJSON, _ := json.Marshal(failoverHistory) + return &persistence.DomainInfo{Data: map[string]string{common.DomainDataKeyForFailoverHistory: string(failoverHistoryJSON)}} + }, + response: func() string { + var failoverHistory []FailoverEvent + for i := 0; i < 5; i++ { + eventTime := time.Date(2021, 1, i, 1, 1, 1, 1, time.UTC) + failoverHistory = append(failoverHistory, FailoverEvent{EventTime: eventTime, FromCluster: "fromCluster" + strconv.Itoa(i), ToCluster: "toCluster" + strconv.Itoa(i), FailoverType: common.FailoverType(common.FailoverTypeGrace).String()}) + } + failoverHistory = append([]FailoverEvent{{EventTime: now, FromCluster: fromCluster, ToCluster: toCluster, FailoverType: failoverType.String()}}, failoverHistory[:(5-1)]...) 
+ jsonResp, _ := json.Marshal(failoverHistory) + return string(jsonResp) + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + cfg := Config{ + FailoverHistoryMaxSize: func(domain string) int { + return failoverHistoryMaxSize + }, + } + + domainInfo := tc.domainInfo() + err := updateFailoverHistory(domainInfo, cfg, now, fromCluster, toCluster, failoverType) + + if tc.err != nil { + assert.Equal(t, tc.err, err) + } else { + assert.NoError(t, err) + assert.NotNil(t, domainInfo.Data) + assert.NotNil(t, domainInfo.Data[common.DomainDataKeyForFailoverHistory]) + assert.Equal(t, tc.response(), domainInfo.Data[common.DomainDataKeyForFailoverHistory]) + } + }) + } +} diff --git a/common/dynamicconfig/constants.go b/common/dynamicconfig/constants.go index 366f2fa0d4b..41228f99be6 100644 --- a/common/dynamicconfig/constants.go +++ b/common/dynamicconfig/constants.go @@ -630,6 +630,12 @@ const ( // Default value: 10000 // Allowed filters: N/A VisibilityArchivalQueryMaxPageSize + // FrontendFailoverHistoryMaxSize is the maximum size for the number of failover event records in a domain failover history + // KeyName: frontend.failoverHistoryMaxSize + // Value type: Int + // Default value: 5 + // Allowed filters: DomainName + FrontendFailoverHistoryMaxSize // key for matching @@ -3139,6 +3145,12 @@ var IntKeys = map[IntKey]DynamicInt{ Description: "VisibilityArchivalQueryMaxPageSize is the maximum page size for a visibility archival query", DefaultValue: 10000, }, + FrontendFailoverHistoryMaxSize: { + KeyName: "frontend.failoverHistoryMaxSize", + Filters: []Filter{DomainName}, + Description: "FrontendFailoverHistoryMaxSize is the maximum size for the number of failover event records in a domain failover history", + DefaultValue: 5, + }, MatchingUserRPS: { KeyName: "matching.rps", Description: "MatchingUserRPS is request rate per second for each matching host", diff --git a/service/frontend/config/config.go b/service/frontend/config/config.go index 3ebbef0f5b1..34d80b0b6e3 100644 --- a/service/frontend/config/config.go +++ b/service/frontend/config/config.go @@ -188,6 +188,7 @@ func NewConfig(dc *dynamicconfig.Collection, numHistoryShards int, isAdvancedVis MaxRetentionDays: dc.GetIntProperty(dynamicconfig.MaxRetentionDays), FailoverCoolDown: dc.GetDurationPropertyFilteredByDomain(dynamicconfig.FrontendFailoverCoolDown), RequiredDomainDataKeys: dc.GetMapProperty(dynamicconfig.RequiredDomainDataKeys), + FailoverHistoryMaxSize: dc.GetIntPropertyFilteredByDomain(dynamicconfig.FrontendFailoverHistoryMaxSize), }, HostName: hostName, } diff --git a/service/frontend/config/config_test.go b/service/frontend/config/config_test.go index 0d3b8008b51..a04728ee004 100644 --- a/service/frontend/config/config_test.go +++ b/service/frontend/config/config_test.go @@ -111,6 +111,7 @@ func TestNewConfig(t *testing.T) { "MaxRetentionDays": {dynamicconfig.MaxRetentionDays, 42}, "FailoverCoolDown": {dynamicconfig.FrontendFailoverCoolDown, time.Duration(43)}, "RequiredDomainDataKeys": {dynamicconfig.RequiredDomainDataKeys, map[string]interface{}{"bar": "baz"}}, + "FailoverHistoryMaxSize": {dynamicconfig.FrontendFailoverHistoryMaxSize, 44}, } client := dynamicconfig.NewInMemoryClient() dc := dynamicconfig.NewCollection(client, testlogger.New(t))
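
Illustrative addendum (not part of the patch): a minimal sketch of how a client might read the persisted history back out of a domain's `Data` map, assuming it was fetched from something like a `DescribeDomain` response. The struct mirrors the `FailoverEvent` JSON shape added in `common/domain/handler.go`, the map key matches `common.DomainDataKeyForFailoverHistory`, and the sample payload values are made up.

```go
package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// FailoverEvent mirrors the struct added in common/domain/handler.go,
// including its JSON field tags.
type FailoverEvent struct {
	EventTime    time.Time `json:"eventTime"`
	FromCluster  string    `json:"fromCluster"`
	ToCluster    string    `json:"toCluster"`
	FailoverType string    `json:"failoverType"`
}

func main() {
	// domainData stands in for DomainInfo.Data; the payload below is a
	// hypothetical example of what updateFailoverHistory would persist.
	domainData := map[string]string{
		"FailoverHistory": `[{"eventTime":"2024-06-21T10:11:50Z","fromCluster":"cluster0","toCluster":"cluster1","failoverType":"Force"}]`,
	}

	var history []FailoverEvent
	if raw, ok := domainData["FailoverHistory"]; ok {
		if err := json.Unmarshal([]byte(raw), &history); err != nil {
			fmt.Println("malformed failover history:", err)
			return
		}
	}

	// Events are stored most recent first (descending timestamp).
	for _, e := range history {
		fmt.Printf("%s: %s -> %s (%s)\n",
			e.EventTime.Format(time.RFC3339), e.FromCluster, e.ToCluster, e.FailoverType)
	}
}
```

Because `updateFailoverHistory` prepends each new event before truncating to the configured max size, index 0 of the decoded slice is always the most recent failover.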