Orchestrator故障切换失败

Orchestrator故障切换失败

1. 故障发生之前,MySQL集群拓扑如下:

https://img1.sycdn.imooc.com/climg/feed0169092a70c213720483.jpg

https://img1.sycdn.imooc.com/climg/a3399e69092a711f07460466.jpg

https://img1.sycdn.imooc.com/climg/87a7a469092a714808130457.jpg



2. 模拟主库宕机,关闭主实例

https://img1.sycdn.imooc.com/climg/fbaae169092a71cd08940073.jpg


3. 页面上发现拓扑变化情况如下:

https://img1.sycdn.imooc.com/climg/c79bd069092a720b17380520.jpg

https://img1.sycdn.imooc.com/climg/f4e28d69092a724015080605.jpg

https://img1.sycdn.imooc.com/climg/a183ad69092a726813180484.jpg

4. 出现两个MySQL集群maria-05和maria-06,与视频中演示的情况不同

https://img1.sycdn.imooc.com/climg/92015669092a72dc13860588.jpg

5. 到MySQL里验证复制关系:

maria-06:

https://img1.sycdn.imooc.com/climg/40b3ad69092a73f308710358.jpg

maria-07:

https://img1.sycdn.imooc.com/climg/9cc86569092a741d08050386.jpg

6. 恢复旧主实例,页面查看拓扑结构:

https://img1.sycdn.imooc.com/climg/8a274a69092a74af15100564.jpg

https://img1.sycdn.imooc.com/climg/4caa3a69092a74e011700493.jpg

https://img1.sycdn.imooc.com/climg/7d834969092a74ca12120435.jpg

还是存在两个集群

7. 查看Orchestrator配置,参数MasterFailoverDetachReplicaMasterHost设置为false:

https://img1.sycdn.imooc.com/climg/f87f1869092a763706330732.jpg

8. 问题:为什么会出现这种情况?要怎么排查问题?


9. 配置文件orchestrator.conf.json内容如下:

{
  "Debug": true,
  "EnableSyslog": false,
  "ListenAddress": ":3000",
  "MySQLTopologyUser": "orc_client_user",
  "MySQLTopologyPassword": "xxxxxx",
  "MySQLTopologyCredentialsConfigFile": "",
  "MySQLTopologySSLPrivateKeyFile": "",
  "MySQLTopologySSLCertFile": "",
  "MySQLTopologySSLCAFile": "",
  "MySQLTopologySSLSkipVerify": true,
  "MySQLTopologyUseMutualTLS": false,
  "MySQLOrchestratorHost": "127.0.0.1",
  "MySQLOrchestratorPort": 3306,
  "MySQLOrchestratorDatabase": "orchestrator",
  "MySQLOrchestratorUser": "orc_server_user",
  "MySQLOrchestratorPassword": "xxxxx",
  "MySQLOrchestratorCredentialsConfigFile": "",
  "MySQLOrchestratorSSLPrivateKeyFile": "",
  "MySQLOrchestratorSSLCertFile": "",
  "MySQLOrchestratorSSLCAFile": "",
  "MySQLOrchestratorSSLSkipVerify": true,
  "MySQLOrchestratorUseMutualTLS": false,
  "MySQLConnectTimeoutSeconds": 1,
  "DefaultInstancePort": 3306,
  "DiscoverByShowSlaveHosts": true,
  "InstancePollSeconds": 5,
  "DiscoveryIgnoreReplicaHostnameFilters": [
    "a_host_i_want_to_ignore[.]example[.]com",
    ".*[.]ignore_all_hosts_from_this_domain[.]example[.]com",
    "a_host_with_extra_port_i_want_to_ignore[.]example[.]com:3307"
  ],
  "UnseenInstanceForgetHours": 240,
  "SnapshotTopologiesIntervalHours": 0,
  "InstanceBulkOperationsWaitTimeoutSeconds": 10,
  "HostnameResolveMethod": "default",
  "MySQLHostnameResolveMethod": "@@hostname",
  "SkipBinlogServerUnresolveCheck": true,
  "ExpiryHostnameResolvesMinutes": 60,
  "RejectHostnameResolvePattern": "",
  "ReasonableReplicationLagSeconds": 10,
  "ProblemIgnoreHostnameFilters": [],
  "VerifyReplicationFilters": false,
  "ReasonableMaintenanceReplicationLagSeconds": 20,
  "CandidateInstanceExpireMinutes": 60,
  "AuditLogFile": "",
  "AuditToSyslog": false,
  "RemoveTextFromHostnameDisplay": ".mydomain.com:3306",
  "ReadOnly": false,
  "AuthenticationMethod": "basic",
  "HTTPAuthUser": "admin",
  "HTTPAuthPassword": "Nimxxxxx",
  "RaftEnabled":true,
  "RaftDataDir":"/usr/local/orchestrator/raftdata",
  "RaftBind":"192.168.184.151",
  "DefaultRaftPort":10008,
  "RaftNodes":[
  "192.168.184.151",
  "192.168.184.152",
  "192.168.184.153"
  ],
  "AuthUserHeader": "",
  "PowerAuthUsers": [
    "*"
  ],
  "ClusterNameToAlias": {
    "127.0.0.1": "test suite"
  },
  "ReplicationLagQuery": "",
  "DetectClusterAliasQuery": "SELECT SUBSTRING_INDEX(@@hostname, '.', 1)",
  "DetectClusterDomainQuery": "",
  "DetectInstanceAliasQuery": "",
  "DetectPromotionRuleQuery": "",
  "DataCenterPattern": "[.]([^.]+)[.][^.]+[.]mydomain[.]com",
  "PhysicalEnvironmentPattern": "[.]([^.]+[.][^.]+)[.]mydomain[.]com",
  "PromotionIgnoreHostnameFilters": [],
  "DetectSemiSyncEnforcedQuery": "",
  "ServeAgentsHttp": false,
  "AgentsServerPort": ":3001",
  "AgentsUseSSL": false,
  "AgentsUseMutualTLS": false,
  "AgentSSLSkipVerify": false,
  "AgentSSLPrivateKeyFile": "",
  "AgentSSLCertFile": "",
  "AgentSSLCAFile": "",
  "AgentSSLValidOUs": [],
  "UseSSL": false,
  "UseMutualTLS": false,
  "SSLSkipVerify": false,
  "SSLPrivateKeyFile": "",
  "SSLCertFile": "",
  "SSLCAFile": "",
  "SSLValidOUs": [],
  "URLPrefix": "",
  "StatusEndpoint": "/api/status",
  "StatusSimpleHealth": true,
  "StatusOUVerify": false,
  "AgentPollMinutes": 60,
  "UnseenAgentForgetHours": 6,
  "StaleSeedFailMinutes": 60,
  "SeedAcceptableBytesDiff": 8192,
  "PseudoGTIDPattern": "",
  "PseudoGTIDPatternIsFixedSubstring": false,
  "PseudoGTIDMonotonicHint": "asc:",
  "DetectPseudoGTIDQuery": "",
  "BinlogEventsChunkSize": 10000,
  "SkipBinlogEventsContaining": [],
  "ReduceReplicationAnalysisCount": true,
  "FailureDetectionPeriodBlockMinutes": 60,
  "FailMasterPromotionOnLagMinutes": 0,
  "RecoveryPeriodBlockSeconds": 60,
  "RecoveryIgnoreHostnameFilters": [],
  "RecoverMasterClusterFilters": [
    "*"
  ],
  "RecoverIntermediateMasterClusterFilters": [
    "*"
  ],
  "OnFailureDetectionProcesses": [
    "echo 'Detected {failureType} on {failureCluster}. Affected replicas: {countSlaves}' >> /tmp/recovery.log"
  ],
  "PreGracefulTakeoverProcesses": [
    "echo 'Planned takeover about to take place on {failureCluster}. Master will switch to read_only' >> /tmp/recovery.log"
  ],
  "PreFailoverProcesses": [
    "echo 'Will recover from {failureType} on {failureCluster}' >> /tmp/recovery.log"
  ],
  "PostFailoverProcesses": [
    "echo '(for all types) Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostUnsuccessfulFailoverProcesses": [],
  "PostMasterFailoverProcesses": [
    "echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Promoted: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostIntermediateMasterFailoverProcesses": [
    "echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostGracefulTakeoverProcesses": [
    "echo 'Planned takeover complete' >> /tmp/recovery.log"
  ],
  "CoMasterRecoveryMustPromoteOtherCoMaster": true,
  "DetachLostSlavesAfterMasterFailover": true,
  "ApplyMySQLPromotionAfterMasterFailover": true,
  "PreventCrossDataCenterMasterFailover": false,
  "PreventCrossRegionMasterFailover": false,
  "MasterFailoverDetachReplicaMasterHost": false,
  "MasterFailoverLostInstancesDowntimeMinutes": 0,
  "PostponeReplicaRecoveryOnLagMinutes": 0,
  "OSCIgnoreHostnameFilters": [],
  "GraphiteAddr": "",
  "GraphitePath": "",
  "GraphiteConvertHostnameDotsToUnderscores": true,
  "ConsulAddress": "",
  "ConsulAclToken": "",
  "ConsulKVStoreProvider": "consul"
}




正在回答

登录并购买课程后可参与讨论,去登录

1回答

我回看了视频,有可能是当时页面没有刷新。我在其他实验时,拓扑也是分成了两个独立集群:故障的旧主库单独成为一个集群,剩下的两个实例组成另一个集群,跟你的实验结果一致。这样也更为合理:实例出现异常就先从集群中剔除,等异常解决后,再由DBA手动将其恢复到集群里,会更安全。

问题已解决,确定采纳
还有疑问,暂不采纳

恭喜解决一个难题,获得1积分~

来为老师/同学的回答评分吧

0 星
DBA数据库工程师
  • 参与学习       266    人
  • 解答问题       80    个

本课程主要讲解MySQL DBA需要掌握的技术,包括MySQL基础、日志文件、讲师的实战经验分享、常用工具、高可用、优化、DBA 实战项目等内容。

了解课程
请稍等 ...
微信客服

购课补贴
联系客服咨询优惠详情

帮助反馈 APP下载

慕课网APP
您的移动学习伙伴

公众号

扫描二维码
关注慕课网微信公众号

在线咨询

领取优惠

免费试听

领取大纲

扫描二维码,添加
你的专属老师