Details
Type: Bug
Resolution: Incomplete
Priority: Minor - P4
Operating System: ALL
Description
1. I have set up a mongod replica set consisting of 3 nodes + 1 delayed hidden node + an arbiter.
2. I have set up DNS: primary and secondary internal DNS (BIND) servers, so that I can reference nodes by a normal FQDN instead of an IP address.
3. I have the secondary DNS handle requests while (if) the primary is down (a sketch of the resulting member configuration follows this list).
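For context, here is a minimal sketch of how a replica set like this one could be initiated with FQDN member names. The actual rs.initiate() command is not in this ticket, so the member roles and the slaveDelay value below are assumptions reconstructed from the rs.status() output further down:
----------
# Hypothetical reconstruction -- the actual rs.initiate() is not shown in
# this ticket; hostnames come from the rs.status() output below, member
# roles and the delay value are assumptions.
mongo --host mongodb-cluster-shard-01-rA.site-aws.com --eval '
rs.initiate({
    _id: "siteRS0",
    members: [
        { _id: 0, host: "mongodb-cluster-shard-01-rA.site-aws.com:27017" },
        { _id: 1, host: "mongodb-cluster-shard-01-rB.site-aws.com:27017" },
        { _id: 2, host: "mongodb-cluster-shard-01-arbiter.site-aws.com:30000", arbiterOnly: true },
        { _id: 3, host: "mongodb-cluster-shard-01-rC.site-aws.com:27017" },
        { _id: 4, host: "mongodb-cluster-shard-01-rA-backup-hidden.site-aws.com:27017",
          hidden: true, priority: 0, slaveDelay: 3600 }
    ]
})'
----------
Because every member is addressed by FQDN, each mongod's health polling depends entirely on name resolution working.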
*Problem*:
----------
When I simulate the primary DNS being down, I totally break the replica set: the master node stops seeing the other nodes and steps down to SECONDARY 5-10 seconds later.
----------
This is what my primary node (mongodb-cluster-shard-01-rA.site-aws.com) displays when the primary DNS is shut down:
siteRS0:SECONDARY> rs.status()
{
	"set" : "siteRS0",
	"date" : ISODate("2014-08-10T03:16:22Z"),
	"myState" : 2,
	"members" : [
		{
			"_id" : 0,
			"name" : "mongodb-cluster-shard-01-rA.site-aws.com:27017",
			"health" : 1,
			"state" : 2,
			"stateStr" : "SECONDARY",
			"uptime" : 1913839,
			"optime" : Timestamp(1407628608, 1),
			"optimeDate" : ISODate("2014-08-09T23:56:48Z"),
			"self" : true
		},
		{
			"_id" : 1,
			"name" : "mongodb-cluster-shard-01-rB.site-aws.com:27017",
			"health" : 0,
			"state" : 8,
			"stateStr" : "(not reachable/healthy)",
			"uptime" : 0,
			"optime" : Timestamp(1407628608, 1),
			"optimeDate" : ISODate("2014-08-09T23:56:48Z"),
			"lastHeartbeat" : ISODate("2014-08-10T03:16:08Z"),
			"lastHeartbeatRecv" : ISODate("2014-08-10T03:15:52Z"),
			"pingMs" : 0,
			"syncingTo" : "mongodb-cluster-shard-01-rA.site-aws.com:27017"
		},
		{
			"_id" : 2,
			"name" : "mongodb-cluster-shard-01-arbiter.site-aws.com:30000",
			"health" : 0,
			"state" : 8,
			"stateStr" : "(not reachable/healthy)",
			"uptime" : 0,
			"lastHeartbeat" : ISODate("2014-08-10T03:16:19Z"),
			"lastHeartbeatRecv" : ISODate("2014-08-10T03:15:45Z"),
			"pingMs" : 0
		},
		{
			"_id" : 3,
			"name" : "mongodb-cluster-shard-01-rC.site-aws.com:27017",
			"health" : 0,
			"state" : 8,
			"stateStr" : "(not reachable/healthy)",
			"uptime" : 0,
			"optime" : Timestamp(1407628608, 1),
			"optimeDate" : ISODate("2014-08-09T23:56:48Z"),
			"lastHeartbeat" : ISODate("2014-08-10T03:16:16Z"),
			"lastHeartbeatRecv" : ISODate("2014-08-10T03:15:52Z"),
			"pingMs" : 0,
			"syncingTo" : "mongodb-cluster-shard-01-rA.site-aws.com:27017"
		},
		{
			"_id" : 4,
			"name" : "mongodb-cluster-shard-01-rA-backup-hidden.site-aws.com:27017",
			"health" : 0,
			"state" : 8,
			"stateStr" : "(not reachable/healthy)",
			"uptime" : 0,
			"optime" : Timestamp(1407628608, 1),
			"optimeDate" : ISODate("2014-08-09T23:56:48Z"),
			"lastHeartbeat" : ISODate("2014-08-10T03:16:00Z"),
			"lastHeartbeatRecv" : ISODate("2014-08-10T03:15:49Z"),
			"pingMs" : 0,
			"syncingTo" : "mongodb-cluster-shard-01-rA.site-aws.com:27017"
		}
	],
	"ok" : 1
}
If I go to the log, I see a lot of getaddrinfo failure messages:
[root@mongodb-cluster-shard-01-rA ec2-user]# tail /mongo/log/mongod.log
2014-08-10T02:35:13.044+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-arbiter.site-aws.com") failed: Name or service not known
2014-08-10T02:35:13.469+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-rC.site-aws.com") failed: Name or service not known
2014-08-10T02:35:13.469+0000 [rsHealthPoll] couldn't connect to mongodb-cluster-shard-01-rC.site-aws.com:27017: couldn't connect to server mongodb-cluster-shard-01-rC.site-aws.com:27017 (0.0.0.0) failed, address resolved to 0.0.0.0
2014-08-10T02:35:13.968+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-rA-backup-hidden.site-aws.com") failed: Name or service not known
2014-08-10T02:35:13.968+0000 [rsHealthPoll] couldn't connect to mongodb-cluster-shard-01-rA-backup-hidden.site-aws.com:27017: couldn't connect to server mongodb-cluster-shard-01-rA-backup-hidden.site-aws.com:27017 (0.0.0.0) failed, address resolved to 0.0.0.0
2014-08-10T02:35:17.059+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-rB.site-aws.com") failed: Name or service not known
2014-08-10T02:35:17.059+0000 [rsHealthPoll] couldn't connect to mongodb-cluster-shard-01-rB.site-aws.com:27017: couldn't connect to server mongodb-cluster-shard-01-rB.site-aws.com:27017 (0.0.0.0) failed, address resolved to 0.0.0.0
2014-08-10T02:35:18.476+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-rC.site-aws.com") failed: Name or service not known
2014-08-10T02:35:18.669+0000 [rsHealthPoll] couldn't connect to mongodb-cluster-shard-01-rC.site-aws.com:27017: couldn't connect to server mongodb-cluster-shard-01-rC.site-aws.com:27017 (0.0.0.0) failed, address resolved to 0.0.0.0
2014-08-10T02:35:18.976+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-rA-backup-hidden.site-aws.com") failed: Name or service not known
2014-08-10T02:35:20.051+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-arbiter.site-aws.com") failed: Name or service not known
2014-08-10T02:35:20.051+0000 [rsHealthPoll] couldn't connect to mongodb-cluster-shard-01-arbiter.site-aws.com:30000: couldn't connect to server mongodb-cluster-shard-01-arbiter.site-aws.com:30000 (0.0.0.0) failed, address resolved to 0.0.0.0
2014-08-10T02:35:23.677+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-rC.site-aws.com") failed: Name or service not known
2014-08-10T02:35:24.066+0000 [rsHealthPoll] getaddrinfo("mongodb-cluster-shard-01-rB.site-aws.com") failed: Name or service not known
2014-08-10T02:35:24.066+0000 [rsHealthPoll] couldn't connect to mongodb-cluster-shard-01-rB.site-aws.com:27017: couldn't connect to server mongodb-cluster-shard-01-rB.site-aws.com:27017 (0.0.0.0) failed, address resolved to 0.0.0.0
[root@mongodb-cluster-shard-01-rA ec2-user]#
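Worth noting when debugging this: mongod resolves member names through getaddrinfo(), i.e. the glibc resolver (nsswitch + resolv.conf), while nslookup queries the DNS servers directly. A diagnostic sketch using `getent ahosts`, which also goes through getaddrinfo(), so it reproduces what mongod actually sees (hostname taken from this report):
----------
# "getent ahosts" resolves via getaddrinfo(), the same call mongod's
# rsHealthPoll uses, unlike nslookup which bypasses the glibc resolver.
getent ahosts mongodb-cluster-shard-01-rC.site-aws.com

# Timing it shows how long each lookup stalls on the dead primary
# nameserver before (or whether) the secondary is tried:
time getent ahosts mongodb-cluster-shard-01-rC.site-aws.com
----------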
However, nslookup resolves the FQDN to an IP properly (via the secondary DNS):
----------
[root@mongodb-cluster-shard-01-rA ec2-user]# nslookup mongodb-cluster-shard-01-rC.site-aws.com
Server:		10.233.147.18    (this is the secondary DNS)
Address:	10.233.147.18#53

Name:	mongodb-cluster-shard-01-rC.site-aws.com
Address: 10.220.153.211

*after I start the primary DNS (.119)*, the name is soon resolved by the primary DNS again:
[root@mongodb-cluster-shard-01-rA ec2-user]# nslookup mongodb-cluster-shard-01-rC.site-aws.com
Server:		10.35.147.119
Address:	10.35.147.119#53
----------
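To rule out the nameservers themselves, each one can also be queried explicitly, independent of resolv.conf ordering (a diagnostic sketch using the IPs from this report):
----------
# Query each nameserver directly, bypassing resolv.conf ordering.
dig @10.35.147.119 mongodb-cluster-shard-01-rC.site-aws.com A +short   # primary (down during the test)
dig @10.233.147.18 mongodb-cluster-shard-01-rC.site-aws.com A +short   # secondary (should still answer)
----------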
Everything gets back to normal once the primary DNS is up and running: the replica set elects a primary again and all is OK.
So what did I miss, or what am I doing wrong?
----------
My mongo instances have the following /etc/resolv.conf file:
----------
[root@mongodb-cluster-shard-01-rA log]# cat /etc/resolv.conf
; generated by /sbin/dhclient-script
search us-west-2.compute.internal site.com
nameserver 10.35.147.119
nameserver 10.233.147.18
nameserver 172.16.0.23
nameserver 172.16.0.23
----------
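One likely factor here: the glibc resolver tries nameservers strictly in order, waiting out a timeout (5 seconds by default) on the dead primary for every single query, and it only honors the first three nameserver entries, so the duplicate fourth line is ignored anyway. A possible mitigation sketch, with illustrative values, is to add resolver options:
----------
# Possible mitigation (illustrative values): fail over to the secondary
# nameserver faster. glibc uses at most three "nameserver" entries, so
# the duplicate fourth line is dropped here.
options timeout:1 attempts:2 rotate
search us-west-2.compute.internal site.com
nameserver 10.35.147.119
nameserver 10.233.147.18
nameserver 172.16.0.23
----------
The rotate option spreads queries across the listed servers instead of always starting with the first, so a dead primary does not stall every lookup.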
*primary DNS /etc/named.conf:*
----------
options {
	#listen-on port 53 { 127.0.0.1; 10.224.3.36};
	listen-on-v6 port 53 { ::1; };
	directory "/var/named";
	dump-file "/var/named/data/cache_dump.db";
	statistics-file "/var/named/data/named_stats.txt";
	memstatistics-file "/var/named/data/named_mem_stats.txt";
	allow-query { any; };
	recursion no;

	dnssec-enable yes;
	dnssec-validation yes;
	dnssec-lookaside auto;

	/* Path to ISC DLV key */
	bindkeys-file "/etc/named.iscdlv.key";

	managed-keys-directory "/var/named/dynamic";
	notify yes;
	also-notify { 10.233.147.18; };
};

logging {
	channel default_debug {
		file "data/named.run";
		severity dynamic;
	};
};

zone "site-aws.com" IN {
	type master;
	file "site-aws.com.zone";
	allow-update { none; };
	allow-query { any; };
	allow-transfer { 10.233.147.18; };
};

include "/etc/named.rfc1912.zones";
include "/etc/named.root.key";
----------
*"site-aws.com.zone" defined:*
$TTL 86400
@	IN	SOA	ns1.site-aws.com. root.site-aws.com. (
			2013042203	;Serial
			300		;Refresh
			1800		;Retry
			604800		;Expire
			86400		;Minimum TTL
)

; Specify our two nameservers
	IN	NS	ns1.site-aws.com.
;	IN	NS	ns2.site-aws.com.

; Resolve nameserver hostnames to IP, replace with your two droplet IP addresses.
ns1	IN	A	10.224.3.36
;ns2	IN	A	2.2.2.2

; Define hostname -> IP pairs which you wish to resolve
devops	IN	A	10.35.147.119
mongodb-cluster-shard-01-rA	IN	A	10.230.9.223
mongodb-cluster-shard-01-rB	IN	A	10.17.6.57
mongodb-cluster-shard-01-rC	IN	A	10.220.153.211
mongodb-cluster-shard-01-arbiter	IN	A	10.251.112.114
mongodb-cluster-shard-01-rA-backup-hidden	IN	A	10.230.20.83
mongodb-cluster-backup	IN	A	10.230.20.83
prod-redis-cluster-01-rA	IN	A	10.226.207.86
ns1	IN	A	10.35.147.119
ns2	IN	A	10.233.147.18
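Two details in this zone file are worth double-checking: ns1 has two A records with different addresses (10.224.3.36 and 10.35.147.119), and only one NS record is active (ns2 is commented out). BIND ships tools to validate the config and zone; a sketch using the paths from this report:
----------
# Validate the BIND configuration and the zone file.
named-checkconf /etc/named.conf
named-checkzone site-aws.com /var/named/site-aws.com.zone
----------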
*secondary DNS /etc/named.conf:*
options {
	#listen-on port 53 { 127.0.0.1; 10.224.3.36};
	listen-on-v6 port 53 { ::1; };
	directory "/var/named";
	dump-file "/var/named/data/cache_dump.db";
	statistics-file "/var/named/data/named_stats.txt";
	memstatistics-file "/var/named/data/named_mem_stats.txt";
	allow-query { any; };
	recursion no;

	dnssec-enable yes;
	dnssec-validation yes;
	dnssec-lookaside auto;

	/* Path to ISC DLV key */
	bindkeys-file "/etc/named.iscdlv.key";

	managed-keys-directory "/var/named/dynamic";
};

logging {
	channel default_debug {
		file "data/named.run";
		severity dynamic;
	};
};

zone "site-aws.com" IN {
	type slave;
	file "site-aws.com.zone";
	allow-query { any; };
	allow-transfer { 10.35.147.119; };	## NS1 is allowed for zone transfer when necessary ##
	masters { 10.35.147.119; };		## the master NS1 is defined ##
};

include "/etc/named.rfc1912.zones";
include "/etc/named.root.key";
The secondary DNS has a synced copy of site-aws.com.zone - the file exists.
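To confirm the transfer actually happened, the SOA serials on master and slave can be compared (a sketch with the IPs from this report); matching serials mean the slave is current:
----------
# Matching SOA serials confirm the slave has the current zone.
dig @10.35.147.119 site-aws.com SOA +short
dig @10.233.147.18 site-aws.com SOA +short
----------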
So the question is: why does the MongoDB replica set behave this way, and how can I make sure that if the primary DNS goes down, the replica set (and all other nodes that reference internal nodes by FQDN) remains operational?
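One way to reproduce the outage deterministically and measure the resolution stall (a sketch; the iptables approach is an assumption about the test setup, not something from this report):
----------
# Simulate the primary DNS (10.35.147.119) being unreachable without
# touching named itself:
iptables -A OUTPUT -d 10.35.147.119 -p udp --dport 53 -j DROP

# Each lookup now stalls on the dead primary before the secondary is
# tried; if the stall exceeds what the replica set heartbeats tolerate,
# the primary mongod steps down exactly as described in this report:
time getent ahosts mongodb-cluster-shard-01-rB.site-aws.com

# Restore normal resolution:
iptables -D OUTPUT -d 10.35.147.119 -p udp --dport 53 -j DROP
----------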