Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-12793

One way heartbeats can result in stale set status

    • Type: Icon: Bug Bug
    • Resolution: Done
    • Priority: Icon: Major - P3 Major - P3
    • 2.7.8
    • Affects Version/s: None
    • Component/s: Replication
    • Labels:
    • Fully Compatible
    • ALL
    • Hide

      Start a replica set

      Run upgrade commands on one node:

      sslTest:PRIMARY> db.runCommand({"setParameter":1,"sslMode":"preferSSL"})
      { "was" : "allowSSL", "ok" : 1 }
      sslTest:PRIMARY> db.runCommand({"setParameter":1,"sslMode":"requireSSL"})
      { "was" : "preferSSL", "ok" : 1 }
      

      Check logs for both nodes

      Upgraded node:

      2014-02-19T15:55:58.536-0500 [conn1884] AssertionException handling request, closing client connection: 17189 The server is configured to only allow SSL connections
      2014-02-19T15:55:58.571-0500 [initandlisten] connection accepted from 127.0.0.1:53283 #1885 (1 connection now open)
      2014-02-19T15:55:58.571-0500 [conn1885] AssertionException handling request, closing client connection: 17189 The server is configured to only allow SSL connections
      2014-02-19T15:55:58.573-0500 [initandlisten] connection accepted from 127.0.0.1:53284 #1886 (1 connection now open)
      2014-02-19T15:55:58.574-0500 [conn1886] AssertionException handling request, closing client connection: 17189 The server is configured to only allow SSL connections
      2014-02-19T15:55:58.609-0500 [initandlisten] connection accepted from 127.0.0.1:53285 #1887 (1 connection now open)
      2014-02-19T15:55:58.610-0500 [conn1887] AssertionException handling request, closing client connection: 17189 The server is configured to only allow SSL connections
      2014-02-19T15:56:00.639-0500 [initandlisten] connection accepted from 127.0.0.1:53286 #1888 (1 connection now open)
      2014-02-19T15:56:00.640-0500 [conn1888] AssertionException handling request, closing client connection: 17189 The server is configured to only allow SSL connections
      2014-02-19T15:56:00.677-0500 [initandlisten] connection accepted from 127.0.0.1:53287 #1889 (1 connection now open)
      2014-02-19T15:56:00.678-0500 [conn1889] AssertionException handling request, closing client connection: 17189 The server is configured to only allow SSL connections
      2014-02-19T15:56:00.770-0500 [initandlisten] connection accepted from 127.0.0.1:53288 #1890 (1 connection now open)
      2014-02-19T15:56:00.771-0500 [conn1890] AssertionException handling request, closing client connection: 17189 The server is configured to only allow SSL connections
      2014-02-19T15:56:00.809-0500 [initandlisten] connection accepted from 127.0.0.1:53289 #1891 (1 connection now open)
      2014-02-19T15:56:00.809-0500 [conn1891] AssertionException handling request, closing client connection: 17189 The server is configured to only allow SSL connections
      

      Non upgraded node:

      2014-02-19T15:55:54.535-0500 [rsHealthPoll] replset info localhost:30001 just heartbeated us, but our heartbeat failed: , not changing state
      2014-02-19T15:55:55.635-0500 [rsBackgroundSync] replSet not trying to sync from localhost:30001, it is vetoed for 5 more seconds
      2014-02-19T15:55:55.636-0500 [rsBackgroundSync] replSet not trying to sync from localhost:30001, it is vetoed for 5 more seconds
      2014-02-19T15:55:56.543-0500 [rsHealthPoll] DBClientCursor::init call() failed
      2014-02-19T15:55:56.543-0500 [rsHealthPoll] replset info localhost:30001 heartbeat failed, retrying
      2014-02-19T15:55:56.554-0500 [rsHealthPoll] DBClientCursor::init call() failed
      2014-02-19T15:55:56.555-0500 [rsHealthPoll] replset info localhost:30001 just heartbeated us, but our heartbeat failed: , not changing state
      2014-02-19T15:55:58.574-0500 [rsHealthPoll] DBClientCursor::init call() failed
      2014-02-19T15:55:58.575-0500 [rsHealthPoll] replset info localhost:30001 heartbeat failed, retrying
      2014-02-19T15:55:58.610-0500 [rsHealthPoll] DBClientCursor::init call() failed
      2014-02-19T15:55:58.610-0500 [rsHealthPoll] replset info localhost:30001 just heartbeated us, but our heartbeat failed: , not changing state
      2014-02-19T15:56:00.641-0500 [rsBackgroundSync] replSet not trying to sync from localhost:30001, it is vetoed for 0 more seconds
      2014-02-19T15:56:00.641-0500 [rsBackgroundSync] replSet not trying to sync from localhost:30001, it is vetoed for 0 more seconds
      

      Replica set status:

      MongoDB shell version: 2.5.6-pre-
      connecting to: localhost:30001/test
      "sv.local:30001"
      {
      	"set" : "sslTest",
      	"date" : ISODate("2014-02-19T20:41:26Z"),
      	"myState" : 2,
      	"members" : [
      		{
      			"_id" : 0,
      			"name" : "localhost:30001",
      			"health" : 1,
      			"state" : 2,
      			"stateStr" : "SECONDARY",
      			"uptime" : 246,
      			"optime" : Timestamp(1392416492, 1),
      			"optimeDate" : ISODate("2014-02-14T22:21:32Z"),
      			"self" : true
      		},
      		{
      			"_id" : 1,
      			"name" : "localhost:30002",
      			"health" : 1,
      			"state" : 2,
      			"stateStr" : "SECONDARY",
      			"uptime" : 244,
      			"optime" : Timestamp(1392416492, 1),
      			"optimeDate" : ISODate("2014-02-14T22:21:32Z"),
      			"lastHeartbeat" : ISODate("2014-02-19T20:41:24Z"),
      			"lastHeartbeatRecv" : ISODate("2014-02-19T20:39:34Z"),
      			"pingMs" : 0,
      			"lastHeartbeatMessage" : "syncing to: localhost:30001"
      		},
      		{
      			"_id" : 2,
      			"name" : "localhost:30003",
      			"health" : 1,
      			"state" : 2,
      			"stateStr" : "SECONDARY",
      			"uptime" : 244,
      			"optime" : Timestamp(1392416492, 1),
      			"optimeDate" : ISODate("2014-02-14T22:21:32Z"),
      			"lastHeartbeat" : ISODate("2014-02-19T20:41:24Z"),
      			"lastHeartbeatRecv" : ISODate("2014-02-19T20:39:34Z"),
      			"pingMs" : 0,
      			"lastHeartbeatMessage" : "syncing to: localhost:30001"
      		}
      	],
      	"ok" : 1
      }
      { "sslMode" : "requireSSL", "ok" : 1 }
      { "clusterAuthMode" : "sendKeyFile", "ok" : 1 }
      MongoDB shell version: 2.5.6-pre-
      connecting to: localhost:30002/test
      "sv.local:30002"
      {
      	"set" : "sslTest",
      	"date" : ISODate("2014-02-19T20:41:26Z"),
      	"myState" : 2,
      	"members" : [
      		{
      			"_id" : 0,
      			"name" : "localhost:30001",
      			"health" : 1,
      			"state" : 1,
      			"stateStr" : "PRIMARY",
      			"uptime" : 244,
      			"optime" : Timestamp(1392416492, 1),
      			"optimeDate" : ISODate("2014-02-14T22:21:32Z"),
      			"lastHeartbeat" : ISODate("2014-02-19T20:41:25Z"),
      			"lastHeartbeatRecv" : ISODate("2014-02-19T20:41:26Z"),
      			"pingMs" : 169
      		},
      		{
      			"_id" : 1,
      			"name" : "localhost:30002",
      			"health" : 1,
      			"state" : 2,
      			"stateStr" : "SECONDARY",
      			"uptime" : 246,
      			"optime" : Timestamp(1392416492, 1),
      			"optimeDate" : ISODate("2014-02-14T22:21:32Z"),
      			"infoMessage" : "syncing to: localhost:30001",
      			"self" : true
      		},
      		{
      			"_id" : 2,
      			"name" : "localhost:30003",
      			"health" : 1,
      			"state" : 2,
      			"stateStr" : "SECONDARY",
      			"uptime" : 244,
      			"optime" : Timestamp(1392416492, 1),
      			"optimeDate" : ISODate("2014-02-14T22:21:32Z"),
      			"lastHeartbeat" : ISODate("2014-02-19T20:41:26Z"),
      			"lastHeartbeatRecv" : ISODate("2014-02-19T20:41:26Z"),
      			"pingMs" : 0,
      			"lastHeartbeatMessage" : "syncing to: localhost:30001"
      		}
      	],
      	"ok" : 1
      }
      { "sslMode" : "allowSSL", "ok" : 1 }
      { "clusterAuthMode" : "sendKeyFile", "ok" : 1 }
      MongoDB shell version: 2.5.6-pre-
      connecting to: localhost:30003/test
      "sv.local:30003"
      {
      	"set" : "sslTest",
      	"date" : ISODate("2014-02-19T20:41:27Z"),
      	"myState" : 2,
      	"members" : [
      		{
      			"_id" : 0,
      			"name" : "localhost:30001",
      			"health" : 1,
      			"state" : 1,
      			"stateStr" : "PRIMARY",
      			"uptime" : 245,
      			"optime" : Timestamp(1392416492, 1),
      			"optimeDate" : ISODate("2014-02-14T22:21:32Z"),
      			"lastHeartbeat" : ISODate("2014-02-19T20:41:27Z"),
      			"lastHeartbeatRecv" : ISODate("2014-02-19T20:41:26Z"),
      			"pingMs" : 137
      		},
      		{
      			"_id" : 1,
      			"name" : "localhost:30002",
      			"health" : 1,
      			"state" : 2,
      			"stateStr" : "SECONDARY",
      			"uptime" : 245,
      			"optime" : Timestamp(1392416492, 1),
      			"optimeDate" : ISODate("2014-02-14T22:21:32Z"),
      			"lastHeartbeat" : ISODate("2014-02-19T20:41:26Z"),
      			"lastHeartbeatRecv" : ISODate("2014-02-19T20:41:26Z"),
      			"pingMs" : 0,
      			"lastHeartbeatMessage" : "syncing to: localhost:30001"
      		},
      		{
      			"_id" : 2,
      			"name" : "localhost:30003",
      			"health" : 1,
      			"state" : 2,
      			"stateStr" : "SECONDARY",
      			"uptime" : 247,
      			"optime" : Timestamp(1392416492, 1),
      			"optimeDate" : ISODate("2014-02-14T22:21:32Z"),
      			"infoMessage" : "syncing to: localhost:30001",
      			"self" : true
      		}
      	],
      	"ok" : 1
      }
      { "sslMode" : "allowSSL", "ok" : 1 }
      { "clusterAuthMode" : "sendKeyFile", "ok" : 1 }
      
      Show
      Start a replica set Run upgrade commands on one node: sslTest:PRIMARY> db.runCommand({ "setParameter" :1, "sslMode" : "preferSSL" }) { "was" : "allowSSL" , "ok" : 1 } sslTest:PRIMARY> db.runCommand({ "setParameter" :1, "sslMode" : "requireSSL" }) { "was" : "preferSSL" , "ok" : 1 } Check logs for both nodes Upgraded node: 2014-02-19T15:55:58.536-0500 [conn1884] AssertionException handling request, closing client connection: 17189 The server is configured to only allow SSL connections 2014-02-19T15:55:58.571-0500 [initandlisten] connection accepted from 127.0.0.1:53283 #1885 (1 connection now open) 2014-02-19T15:55:58.571-0500 [conn1885] AssertionException handling request, closing client connection: 17189 The server is configured to only allow SSL connections 2014-02-19T15:55:58.573-0500 [initandlisten] connection accepted from 127.0.0.1:53284 #1886 (1 connection now open) 2014-02-19T15:55:58.574-0500 [conn1886] AssertionException handling request, closing client connection: 17189 The server is configured to only allow SSL connections 2014-02-19T15:55:58.609-0500 [initandlisten] connection accepted from 127.0.0.1:53285 #1887 (1 connection now open) 2014-02-19T15:55:58.610-0500 [conn1887] AssertionException handling request, closing client connection: 17189 The server is configured to only allow SSL connections 2014-02-19T15:56:00.639-0500 [initandlisten] connection accepted from 127.0.0.1:53286 #1888 (1 connection now open) 2014-02-19T15:56:00.640-0500 [conn1888] AssertionException handling request, closing client connection: 17189 The server is configured to only allow SSL connections 2014-02-19T15:56:00.677-0500 [initandlisten] connection accepted from 127.0.0.1:53287 #1889 (1 connection now open) 2014-02-19T15:56:00.678-0500 [conn1889] AssertionException handling request, closing client connection: 17189 The server is configured to only allow SSL connections 2014-02-19T15:56:00.770-0500 [initandlisten] connection accepted from 127.0.0.1:53288 #1890 (1 connection now open) 2014-02-19T15:56:00.771-0500 [conn1890] AssertionException handling request, closing client connection: 17189 The server is configured to only allow SSL connections 2014-02-19T15:56:00.809-0500 [initandlisten] connection accepted from 127.0.0.1:53289 #1891 (1 connection now open) 2014-02-19T15:56:00.809-0500 [conn1891] AssertionException handling request, closing client connection: 17189 The server is configured to only allow SSL connections Non upgraded node: 2014-02-19T15:55:54.535-0500 [rsHealthPoll] replset info localhost:30001 just heartbeated us, but our heartbeat failed: , not changing state 2014-02-19T15:55:55.635-0500 [rsBackgroundSync] replSet not trying to sync from localhost:30001, it is vetoed for 5 more seconds 2014-02-19T15:55:55.636-0500 [rsBackgroundSync] replSet not trying to sync from localhost:30001, it is vetoed for 5 more seconds 2014-02-19T15:55:56.543-0500 [rsHealthPoll] DBClientCursor::init call() failed 2014-02-19T15:55:56.543-0500 [rsHealthPoll] replset info localhost:30001 heartbeat failed, retrying 2014-02-19T15:55:56.554-0500 [rsHealthPoll] DBClientCursor::init call() failed 2014-02-19T15:55:56.555-0500 [rsHealthPoll] replset info localhost:30001 just heartbeated us, but our heartbeat failed: , not changing state 2014-02-19T15:55:58.574-0500 [rsHealthPoll] DBClientCursor::init call() failed 2014-02-19T15:55:58.575-0500 [rsHealthPoll] replset info localhost:30001 heartbeat failed, retrying 2014-02-19T15:55:58.610-0500 [rsHealthPoll] DBClientCursor::init call() failed 2014-02-19T15:55:58.610-0500 [rsHealthPoll] replset info localhost:30001 just heartbeated us, but our heartbeat failed: , not changing state 2014-02-19T15:56:00.641-0500 [rsBackgroundSync] replSet not trying to sync from localhost:30001, it is vetoed for 0 more seconds 2014-02-19T15:56:00.641-0500 [rsBackgroundSync] replSet not trying to sync from localhost:30001, it is vetoed for 0 more seconds Replica set status: MongoDB shell version: 2.5.6-pre- connecting to: localhost:30001/test "sv.local:30001" { "set" : "sslTest" , "date" : ISODate( "2014-02-19T20:41:26Z" ), "myState" : 2, "members" : [ { "_id" : 0, "name" : "localhost:30001" , "health" : 1, "state" : 2, "stateStr" : "SECONDARY" , "uptime" : 246, "optime" : Timestamp(1392416492, 1), "optimeDate" : ISODate( "2014-02-14T22:21:32Z" ), "self" : true }, { "_id" : 1, "name" : "localhost:30002" , "health" : 1, "state" : 2, "stateStr" : "SECONDARY" , "uptime" : 244, "optime" : Timestamp(1392416492, 1), "optimeDate" : ISODate( "2014-02-14T22:21:32Z" ), "lastHeartbeat" : ISODate( "2014-02-19T20:41:24Z" ), "lastHeartbeatRecv" : ISODate( "2014-02-19T20:39:34Z" ), "pingMs" : 0, "lastHeartbeatMessage" : "syncing to: localhost:30001" }, { "_id" : 2, "name" : "localhost:30003" , "health" : 1, "state" : 2, "stateStr" : "SECONDARY" , "uptime" : 244, "optime" : Timestamp(1392416492, 1), "optimeDate" : ISODate( "2014-02-14T22:21:32Z" ), "lastHeartbeat" : ISODate( "2014-02-19T20:41:24Z" ), "lastHeartbeatRecv" : ISODate( "2014-02-19T20:39:34Z" ), "pingMs" : 0, "lastHeartbeatMessage" : "syncing to: localhost:30001" } ], "ok" : 1 } { "sslMode" : "requireSSL" , "ok" : 1 } { "clusterAuthMode" : "sendKeyFile" , "ok" : 1 } MongoDB shell version: 2.5.6-pre- connecting to: localhost:30002/test "sv.local:30002" { "set" : "sslTest" , "date" : ISODate( "2014-02-19T20:41:26Z" ), "myState" : 2, "members" : [ { "_id" : 0, "name" : "localhost:30001" , "health" : 1, "state" : 1, "stateStr" : "PRIMARY" , "uptime" : 244, "optime" : Timestamp(1392416492, 1), "optimeDate" : ISODate( "2014-02-14T22:21:32Z" ), "lastHeartbeat" : ISODate( "2014-02-19T20:41:25Z" ), "lastHeartbeatRecv" : ISODate( "2014-02-19T20:41:26Z" ), "pingMs" : 169 }, { "_id" : 1, "name" : "localhost:30002" , "health" : 1, "state" : 2, "stateStr" : "SECONDARY" , "uptime" : 246, "optime" : Timestamp(1392416492, 1), "optimeDate" : ISODate( "2014-02-14T22:21:32Z" ), "infoMessage" : "syncing to: localhost:30001" , "self" : true }, { "_id" : 2, "name" : "localhost:30003" , "health" : 1, "state" : 2, "stateStr" : "SECONDARY" , "uptime" : 244, "optime" : Timestamp(1392416492, 1), "optimeDate" : ISODate( "2014-02-14T22:21:32Z" ), "lastHeartbeat" : ISODate( "2014-02-19T20:41:26Z" ), "lastHeartbeatRecv" : ISODate( "2014-02-19T20:41:26Z" ), "pingMs" : 0, "lastHeartbeatMessage" : "syncing to: localhost:30001" } ], "ok" : 1 } { "sslMode" : "allowSSL" , "ok" : 1 } { "clusterAuthMode" : "sendKeyFile" , "ok" : 1 } MongoDB shell version: 2.5.6-pre- connecting to: localhost:30003/test "sv.local:30003" { "set" : "sslTest" , "date" : ISODate( "2014-02-19T20:41:27Z" ), "myState" : 2, "members" : [ { "_id" : 0, "name" : "localhost:30001" , "health" : 1, "state" : 1, "stateStr" : "PRIMARY" , "uptime" : 245, "optime" : Timestamp(1392416492, 1), "optimeDate" : ISODate( "2014-02-14T22:21:32Z" ), "lastHeartbeat" : ISODate( "2014-02-19T20:41:27Z" ), "lastHeartbeatRecv" : ISODate( "2014-02-19T20:41:26Z" ), "pingMs" : 137 }, { "_id" : 1, "name" : "localhost:30002" , "health" : 1, "state" : 2, "stateStr" : "SECONDARY" , "uptime" : 245, "optime" : Timestamp(1392416492, 1), "optimeDate" : ISODate( "2014-02-14T22:21:32Z" ), "lastHeartbeat" : ISODate( "2014-02-19T20:41:26Z" ), "lastHeartbeatRecv" : ISODate( "2014-02-19T20:41:26Z" ), "pingMs" : 0, "lastHeartbeatMessage" : "syncing to: localhost:30001" }, { "_id" : 2, "name" : "localhost:30003" , "health" : 1, "state" : 2, "stateStr" : "SECONDARY" , "uptime" : 247, "optime" : Timestamp(1392416492, 1), "optimeDate" : ISODate( "2014-02-14T22:21:32Z" ), "infoMessage" : "syncing to: localhost:30001" , "self" : true } ], "ok" : 1 } { "sslMode" : "allowSSL" , "ok" : 1 } { "clusterAuthMode" : "sendKeyFile" , "ok" : 1 }

      In a replica set, if one node is upgraded to "requireSSL" using the parameters added by SERVER-11431 and the other nodes are still in the "allowSSL" state, the node in the "requireSSL" state can communicate with and send heartbeats to every other node, but the others cannot communicate with it.

      The result of this is that the state on the nodes that can't send heartbeats does not get updated because of this line https://github.com/mongodb/mongo/blob/v2.6/src/mongo/db/repl/heartbeat.cpp#L449. This makes rs.status() an ineffective way to check the state of connections between nodes.

            Assignee:
            Unassigned Unassigned
            Reporter:
            sverch Shaun Verch
            Votes:
            0 Vote for this issue
            Watchers:
            4 Start watching this issue

              Created:
              Updated:
              Resolved: