[SERVER-20822] make sync source decisions based on ReplSetMetadata, because that data is less stale Created: 08/Oct/15  Updated: 15/Oct/15  Resolved: 14/Oct/15

Status: Closed
Project: Core Server
Component/s: Replication, Testing Infrastructure
Affects Version/s: None
Fix Version/s: 3.2.0-rc0

Type: Bug Priority: Major - P3
Reporter: Max Hirschhorn Assignee: Matt Dannenberg
Resolution: Done Votes: 0
Labels: None
Remaining Estimate: Not Specified
Time Spent: Not Specified
Original Estimate: Not Specified

Issue Links:
Related
related to SERVER-20804 replica set nodes should consider cha... Closed
Backwards Compatibility: Fully Compatible
Operating System: ALL
Sprint: Repl B (10/30/15)
Participants:
Linked BF Score: 0

 Description   

Task
Logs

[js_test:chaining_removal] 2015-10-07T22:37:56.287+0000 2015-10-07T22:37:56.286+0000 E QUERY    [thread1] Error: assert.soon failed, msg:node 4 failed to start chaining: undefined :
[js_test:chaining_removal] 2015-10-07T22:37:56.287+0000 doassert@src/mongo/shell/assert.js:15:14
[js_test:chaining_removal] 2015-10-07T22:37:56.287+0000 assert.soon@src/mongo/shell/assert.js:194:13
[js_test:chaining_removal] 2015-10-07T22:37:56.287+0000 @jstests/replsets/chaining_removal.js:28:1
[js_test:chaining_removal] 2015-10-07T22:37:56.287+0000 @jstests/replsets/chaining_removal.js:3:2
[js_test:chaining_removal] 2015-10-07T22:37:56.288+0000
[js_test:chaining_removal] 2015-10-07T22:37:56.288+0000 failed to load: jstests/replsets/chaining_removal.js



 Comments   
Comment by Githook User [ 14/Oct/15 ]

Author:

{u'username': u'dannenberg', u'name': u'matt dannenberg', u'email': u'matt.dannenberg@10gen.com'}

Message: SERVER-20822 make sync source decisions based on ReplSetMetadata
Branch: master
https://github.com/mongodb/mongo/commit/c860db7d39f8559054d1cebb83a6838accef8d94

Comment by Matt Dannenberg [ 09/Oct/15 ]

Both this and BF-1266 are failing for the same reason:
1) node a is told to sync from node b
2) node a agrees to sync from node b
3) node a sees node b is not a primary, nor does it have a sync source (due to stale heartbeat data)
4) node a stops syncing from node b before our check passes

I think we need a better solution for SERVER-20804.

Comment by Max Hirschhorn [ 09/Oct/15 ]

Failed again, but now there's more information from the assert!

Task
Logs

[js_test:chaining_removal] 2015-10-08T22:15:39.790+0000 2015-10-08T22:15:39.781+0000 E QUERY    [thread1] Error: node 4 failed to start chaining through node 1: {
[js_test:chaining_removal] 2015-10-08T22:15:39.791+0000 	"set" : "chaining_removal",
[js_test:chaining_removal] 2015-10-08T22:15:39.791+0000 	"date" : ISODate("2015-10-08T22:15:39.778Z"),
[js_test:chaining_removal] 2015-10-08T22:15:39.791+0000 	"myState" : 2,
[js_test:chaining_removal] 2015-10-08T22:15:39.791+0000 	"term" : NumberLong(-1),
[js_test:chaining_removal] 2015-10-08T22:15:39.791+0000 	"heartbeatIntervalMillis" : NumberLong(2000),
[js_test:chaining_removal] 2015-10-08T22:15:39.791+0000 	"members" : [
[js_test:chaining_removal] 2015-10-08T22:15:39.791+0000 		{
[js_test:chaining_removal] 2015-10-08T22:15:39.791+0000 			"_id" : 0,
[js_test:chaining_removal] 2015-10-08T22:15:39.791+0000 			"name" : "ip-10-41-142-30:20010",
[js_test:chaining_removal] 2015-10-08T22:15:39.791+0000 			"health" : 1,
[js_test:chaining_removal] 2015-10-08T22:15:39.792+0000 			"state" : 1,
[js_test:chaining_removal] 2015-10-08T22:15:39.792+0000 			"stateStr" : "PRIMARY",
[js_test:chaining_removal] 2015-10-08T22:15:39.792+0000 			"uptime" : 34,
[js_test:chaining_removal] 2015-10-08T22:15:39.792+0000 			"optime" : Timestamp(1444342506, 1),
[js_test:chaining_removal] 2015-10-08T22:15:39.792+0000 			"optimeDate" : ISODate("2015-10-08T22:15:06Z"),
[js_test:chaining_removal] 2015-10-08T22:15:39.792+0000 			"lastHeartbeat" : ISODate("2015-10-08T22:15:38.592Z"),
[js_test:chaining_removal] 2015-10-08T22:15:39.792+0000 			"lastHeartbeatRecv" : ISODate("2015-10-08T22:15:39.547Z"),
[js_test:chaining_removal] 2015-10-08T22:15:39.792+0000 			"pingMs" : NumberLong(0),
[js_test:chaining_removal] 2015-10-08T22:15:39.792+0000 			"lastHeartbeatMessage" : "could not find member to sync from",
[js_test:chaining_removal] 2015-10-08T22:15:39.793+0000 			"electionTime" : Timestamp(1444342505, 1),
[js_test:chaining_removal] 2015-10-08T22:15:39.793+0000 			"electionDate" : ISODate("2015-10-08T22:15:05Z"),
[js_test:chaining_removal] 2015-10-08T22:15:39.793+0000 			"configVersion" : 1
[js_test:chaining_removal] 2015-10-08T22:15:39.793+0000 		},
[js_test:chaining_removal] 2015-10-08T22:15:39.793+0000 		{
[js_test:chaining_removal] 2015-10-08T22:15:39.793+0000 			"_id" : 1,
[js_test:chaining_removal] 2015-10-08T22:15:39.793+0000 			"name" : "ip-10-41-142-30:20011",
[js_test:chaining_removal] 2015-10-08T22:15:39.793+0000 			"health" : 1,
[js_test:chaining_removal] 2015-10-08T22:15:39.793+0000 			"state" : 2,
[js_test:chaining_removal] 2015-10-08T22:15:39.793+0000 			"stateStr" : "SECONDARY",
[js_test:chaining_removal] 2015-10-08T22:15:39.793+0000 			"uptime" : 34,
[js_test:chaining_removal] 2015-10-08T22:15:39.794+0000 			"optime" : Timestamp(1444342506, 1),
[js_test:chaining_removal] 2015-10-08T22:15:39.794+0000 			"optimeDate" : ISODate("2015-10-08T22:15:06Z"),
[js_test:chaining_removal] 2015-10-08T22:15:39.794+0000 			"lastHeartbeat" : ISODate("2015-10-08T22:15:38.591Z"),
[js_test:chaining_removal] 2015-10-08T22:15:39.794+0000 			"lastHeartbeatRecv" : ISODate("2015-10-08T22:15:38.558Z"),
[js_test:chaining_removal] 2015-10-08T22:15:39.794+0000 			"pingMs" : NumberLong(0),
[js_test:chaining_removal] 2015-10-08T22:15:39.794+0000 			"lastHeartbeatMessage" : "syncing from: ip-10-41-142-30:20010 by request",
[js_test:chaining_removal] 2015-10-08T22:15:39.794+0000 			"syncingTo" : "ip-10-41-142-30:20010",
[js_test:chaining_removal] 2015-10-08T22:15:39.794+0000 			"configVersion" : 1
[js_test:chaining_removal] 2015-10-08T22:15:39.795+0000 		},
[js_test:chaining_removal] 2015-10-08T22:15:39.795+0000 		{
[js_test:chaining_removal] 2015-10-08T22:15:39.795+0000 			"_id" : 2,
[js_test:chaining_removal] 2015-10-08T22:15:39.795+0000 			"name" : "ip-10-41-142-30:20012",
[js_test:chaining_removal] 2015-10-08T22:15:39.795+0000 			"health" : 1,
[js_test:chaining_removal] 2015-10-08T22:15:39.795+0000 			"state" : 2,
[js_test:chaining_removal] 2015-10-08T22:15:39.795+0000 			"stateStr" : "SECONDARY",
[js_test:chaining_removal] 2015-10-08T22:15:39.795+0000 			"uptime" : 34,
[js_test:chaining_removal] 2015-10-08T22:15:39.795+0000 			"optime" : Timestamp(1444342506, 1),
[js_test:chaining_removal] 2015-10-08T22:15:39.795+0000 			"optimeDate" : ISODate("2015-10-08T22:15:06Z"),
[js_test:chaining_removal] 2015-10-08T22:15:39.796+0000 			"lastHeartbeat" : ISODate("2015-10-08T22:15:38.591Z"),
[js_test:chaining_removal] 2015-10-08T22:15:39.796+0000 			"lastHeartbeatRecv" : ISODate("2015-10-08T22:15:39.606Z"),
[js_test:chaining_removal] 2015-10-08T22:15:39.796+0000 			"pingMs" : NumberLong(0),
[js_test:chaining_removal] 2015-10-08T22:15:39.796+0000 			"lastHeartbeatMessage" : "could not find member to sync from",
[js_test:chaining_removal] 2015-10-08T22:15:39.796+0000 			"configVersion" : 1
[js_test:chaining_removal] 2015-10-08T22:15:39.796+0000 		},
[js_test:chaining_removal] 2015-10-08T22:15:39.796+0000 		{
[js_test:chaining_removal] 2015-10-08T22:15:39.796+0000 			"_id" : 3,
[js_test:chaining_removal] 2015-10-08T22:15:39.796+0000 			"name" : "ip-10-41-142-30:20013",
[js_test:chaining_removal] 2015-10-08T22:15:39.796+0000 			"health" : 1,
[js_test:chaining_removal] 2015-10-08T22:15:39.796+0000 			"state" : 2,
[js_test:chaining_removal] 2015-10-08T22:15:39.797+0000 			"stateStr" : "SECONDARY",
[js_test:chaining_removal] 2015-10-08T22:15:39.797+0000 			"uptime" : 34,
[js_test:chaining_removal] 2015-10-08T22:15:39.797+0000 			"optime" : Timestamp(1444342506, 1),
[js_test:chaining_removal] 2015-10-08T22:15:39.797+0000 			"optimeDate" : ISODate("2015-10-08T22:15:06Z"),
[js_test:chaining_removal] 2015-10-08T22:15:39.797+0000 			"lastHeartbeat" : ISODate("2015-10-08T22:15:38.591Z"),
[js_test:chaining_removal] 2015-10-08T22:15:39.797+0000 			"lastHeartbeatRecv" : ISODate("2015-10-08T22:15:38.530Z"),
[js_test:chaining_removal] 2015-10-08T22:15:39.797+0000 			"pingMs" : NumberLong(0),
[js_test:chaining_removal] 2015-10-08T22:15:39.797+0000 			"lastHeartbeatMessage" : "syncing from: ip-10-41-142-30:20010",
[js_test:chaining_removal] 2015-10-08T22:15:39.798+0000 			"syncingTo" : "ip-10-41-142-30:20010",
[js_test:chaining_removal] 2015-10-08T22:15:39.798+0000 			"configVersion" : 1
[js_test:chaining_removal] 2015-10-08T22:15:39.798+0000 		},
[js_test:chaining_removal] 2015-10-08T22:15:39.798+0000 		{
[js_test:chaining_removal] 2015-10-08T22:15:39.798+0000 			"_id" : 4,
[js_test:chaining_removal] 2015-10-08T22:15:39.798+0000 			"name" : "ip-10-41-142-30:20014",
[js_test:chaining_removal] 2015-10-08T22:15:39.798+0000 			"health" : 1,
[js_test:chaining_removal] 2015-10-08T22:15:39.798+0000 			"state" : 2,
[js_test:chaining_removal] 2015-10-08T22:15:39.798+0000 			"stateStr" : "SECONDARY",
[js_test:chaining_removal] 2015-10-08T22:15:39.798+0000 			"uptime" : 36,
[js_test:chaining_removal] 2015-10-08T22:15:39.798+0000 			"optime" : Timestamp(1444342506, 1),
[js_test:chaining_removal] 2015-10-08T22:15:39.799+0000 			"optimeDate" : ISODate("2015-10-08T22:15:06Z"),
[js_test:chaining_removal] 2015-10-08T22:15:39.799+0000 			"infoMessage" : "could not find member to sync from",
[js_test:chaining_removal] 2015-10-08T22:15:39.799+0000 			"configVersion" : 1,
[js_test:chaining_removal] 2015-10-08T22:15:39.799+0000 			"self" : true
[js_test:chaining_removal] 2015-10-08T22:15:39.799+0000 		}
[js_test:chaining_removal] 2015-10-08T22:15:39.799+0000 	],
[js_test:chaining_removal] 2015-10-08T22:15:39.799+0000 	"ok" : 1
[js_test:chaining_removal] 2015-10-08T22:15:39.799+0000 } :
[js_test:chaining_removal] 2015-10-08T22:15:39.799+0000 doassert@src/mongo/shell/assert.js:15:14
[js_test:chaining_removal] 2015-10-08T22:15:39.799+0000 assert.soon@src/mongo/shell/assert.js:194:13
[js_test:chaining_removal] 2015-10-08T22:15:39.800+0000 @jstests/replsets/chaining_removal.js:35:1
[js_test:chaining_removal] 2015-10-08T22:15:39.800+0000 @jstests/replsets/chaining_removal.js:3:2
[js_test:chaining_removal] 2015-10-08T22:15:39.800+0000 
[js_test:chaining_removal] 2015-10-08T22:15:39.800+0000 failed to load: jstests/replsets/chaining_removal.js

Additional failure:
https://evergreen.mongodb.com/task/mongodb_mongo_master_linux_64_replicasets_legacy_f6f222926658eb9e79463e2bf0ee94beb15813c9_15_10_09_13_39_07

Comment by Githook User [ 08/Oct/15 ]

Author:

{u'username': u'dannenberg', u'name': u'matt dannenberg', u'email': u'matt.dannenberg@10gen.com'}

Message: SERVER-20822 make replsets/chaining_removal.js more robust by depending on fewer assumptions
Branch: master
https://github.com/mongodb/mongo/commit/d5453820238f41948937c6524f2eebf80b827021

Comment by Matt Dannenberg [ 08/Oct/15 ]

Don't bother with committing your changes. I have a larger rewrite that will give the test your better debugging and likely skirt the problem. CR soon.

Comment by Eric Milkie [ 08/Oct/15 ]

Another secret feature of assert!
I like your changes; please commit as they do look valuable in discovering why this test sometimes fails.

Comment by Max Hirschhorn [ 08/Oct/15 ]

Happened again:
https://evergreen.mongodb.com/task/mongodb_mongo_master_linux_64_debug_replicasets_legacy_6427313d4abf3831a41c8417054054e3a394e9c8_15_10_08_01_29_14

matt.dannenberg, milkie, should we commit my assertion message change so that we can gather more information when this test fails intermittently?

diff --git a/jstests/replsets/chaining_removal.js b/jstests/replsets/chaining_removal.js
index 76f8a6b..5bea301 100644
--- a/jstests/replsets/chaining_removal.js
+++ b/jstests/replsets/chaining_removal.js
@@ -23,13 +23,23 @@
     replTest.awaitReplication();
     assert.commandWorked(nodes[4].getDB("admin").runCommand({"replSetSyncFrom": nodes[1].host}));
     assert.commandWorked(nodes[1].getDB("admin").runCommand({"replSetSyncFrom": nodes[0].host}));
-    var res;
-    var res2;
-    assert.soon(function() {
-        res = nodes[4].getDB("admin").runCommand({"replSetGetStatus": 1});
-        res2 = nodes[1].getDB("admin").runCommand({"replSetGetStatus": 1});
-        return res.syncingTo === nodes[1].host && res2.syncingTo === nodes[0].host;
-    }, "node 4 failed to start chaining: "+ tojson(res));
+
+    var replSetStatus4;
+    var replSetStatus1;
+    assert.soon(
+        function checkReplSetStatus() {
+            replSetStatus4 = nodes[4].getDB("admin").runCommand({replSetGetStatus: 1});
+            replSetStatus1 = nodes[1].getDB("admin").runCommand({replSetGetStatus: 1});
+
+            var node1ChainingFromNode0 = replSetStatus1.syncingTo === nodes[0].host;
+            var node4ChainingFromNode1 = replSetStatus4.syncingTo === nodes[1].host;
+            return node4ChainingFromNode1 && node1ChainingFromNode0;
+        },
+        function msg() {
+            return "node 4 failed to start chaining from node 1: " + tojson(replSetStatus4) +
+                   "\nor node 1 failed to start chaining from node 0: " + tojson(replSetStatus1);
+        }
+    );
 
     // write that should reach all nodes
     var timeout = 15 * 1000;

Comment by Max Hirschhorn [ 08/Oct/15 ]

var res;
var res2;
assert.soon(function() {
    res = nodes[4].getDB("admin").runCommand({"replSetGetStatus": 1});
    res2 = nodes[1].getDB("admin").runCommand({"replSetGetStatus": 1});
    return res.syncingTo === nodes[1].host && res2.syncingTo === nodes[0].host;
}, "node 4 failed to start chaining: "+ tojson(res));

won't actually cause assert.soon() to print out the replica set status if the assertion fails because the string concatenation will be evaluated prior the function being executed (at which point res is undefined). The doassert() function accepts a function as a msg parameter that gets resolved to the assertion message it should use.

Running the test in a patch build using --repeat=20 with changes to make it so that the replica set status is logged if the assertion fails.

Generated at Thu Feb 08 03:55:23 UTC 2024 using Jira 9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66.