[SERVER-7360] timing error in replsets_priority1.js Created: 15/Oct/12  Updated: 17/Oct/12  Resolved: 17/Oct/12

Status: Closed
Project: Core Server
Component/s: Replication
Affects Version/s: 2.2.0
Fix Version/s: None

Type: Bug Priority: Blocker - P1
Reporter: Ian Whalen (Inactive) Assignee: Randolph Tan
Resolution: Done Votes: 0
Labels: buildbot
Remaining Estimate: Not Specified
Time Spent: Not Specified
Original Estimate: Not Specified

Attachments: Text File replsets_priority1.txt    
Operating System: ALL
Participants:

 Description   

http://buildlogs.mongodb.org/V2.2%20OS%20X%2010.5%2064-bit/builds/63/test/slow%20nightly/replsets_priority1.js

Mon Oct 15 07:28:35 uncaught exception: assert.soon failed: function () {
    var ok = true;
    try {
        status = master.adminCommand({replSetGetStatus:1});
    } catch (e) {
        print(e);
        print("nreplsets_priority1.js checkPrimaryIs reconnecting");
        reconnect(master);
        status = master.adminCommand({replSetGetStatus:1});
    }
    var str = "goal: " + node.host + "==1 states: ";
    if (!status || !status.members) {
        return false;
    }
    status.members.forEach(function (m) {str += m.name + ": " + m.state + " ";if (m.name == node.host) {ok &= m.state == 1;} else {ok &= m.state != 1 || m.state == 1 && m.health == 0;}});
    print();
    print(str);
    print();
    occasionally(function () {print("\nstatus:");printjson(status);print();}, 15);
    return ok;
}, msg:bs-osx-106-x86-64-1.local:31001==1



 Comments   
Comment by Ian Whalen (Inactive) [ 17/Oct/12 ]

passed since. closing.

Comment by Randolph Tan [ 16/Oct/12 ]

Based from the logs, it looked like the test timed out waiting for 31001 to become primary just a few moments before the election concluded and 31001 being selected as primary:

m31000| Mon Oct 15 07:27:51 [conn22] replSet voting no for bs-osx-106-x86-64-1.local:31001 already voted for another
assert.soon failed: function () {
    var ok = true;
    try {
        status = master.adminCommand({replSetGetStatus:1});
    } catch (e) {
        print(e);
        print("nreplsets_priority1.js checkPrimaryIs reconnecting");
        reconnect(master);
        status = master.adminCommand({replSetGetStatus:1});
    }
    var str = "goal: " + node.host + "==1 states: ";
    if (!status || !status.members) {
        return false;
    }
    status.members.forEach(function (m) {str += m.name + ": " + m.state + " ";if (m.name == node.host) {ok &= m.state == 1;} else {ok &= m.state != 1 || m.state == 1 && m.health == 0;}});
    print();
    print(str);
    print();
    occasionally(function () {print("\nstatus:");printjson(status);print();}, 15);
    return ok;
}, msg:bs-osx-106-x86-64-1.local:31001==1
 m31000| Mon Oct 15 07:27:51 [conn21] end connection 10.4.1.21:58226 (2 connections now open)
 m31000| Mon Oct 15 07:27:51 [initandlisten] connection accepted from 10.4.1.21:58230 #23 (3 connections now open)
 m31001| Mon Oct 15 07:27:51 [rsMgr] replSet couldn't elect self, only received 1 votes
 m31001| Mon Oct 15 07:27:57 [rsMgr] replSet info electSelf 1
 m31001| Mon Oct 15 07:27:57 [rsMgr] replSet couldn't elect self, only received 1 votes
 m31002| Mon Oct 15 07:28:10 [rsBackgroundSync] replSet syncing to: bs-osx-106-x86-64-1.local:31001
 m31002| Mon Oct 15 07:28:29 [conn32] end connection 10.4.1.21:58232 (3 connections now open)
 m31002| Mon Oct 15 07:28:29 [initandlisten] connection accepted from 10.4.1.21:58248 #34 (4 connections now open)
 m31002| Mon Oct 15 07:28:35 [conn33] end connection 10.4.1.21:58233 (3 connections now open)
 m31001| Mon Oct 15 07:28:03 [rsMgr] replSet info electSelf 1
 m31001| Mon Oct 15 07:28:03 [rsMgr] replSet couldn't elect self, only received 1 votes
 m31002| Mon Oct 15 07:28:35 [initandlisten] connection accepted from 10.4.1.21:58251 #35 (4 connections now open)
Error("Printing Stack Trace")@:0
("assert.soon failed: function () {\n    var ok = true;\n    try {\n        status = master.adminCommand({replSetGetStatus:1});\n    } catch (e) {\n        print(e);\n        print(\"nreplsets_priority1.js checkPrimaryIs reconnecting\");\n        reconnect(master);\n        status = master.adminCommand({replSetGetStatus:1});\n    }\n    var str = \"goal: \" + node.host + \"==1 states: \";\n    if (!status || !status.members) {\n        return false;\n    }\n    status.members.forEach(function (m) {str += m.name + \": \" + m.state + \" \";if (m.name == node.host) {ok &= m.state == 1;} else {ok &= m.state != 1 || m.state == 1 && m.health == 0;}});\n    print();\n    print(str);\n    print();\n    occasionally(function () {print(\"\\nstatus:\");printjson(status);print();}, 15);\n    return ok;\n}, msg:bs-osx-106-x86-64-1.local:31001==1")@src/mongo/shell/utils.js:58
((function () {var ok = true;try {status = master.adminCommand({replSetGetStatus:1});} catch (e) {print(e);print("nreplsets_priority1.js checkPrimaryIs reconnecting");reconnect(master);status = master.adminCommand({replSetGetStatus:1});}var str = "goal: " + node.host + "==1 states: ";if (!status || !status.members) {return false;}status.members.forEach(function (m) {str += m.name + ": " + m.state + " ";if (m.name == node.host) {ok &= m.state == 1;} else {ok &= m.state != 1 || m.state == 1 && m.health == 0;}});print();print(str);print();occasionally(function () {print("\nstatus:");printjson(status);print();}, 15);return ok;}),"bs-osx-106-x86-64-1.local:31001==1",60000,1000)@src/mongo/shell/utils.js:167
([object bson_object])@/data/buildslaves/OS_X_105_64bit_V2.2/mongo/jstests/slowNightly/replsets_priority1.js:78
@/data/buildslaves/OS_X_105_64bit_V2.2/mongo/jstests/slowNightly/replsets_priority1.js:193
 m31001| Mon Oct 15 07:28:05 [conn7] end connection 10.4.1.21:58225 (3 connections now open)
 m31001| Mon Oct 15 07:28:05 [initandlisten] connection accepted from 10.4.1.21:58235 #9 (4 connections now open)
 m31000| Mon Oct 15 07:27:55 [rsMgr] not electing self, bs-osx-106-x86-64-1.local:31002 would veto
 m31000| Mon Oct 15 07:27:57 [conn22] replSet voting no for bs-osx-106-x86-64-1.local:31001 already voted for another
 m31000| Mon Oct 15 07:28:01 [rsMgr] not electing self, bs-osx-106-x86-64-1.local:31002 would veto
 m31001| Mon Oct 15 07:28:09 [rsMgr] replSet info electSelf 1
 
 
 
 m31001| Mon Oct 15 07:28:09 [rsMgr] replSet PRIMARY
 
 
 
 
 m31001| Mon Oct 15 07:28:10 [initandlisten] connection accepted from 10.4.1.21:58238 #10 (5 connections now open)
Mon Oct 15 07:28:35 uncaught exception: assert.soon failed: function () {
    var ok = true;
    try {
        status = master.adminCommand({replSetGetStatus:1});
    } catch (e) {
        print(e);
        print("nreplsets_priority1.js checkPrimaryIs reconnecting");
        reconnect(master);
        status = master.adminCommand({replSetGetStatus:1});
    }
    var str = "goal: " + node.host + "==1 states: ";
    if (!status || !status.members) {
        return false;
    }
    status.members.forEach(function (m) {str += m.name + ": " + m.state + " ";if (m.name == node.host) {ok &= m.state == 1;} else {ok &= m.state != 1 || m.state == 1 && m.health == 0;}});
    print();
    print(str);
    print();
    occasionally(function () {print("\nstatus:");printjson(status);print();}, 15);
    return ok;
}, msg:bs-osx-106-x86-64-1.local:31001==1

But taking a closer look at the logs, the timestamp of each of the servers and the shell is not well synchronized and are abnormally far apart - it looks like 31001 thinks he's already primary way before the time when the test timed out.

Generated at Thu Feb 08 03:14:18 UTC 2024 using Jira 9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66.