Loading...

XML

Word

Printable

JSON

Type: Bug
Resolution: Done
Priority: Major - P3
Fix Version/s: None
Affects Version/s: None
Component/s: Shell
Labels:
- balancer

Assigned Teams:

Sharding
Operating System:
ALL
Steps To Reproduce:
Hide

jstest is attached. Basically:

Start sharded cluster

sh.stopBalancer()

db.getSiblingDB("config").locks.update({_id:"balancer"},{$set:{state:2}})

sh.stopBalancer(1000) takes 15 minutes to time out, not 1 second
Show
jstest is attached. Basically: Start sharded cluster sh.stopBalancer() db.getSiblingDB( "config" ).locks.update({_id: "balancer" },{$set:{state:2}}) sh.stopBalancer(1000) takes 15 minutes to timeout, not 1 second
CAR Domain/s:
None

Aha! Reference:
None
Tracking Level:
None
Risk Status:
None
Exec Notes:
None
Goal Name(s):
None
Goal Link:
None

sh.stopBalancer() accepts a timeout and interval, and passes them to sh.waitForBalancer(), which in turn passes them to sh.waitForBalancerOff():

/**
 * Sets the balancer state to false via sh.setBalancerState, then calls
 * sh.waitForBalancer with `false` and the caller's arguments.
 *
 * @param timeout  forwarded unchanged to sh.waitForBalancer
 * @param interval forwarded unchanged to sh.waitForBalancer
 */
sh.stopBalancer = function( timeout, interval ) {
    // The same flag is used both for the state change and for the wait.
    var balancerEnabled = false;

    sh.setBalancerState( balancerEnabled );
    sh.waitForBalancer( balancerEnabled, timeout, interval );
};

/**
 * Dispatches to one of two waiters depending on `onOrNot`.
 *
 * @param onOrNot  falsy: delegate to sh.waitForBalancerOff;
 *                 truthy: wait on the "balancer" lock via sh.waitForDLock
 * @param timeout  forwarded unchanged to whichever waiter is chosen
 * @param interval forwarded unchanged to whichever waiter is chosen
 */
sh.waitForBalancer = function( onOrNot, timeout, interval ){

    // Waiting for "off": we need to be sure balancing has actually stopped.
    if( ! onOrNot ){
        sh.waitForBalancerOff( timeout, interval );
        return;
    }

    // Waiting for "on" (or a state switch): only wait for the balancer lock
    // to change, since there is no guarantee of ever seeing it actually locked.
    sh.waitForDLock( "balancer", undefined, timeout, interval );
};

However, sh.waitForBalancerOff does not pass these values through to sh.waitForDLock, instead passing a hardcoded value of 15 minutes:

/**
 * Waits for the balancer to appear stopped, in this order:
 *   1. reads all documents from config.mongos and keeps those whose
 *      `waiting` field is falsy;
 *   2. hands them to sh.waitForPingChange with 60 * 1000;
 *   3. calls sh.waitForDLock( "balancer", false, 15 * 60 * 1000 );
 *   4. hands the remaining pings to sh.waitForPingChange with 5 * 1000;
 *   5. prints a warning for every ping entry still left.
 *
 * NOTE(review): `timeout` and `interval` are accepted but never read in this
 * body, so values supplied by callers have no effect here.
 *
 * @param timeout  not used by this function
 * @param interval not used by this function
 * @throws Error   built from whatever sh.waitForDLock threw, after printing
 *                 a manual-verification hint
 */
sh.waitForBalancerOff = function( timeout, interval ){
    
    // Collect the config.mongos entries that are not flagged as `waiting`
    var pings = db.getSisterDB( "config" ).mongos.find().toArray()
    var activePings = []
    for( var i = 0; i < pings.length; i++ ){
        if( ! pings[i].waiting ) activePings.push( pings[i] )
    }
    
    print( "Waiting for active hosts..." )
    
    // The result replaces activePings; presumably the entries whose ping did
    // not change -- TODO confirm against sh.waitForPingChange
    activePings = sh.waitForPingChange( activePings, 60 * 1000 )
    
    // After 1min, we assume that all hosts with unchanged pings are either 
    // offline (this is enough time for a full errored balance round, if a network
    // issue, which would reload settings) or balancing, which we wait for next
    // Legacy hosts we always have to wait for
    
    print( "Waiting for the balancer lock..." )
    
    // Wait for the balancer lock to become inactive
    // We can guess this is stale after 15 mins, but need to double-check manually
    // NOTE(review): the third argument is the literal 15 * 60 * 1000 and no
    // fourth argument is passed; this function's own `timeout` and `interval`
    // parameters are not forwarded to sh.waitForDLock.
    try{ 
        sh.waitForDLock( "balancer", false, 15 * 60 * 1000 )
    }
    catch( e ){
        print( "Balancer still may be active, you must manually verify this is not the case using the config.changelog collection." )
        // The caught value is wrapped in a new Error rather than rethrown as-is
        throw Error(e);
    }
        
    print( "Waiting again for active hosts after balancer is off..." )
    
    // Wait a short time afterwards, to catch the host which was balancing earlier
    activePings = sh.waitForPingChange( activePings, 5 * 1000 )
    
    // Warn about all the stale host pings remaining
    for( var i = 0; i < activePings.length; i++ ){
        print( "Warning : host " + activePings[i]._id + " seems to have been offline since " + activePings[i].ping )
    }
    
}

The 15-minute timeout should be a default that can be overridden, and the interval should be respected, i.e.:

Unable to find source-code formatter for language: diff. Available languages are: actionscript, ada, applescript, bash, c, c#, c++, cpp, css, erlang, go, groovy, haskell, html, java, javascript, js, json, lua, none, nyan, objc, perl, php, python, r, rainbow, ruby, scala, sh, sql, swift, visualbasic, xml, yaml

diff --git a/src/mongo/shell/utils_sh.js b/src/mongo/shell/utils_sh.js
index d9c05a3..f9215bb 100644
--- a/src/mongo/shell/utils_sh.js
+++ b/src/mongo/shell/utils_sh.js
@@ -225,7 +225,7 @@ sh.waitForBalancerOff = function( timeout, interval ){
     // Wait for the balancer lock to become inactive
     // We can guess this is stale after 15 mins, but need to double-check manually
     try{
-        sh.waitForDLock( "balancer", false, 15 * 60 * 1000 )
+        sh.waitForDLock( "balancer", false, timeout || 15 * 60 * 1000, interval )
     }
     catch( e ){
         print( "Balancer still may be active, you must manually verify this is not the case using the config.changelog collection." )

- - Sort By Name
  - Sort By Date
  - Ascending
  - Descending
  - Thumbnails
  - List
  - Download All

stopBalancerTimeout.js
0.3 kB
Mar 02 2015 05:52:31 AM UTC

Assignee:: [DO NOT USE] Backlog - Sharding Team
Reporter:: Kevin Pulo
Participants:: [DO NOT USE] Backlog - Sharding Team, Andy Schwerin, Kevin Pulo
Votes:: 0 Vote for this issue
Watchers:: 4 Start watching this issue

Created:: Mar 02 2015 05:52:31 AM UTC
Updated:: Dec 06 2022 04:55:21 AM UTC
Resolved:: Dec 12 2016 08:20:40 PM UTC

Details

Description

Attachments

Attachments

Activity

People

Dates