diff --git a/jstests/sharding/delete_during_migrate.js b/jstests/sharding/delete_during_migrate.js
index 3d4d5432c0c..59b6139c36f 100644
--- a/jstests/sharding/delete_during_migrate.js
+++ b/jstests/sharding/delete_during_migrate.js
@@ -41,8 +41,12 @@ var join = startParallelShell("db." + coll + ".remove({});", st.s0.port);
 
 // migrate while deletions are happening
 try {
-    assert.commandWorked(st.s0.adminCommand(
-        {moveChunk: ns, find: {a: 1}, to: st.getOther(st.getPrimaryShard(dbname)).name}));
+    assert.commandWorked(st.s0.adminCommand({
+        moveChunk: ns,
+        find: {a: 1},
+        to: st.getOther(st.getPrimaryShard(dbname)).name,
+        _waitForDelete: true
+    }));
 } catch (e) {
     const expectedFailureMessage = "startCommit timed out waiting for the catch up completion.";
     if (!slowTestVariant || !e.message.match(expectedFailureMessage)) {
diff --git a/jstests/sharding/demo.js b/jstests/sharding/demo.js
new file mode 100644
index 00000000000..dbbb60a9c6c
--- /dev/null
+++ b/jstests/sharding/demo.js
@@ -0,0 +1,124 @@
+// utility variables
+const dbname = "demodb";
+const ns = "demodb.test";
+const admin = db.getSiblingDB("admin");
+
+// operations
+
+function printInfo(st, extramsg) {
+    const output = {
+        "extramsg": extramsg,
+        "nchunks_rs1": "number of chunks rs1 " + st.s.getDB("config").chunks.find({shard: "demo-rs1"}).count(),
+        "nchunks_rs0": "number of chunks rs0 " + st.s.getDB("config").chunks.find({shard: "demo-rs0"}).count(),
+        "config.collections": st.s.getDB("config").collections.find({}).toArray(),
+        "config.chunks": st.s.getDB("config").chunks.find({}).toArray(),
+        "config.shards": st.s.getDB("config").shards.find().toArray(),
+        "demodb.size": "st.s.getDB(demodb).test.find({}).count():" + st.s.getDB("demodb").test.find({}).count()
+    };
+
+    print("printInfo " + JSON.stringify(output, null, 2));
+}
+
+const setFailPoint = (mode) => {
+    FixtureHelpers.runCommandOnEachPrimary(
+        {db: db.getSiblingDB("admin"), cmdObj: {configureFailPoint: 'blockShardRegistryUpdateRemoveShard', mode}});
+};
+
+// main
+(function() {
+    "use strict";
+
+    load('jstests/libs/fail_point_util.js');
+    load("jstests/libs/fixture_helpers.js");  // For FixtureHelpers.
+
+    var st = new ShardingTest({shards: 2, rs: {nodes: 1}, config: 1, enableBalancer: true});
+
+    printInfo(st, "after createCluster");
+
+    let blockShardRegistryUpdateRemoveShard =
+        configureFailPoint(st.configRS.getPrimary(), "blockShardRegistryUpdateRemoveShard");
+
+    // start removing the shard
+    let reply = st.s.adminCommand({removeShard: "demo-rs0"});
+    print("after first removeShard attempt: " + tojson(reply));
+    printInfo(st, "after removeShard");
+
+    let join = startParallelShell(function() {
+        load("jstests/libs/fixture_helpers.js");  // For FixtureHelpers.
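+
+        // Keep retrying removeShard until the draining of demo-rs0 completes; each
+        // attempt also dumps the current chunk distribution so the progress of the
+        // drain is visible in the test log.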
+        assert.soon(() => {
+            sleep(1000);
+            var reply = db.adminCommand({removeShard: "demo-rs0"});
+            assert.commandWorked(reply);
+            print("tried removeShard: " + tojson(reply));
+
+            const output = {
+                "extramsg": "in assert.soon removeShard",
+                "nchunks_rs1": "number of chunks rs1 " + db.getSiblingDB("config").chunks.find({shard: "demo-rs1"}).count(),
+                "nchunks_rs0": "number of chunks rs0 " + db.getSiblingDB("config").chunks.find({shard: "demo-rs0"}).count(),
+                "config.chunks": db.getSiblingDB("config").chunks.find({}).toArray(),
+                "config.shards": db.getSiblingDB("config").shards.find().toArray(),
+                "demodb.size": "demodb.test.stats().size:" + db.getSiblingDB("demodb").test.stats().size
+            };
+
+            print("printInfo " + JSON.stringify(output, null, 2));
+
+            // return db.getSiblingDB("config").chunks.find({shard: "demo-rs0"}).count() == 0;  // this check is the same as the one in the sharding catalog
+            return reply.state == "completed";  // this check is the same as the one in the sharding catalog
+        }, "removeShard is taking too long", 60000);
+
+        print("removeShard completed");
+    }, st.s.port);
+
+    sleep(1000);
+
+    // sync - before setting the fail point to off we want to be sure removeShard is
+    // hanging; we know this happens once the shard being drained has no chunks left
+
+    print("before blockShardRegistryUpdateRemoveShard.wait()");
+    blockShardRegistryUpdateRemoveShard.wait();
+    print("after blockShardRegistryUpdateRemoveShard.wait()");
+
+    assert.commandWorked(st.s.adminCommand({enableSharding: dbname}));
+    assert.commandWorked(
+        st.s.adminCommand({shardCollection: ns, key: {_id: 'hashed'}, numInitialChunks: 2}));
+
+    printInfo(st, "after shardCollection");
+
+    var bigString0 = "0".repeat(5 * 1024 * 1024);  // 5 MB
+    var bigString1 = "1".repeat(5 * 1024 * 1024);  // 5 MB
+    var bigString2 = "2".repeat(5 * 1024 * 1024);  // 5 MB
+    var bigString3 = "3".repeat(5 * 1024 * 1024);  // 5 MB
+
+    var bulk = st.s.getDB("demodb").test.initializeUnorderedBulkOp();
+    let num = 0;
+
+    print("inserting 0");
+    bulk.insert({_id: "zeros", s: bigString0});
+    print("inserting 1");
+    bulk.insert({_id: "unos", s: bigString1});
+    print("inserting 2");
+    bulk.insert({_id: "doces", s: bigString2});
+    print("inserting 3");
+    bulk.insert({_id: "treces", s: bigString3});
+
+    assert.commandWorked(bulk.execute());
+
+    printInfo(st, "after insert data");
+
+    blockShardRegistryUpdateRemoveShard.off();
+
+    join();
+
+    printInfo(st, "after insert data & remove");
+
+    st.stop();
+})();
\ No newline at end of file
diff --git a/src/mongo/db/s/config/sharding_catalog_manager_shard_operations.cpp b/src/mongo/db/s/config/sharding_catalog_manager_shard_operations.cpp
index 947ec9fb3c2..b9410f9a0ed 100644
--- a/src/mongo/db/s/config/sharding_catalog_manager_shard_operations.cpp
+++ b/src/mongo/db/s/config/sharding_catalog_manager_shard_operations.cpp
@@ -95,6 +95,7 @@
 
 #define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kSharding
 
+MONGO_FAIL_POINT_DEFINE(blockShardRegistryUpdateRemoveShard);
 namespace mongo {
 namespace {
 
@@ -920,6 +921,9 @@ RemoveShardProgress ShardingCatalogManager::removeShard(OperationContext* opCtx,
     LOGV2(
         21949, "Going to remove shard: {shardId}", "Going to remove shard", "shardId"_attr = name);
 
+    logd("About to hang due to blockShardRegistryUpdateRemoveShard fail point");
+    blockShardRegistryUpdateRemoveShard.pauseWhileSet(opCtx);
+
     // Synchronize the control shard selection, the shard's document removal, and the topology time
     // update to exclude potential race conditions in case of concurrent add/remove shard
     // operations.
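
Usage note: the sketch below (not part of the diff) shows how the new blockShardRegistryUpdateRemoveShard fail point could be toggled by hand from a mongo shell, independently of the configureFailPoint()/wait()/off() helpers used in demo.js. It assumes the config server primary is reachable at the placeholder address localhost:20001 and was started with enableTestCommands=1, which is required for the configureFailPoint admin command.

    // Connect to the config server primary (placeholder address).
    const configPrimary = new Mongo("localhost:20001");
    const adminDB = configPrimary.getDB("admin");

    // Make removeShard pause where the patch calls pauseWhileSet(), i.e. right
    // before it removes the shard's document from the sharding catalog.
    assert.commandWorked(adminDB.runCommand(
        {configureFailPoint: "blockShardRegistryUpdateRemoveShard", mode: "alwaysOn"}));

    // ... issue removeShard against a mongos; once the shard has drained, the
    // final removeShard call hangs at the fail point ...

    // Release the paused removeShard.
    assert.commandWorked(adminDB.runCommand(
        {configureFailPoint: "blockShardRegistryUpdateRemoveShard", mode: "off"}));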