Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-58036

TTL monitor can delete documents during chunk migration's and resharding's critical sections

    • Type: Bug
    • Resolution: Fixed
    • Priority: Major - P3
    • 5.3.0
    • Affects Version/s: None
    • Component/s: Sharding
    • Fully Compatible
    • ALL
    • Hide

      Apply the following patch and run the following resmoke.py invocation. The test case runs resharding because ReshardingTest#_checkConsistency() conveniently highlights the documents being un-deleted. Chunk migration would be similarly affected, though.

      python buildscripts/resmoke.py run --suite=sharding jstests/sharding/resharding_critical_section_ttl_monitor.js
      
      Unable to find source-code formatter for language: diff. Available languages are: actionscript, ada, applescript, bash, c, c#, c++, cpp, css, erlang, go, groovy, haskell, html, java, javascript, js, json, lua, none, nyan, objc, perl, php, python, r, rainbow, ruby, scala, sh, sql, swift, visualbasic, xml, yaml
      diff --git a/jstests/sharding/libs/resharding_test_fixture.js b/jstests/sharding/libs/resharding_test_fixture.js
      index 5bfde1c09b..2b2304cadf 100644
      --- a/jstests/sharding/libs/resharding_test_fixture.js
      +++ b/jstests/sharding/libs/resharding_test_fixture.js
      @@ -90,7 +90,10 @@ var ReshardingTest = class {
           setup() {
               const mongosOptions = {setParameter: {}};
               const configOptions = {setParameter: {}};
      -        const rsOptions = {setParameter: {}};
      +        // XXX: Make this configurable by an argument to the ReshardingTest constructor or make
      +        // resharding_critical_section_ttl_monitor.js not depend on the TTL monitor being disabled
      +        // at startup.
      +        const rsOptions = {setParameter: {ttlMonitorEnabled: false, ttlMonitorSleepSecs: 1}};
      
               if (this._minimumOperationDurationMS !== undefined) {
                   configOptions.setParameter.reshardingMinimumOperationDurationMillis =
      diff --git a/jstests/sharding/resharding_critical_section_ttl_monitor.js b/jstests/sharding/resharding_critical_section_ttl_monitor.js
      new file mode 100644
      index 0000000000..5eb91cbc9a
      --- /dev/null
      +++ b/jstests/sharding/resharding_critical_section_ttl_monitor.js
      @@ -0,0 +1,49 @@
      +(function() {
      +"use strict";
      +
      +load("jstests/libs/discover_topology.js");
      +load("jstests/sharding/libs/resharding_test_fixture.js");
      +
      +const reshardingTest = new ReshardingTest();
      +
      +reshardingTest.setup();
      +
      +const donorShardNames = reshardingTest.donorShardNames;
      +const sourceCollection = reshardingTest.createShardedCollection({
      +    ns: "reshardingDb.coll",
      +    shardKeyPattern: {oldKey: 1},
      +    chunks: [{min: {oldKey: MinKey}, max: {oldKey: MaxKey}, shard: donorShardNames[0]}],
      +});
      +
      +const numDocs = 5;
      +assert.commandWorked(sourceCollection.insert(
      +    Array.from({length: numDocs}, (_, i) => ({oldKey: i, newKey: i, time: new Date(0)}))));
      +
      +assert.commandWorked(sourceCollection.createIndex({time: 1}, {expireAfterSeconds: 0}));
      +
      +const mongos = sourceCollection.getMongo();
      +const topology = DiscoverTopology.findConnectedNodes(mongos);
      +const donor0 = new Mongo(topology.shards[donorShardNames[0]].primary);
      +
      +const recipientShardNames = reshardingTest.recipientShardNames;
      +reshardingTest.withReshardingInBackground(  //
      +    {
      +        newShardKeyPattern: {newKey: 1},
      +        newChunks: [{min: {newKey: MinKey}, max: {newKey: MaxKey}, shard: recipientShardNames[0]}],
      +    },
      +    () => {},
      +    {
      +        postCheckConsistencyFn: () => {
      +            assert.eq(sourceCollection.find().itcount(), numDocs);
      +            assert.commandWorked(donor0.adminCommand({setParameter: 1, ttlMonitorEnabled: true}));
      +            assert.soon(() => {
      +                const serverStatus = assert.commandWorked(donor0.adminCommand({serverStatus: 1}));
      +                return serverStatus.metrics.ttl.passes >= 1;
      +            }, "timed out waiting for the TTL monitor to run");
      +
      +            reshardingTest._checkConsistency();
      +        }
      +    });
      +
      +reshardingTest.teardown();
      +})();
      
      Show
      Apply the following patch and run the following resmoke.py invocation. The test case runs resharding because ReshardingTest#_checkConsistency() conveniently highlights the documents being un-deleted. Chunk migration would be similarly affected though. python buildscripts/resmoke.py run --suite=sharding jstests/sharding/resharding_critical_section_ttl_monitor.js Unable to find source-code formatter for language: diff. Available languages are: actionscript, ada, applescript, bash, c, c#, c++, cpp, css, erlang, go, groovy, haskell, html, java, javascript, js, json, lua, none, nyan, objc, perl, php, python, r, rainbow, ruby, scala, sh, sql, swift, visualbasic, xml, yaml diff --git a/jstests/sharding/libs/resharding_test_fixture.js b/jstests/sharding/libs/resharding_test_fixture.js index 5bfde1c09b..2b2304cadf 100644 --- a/jstests/sharding/libs/resharding_test_fixture.js +++ b/jstests/sharding/libs/resharding_test_fixture.js @@ -90,7 +90,10 @@ var ReshardingTest = class { setup() { const mongosOptions = {setParameter: {}}; const configOptions = {setParameter: {}}; - const rsOptions = {setParameter: {}}; + // XXX: Make this configurable by an argument to the ReshardingTest constructor or make + // resharding_critical_section_ttl_monitor.js not depend on the TTL monitor being disabled + // at startup. 
+ const rsOptions = {setParameter: {ttlMonitorEnabled: false , ttlMonitorSleepSecs: 1}}; if ( this ._minimumOperationDurationMS !== undefined) { configOptions.setParameter.reshardingMinimumOperationDurationMillis = diff --git a/jstests/sharding/resharding_critical_section_ttl_monitor.js b/jstests/sharding/resharding_critical_section_ttl_monitor.js new file mode 100644 index 0000000000..5eb91cbc9a --- /dev/ null +++ b/jstests/sharding/resharding_critical_section_ttl_monitor.js @@ -0,0 +1,49 @@ +(function() { + "use strict" ; + +load( "jstests/libs/discover_topology.js" ); +load( "jstests/sharding/libs/resharding_test_fixture.js" ); + + const reshardingTest = new ReshardingTest(); + +reshardingTest.setup(); + + const donorShardNames = reshardingTest.donorShardNames; + const sourceCollection = reshardingTest.createShardedCollection({ + ns: "reshardingDb.coll" , + shardKeyPattern: {oldKey: 1}, + chunks: [{min: {oldKey: MinKey}, max: {oldKey: MaxKey}, shard: donorShardNames[0]}], +}); + + const numDocs = 5; + assert .commandWorked(sourceCollection.insert( + Array.from({length: numDocs}, (_, i) => ({oldKey: i, newKey: i, time: new Date(0)})))); + + assert .commandWorked(sourceCollection.createIndex({time: 1}, {expireAfterSeconds: 0})); + + const mongos = sourceCollection.getMongo(); + const topology = DiscoverTopology.findConnectedNodes(mongos); + const donor0 = new Mongo(topology.shards[donorShardNames[0]].primary); + + const recipientShardNames = reshardingTest.recipientShardNames; +reshardingTest.withReshardingInBackground( // + { + newShardKeyPattern: {newKey: 1}, + newChunks: [{min: {newKey: MinKey}, max: {newKey: MaxKey}, shard: recipientShardNames[0]}], + }, + () => {}, + { + postCheckConsistencyFn: () => { + assert .eq(sourceCollection.find().itcount(), numDocs); + assert .commandWorked(donor0.adminCommand({setParameter: 1, ttlMonitorEnabled: true })); + assert .soon(() => { + const serverStatus = assert .commandWorked(donor0.adminCommand({serverStatus: 1})); + 
return serverStatus.metrics.ttl.passes >= 1; + }, "timed out waiting for the TTL monitor to run" ); + + reshardingTest._checkConsistency(); + } + }); + +reshardingTest.teardown(); +})();
    • Sharding EMEA 2021-11-29, Sharding EMEA 2021-12-13, Sharding EMEA 2022-01-24, Sharding EMEA 2022-02-07

      The TTL monitor on the primary of the donor shard can continue to delete documents even while the critical section is held to block writes. These deletes may never be transferred to the primary of the recipient shard for it to apply as well. It is, however, likely that the primary of the recipient shard will eventually decide to delete these documents on its own (unless an intervening update changes the document so that it no longer meets the TTL deletion criteria). The current behavior is at least a violation of the monotonic reads property when using causal consistency, because the documents would go from being present (on the donor) to being deleted (on the donor) to being present again (on the recipient).

            Assignee:
            sergi.mateo-bellido@mongodb.com Sergi Mateo Bellido
            Reporter:
            max.hirschhorn@mongodb.com Max Hirschhorn
            Votes:
            0 Vote for this issue
            Watchers:
            7 Start watching this issue

              Created:
              Updated:
              Resolved: