Core Server / SERVER-43870

High insert/delete workload with vectored insert results in unexpected flow control behavior/replication lag measurement

    • Type: Bug
    • Resolution: Done
    • Priority: Major - P3
    • Affects Version/s: 4.2.0
    • Component/s: None
    • Labels: None
      // resmoke --suite=concurrency_replication --repeat=20 jstests/concurrency/fsm_workloads/repro.js
      
      var $config = (function() {
          var data = {
              numDocs: 1000,
              docSize: 16 * 1000,
          };
      
          var getStringOfLength = (function() {
              var cache = {};
              return function getStringOfLength(size) {
                  if (!cache[size]) {
                      cache[size] = new Array(size + 1).join('x');
                  }
                  return cache[size];
              };
          })();
      
          function padDoc(doc, size) {
              // first set doc.padding so that Object.bsonsize will include the field name and other
              // overhead
              doc.padding = "";
              var paddingLength = size - Object.bsonsize(doc);
              assertAlways.lte(
                  0, paddingLength, 'document is already bigger than ' + size + ' bytes: ' + tojson(doc));
              doc.padding = getStringOfLength(paddingLength);
              assertAlways.eq(size, Object.bsonsize(doc));
              return doc;
          }
      
          var states = {
              init: function init(db, collName) {
                  this.threadCollName = db[collName].getName() + "_" + this.tid;
                  assertWhenOwnColl.commandWorked(db.runCommand({create: this.threadCollName}));
              },
      
              insert: function insert(db, collName) {
                  var bulk = db[this.threadCollName].initializeUnorderedBulkOp();
                  for (var i = 0; i < this.numDocs; ++i) {
                      // note: padDoc caches the large string after allocating it once, so it's ok to call
                      // it in this loop
                      bulk.insert(padDoc({
                          _id: i,
                          flag: i % 2 ? true : false,
                          rand: Random.rand(),
                          randInt: Random.randInt(this.numDocs)
                      }, this.docSize));
                  }
                  var res = bulk.execute();
                  assertWhenOwnColl.writeOK(res);
                  assertWhenOwnColl.eq(this.numDocs, res.nInserted);
                  assertWhenOwnColl.eq(this.numDocs, db[this.threadCollName].find().itcount());
                  assertWhenOwnColl.eq(this.numDocs / 2,
                                       db[this.threadCollName].find({flag: false}).itcount());
                  assertWhenOwnColl.eq(this.numDocs / 2,
                                       db[this.threadCollName].find({flag: true}).itcount());
              },
              remove: function remove(db, collName) {
                  var bulk = db[this.threadCollName].initializeUnorderedBulkOp();
                  bulk.find({}).remove();
                  var res = bulk.execute();
      
                  assertWhenOwnColl.writeOK(res);
                  assertWhenOwnColl.eq(this.numDocs, res.nRemoved);
                  assertWhenOwnColl.eq(0, db[this.threadCollName].find().itcount());
              }
          };
          var transitions = {
              init: {insert: 1},
              insert: {remove: 1},
              remove: {insert: 1},
          };
      
          function setup(db, collName, cluster) {    }
      
          function teardown(db, collName, cluster) {    }
      
          return {
              threadCount: 10,
              iterations: 20,
              states: states,
              transitions: transitions,
              data: data,
              setup: setup,
              teardown: teardown,
          };
      })();
      
    • Execution Team 2019-11-04

      I wrote a workload that performs heavy inserts and deletes, and I'm observing many WiredTiger "Cache stuck for too long, giving up" errors as well as replication lag exceeding 47s per node.
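
      For reference, one way to observe the cache pressure and lag described above from the shell (a rough sketch; the WiredTiger stat names are as serverStatus() reports them on 4.2 and are not part of the repro):

      // Print the WT cache fill ratio and per-member replication lag.
      var wtCache = db.serverStatus().wiredTiger.cache;
      var fillRatio = wtCache["bytes currently in the cache"] / wtCache["maximum bytes configured"];
      print("WT cache fill ratio: " + fillRatio.toFixed(2));
      // Lag of each secondary relative to the primary's last optime.
      rs.printSlaveReplicationInfo();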

      I raised the vectored insert batch limit from 256K to 1MB; however, as I understand flow control, I wouldn't expect that change to significantly alter its behavior.

      The workload repeatedly performs a large number of inserts and then deletes. The goal is to have the inserts batch together and get inserted as a "vector" through the groupAndApplyInserts function during secondary batch application.
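
      One way to sanity-check that the workload really produces long runs of consecutive insert oplog entries (the precondition for grouping them into a vectored insert) is to scan the tail of the oplog on a member; a rough sketch, with the sample size chosen arbitrarily:

      // Find the longest run of consecutive inserts into a single namespace among the
      // most recent oplog entries; long runs are what secondary batch application can group.
      var oplog = db.getSiblingDB("local").oplog.rs;
      var entries = oplog.find({}, {op: 1, ns: 1}).sort({$natural: -1}).limit(5000).toArray();
      var run = 0, longestRun = 0;
      entries.forEach(function(entry, i) {
          if (entry.op === "i" &&
              (i === 0 || (entries[i - 1].op === "i" && entries[i - 1].ns === entry.ns))) {
              ++run;
          } else {
              run = (entry.op === "i") ? 1 : 0;
          }
          longestRun = Math.max(longestRun, run);
      });
      print("Longest run of consecutive inserts into one namespace: " + longestRun);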

      This is the patch where I first encountered the issue.

      I discussed this with maria.vankeulen; FTDC reports that the lastCommitted and wall-time lags are inconsistent with the member lag:

      Flow Control, as a direct consumer of lastCommitted lag, will only throttle if this lag is greater than 5 seconds.
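
      For context, with the defaults as I understand them on 4.2 (flowControlTargetLagSeconds = 10, flowControlThresholdLagPercentage = 0.5), that works out to the 5 second threshold; the parameter names below are assumed from 4.2:

      // Dump the flow control knobs relevant to the throttling threshold.
      printjson(db.adminCommand({
          getParameter: 1,
          enableFlowControl: 1,
          flowControlTargetLagSeconds: 1,
          flowControlThresholdLagPercentage: 1
      }));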

      Additionally, below are the FTDC stats obtained for Flow Control:

      Even when Flow Control is not actively throttling writes, we expect there to be FTDC data on the number of locks each operation takes, since that data is sampled regardless of whether Flow Control is enabled. However, this data is missing.
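
      The per-operation lock sampling should also surface in the serverStatus() flowControl section; a quick spot-check (field names assumed from the 4.2 flowControl section):

      // locksPerOp is the sampled locks-per-operation figure that appears to be missing
      // from FTDC here; the section is reported even when throttling is idle.
      var fc = db.serverStatus().flowControl;
      printjson({
          enabled: fc.enabled,
          isLagged: fc.isLagged,
          locksPerOp: fc.locksPerOp,
          sustainerRate: fc.sustainerRate,
          targetRateLimit: fc.targetRateLimit
      });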

            Assignee: Maria van Keulen (maria.vankeulen@mongodb.com)
            Reporter: Louis Williams (louis.williams@mongodb.com)
            Votes: 0
            Watchers: 11
