Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-29189

Top and op latency histogram counters are incorrect for getMore on an awaitData cursor

    • Type: Icon: Bug Bug
    • Resolution: Duplicate
    • Priority: Icon: Major - P3 Major - P3
    • None
    • Affects Version/s: None
    • Component/s: Diagnostics, Querying
    • None
    • ALL
    • Hide

      The following script demonstrates the behavioral differences between top, the per-collection latency histograms, and the global latency histogram. Note that both the top command and the per-collection latency histograms are affected by this issue.

      (function() {
          "use strict";
      
          let c = db.c;
          c.drop();
          assert.commandWorked(db.createCollection("c", {capped: true, size: 1024}));
          assert.writeOK(db.c.insert({_id: 1}));
          assert.writeOK(db.c.insert({_id: 2}));
          assert.writeOK(db.c.insert({_id: 3}));
      
          function getGlobalLatencyStats() {
              return db.serverStatus().opLatencies.reads;
          }
      
          function getCollectionLatencyStats(coll) {
              return coll.latencyStats().next().latencyStats.reads;
          }
      
          function getTop(coll) {
              return db.adminCommand({top: 1}).totals[coll.getFullName()].getmore;
          }
      
          // Global latency histogram from serverStatus.
          let oldGlobalLatency = getGlobalLatencyStats();
          assert.eq(3, db.c.find().tailable(true).itcount());
          let newGlobalLatency = getGlobalLatencyStats();
      
          let globalLatencyOpsDelta = newGlobalLatency.ops - oldGlobalLatency.ops;
          print("global latency hist ops delta: " + globalLatencyOpsDelta);
          let globalLatencyDelta = newGlobalLatency.latency - oldGlobalLatency.latency;
          print("global latency hist latency delta (micros): " + globalLatencyDelta);
      
          // Per-collection latency histogram.
          let oldCollLatency = getCollectionLatencyStats(db.c);
          assert.eq(3, db.c.find().tailable(true).itcount());
          let newCollLatency = getCollectionLatencyStats(db.c);
      
          let collLatencyOpsDelta = newCollLatency.ops - oldCollLatency.ops;
          print("collection latency hist ops delta: " + collLatencyOpsDelta);
          let collLatencyDelta = newCollLatency.latency - oldCollLatency.latency;
          print("collection latency hist latency delta (micros): " + collLatencyOpsDelta);
      
          // Top.
          let oldTop = getTop(db.c);
          assert.eq(3, db.c.find().tailable(true).itcount());
          let newTop = getTop(db.c);
      
          let topOpsDelta = newTop.count - oldTop.count;
          print("top ops delta: " + topOpsDelta);
          let topLatencyDelta = newTop.time - oldTop.time;
          print("top latency delta (micros): " + topOpsDelta);
      }());
      
      Show
      The following script demonstrates the behavioral differences between top, the per-collection latency histograms, and the global latency histogram. Note that both the top command and the per-collection latency histograms are affected by this issue. (function() { "use strict" ; let c = db.c; c.drop(); assert .commandWorked(db.createCollection( "c" , {capped: true , size: 1024})); assert .writeOK(db.c.insert({_id: 1})); assert .writeOK(db.c.insert({_id: 2})); assert .writeOK(db.c.insert({_id: 3})); function getGlobalLatencyStats() { return db.serverStatus().opLatencies.reads; } function getCollectionLatencyStats(coll) { return coll.latencyStats().next().latencyStats.reads; } function getTop(coll) { return db.adminCommand({top: 1}).totals[coll.getFullName()].getmore; } // Global latency histogram from serverStatus. let oldGlobalLatency = getGlobalLatencyStats(); assert .eq(3, db.c.find().tailable( true ).itcount()); let newGlobalLatency = getGlobalLatencyStats(); let globalLatencyOpsDelta = newGlobalLatency.ops - oldGlobalLatency.ops; print( "global latency hist ops delta: " + globalLatencyOpsDelta); let globalLatencyDelta = newGlobalLatency.latency - oldGlobalLatency.latency; print( "global latency hist latency delta (micros): " + globalLatencyDelta); // Per-collection latency histogram. let oldCollLatency = getCollectionLatencyStats(db.c); assert .eq(3, db.c.find().tailable( true ).itcount()); let newCollLatency = getCollectionLatencyStats(db.c); let collLatencyOpsDelta = newCollLatency.ops - oldCollLatency.ops; print( "collection latency hist ops delta: " + collLatencyOpsDelta); let collLatencyDelta = newCollLatency.latency - oldCollLatency.latency; print( "collection latency hist latency delta (micros): " + collLatencyOpsDelta); // Top. let oldTop = getTop(db.c); assert .eq(3, db.c.find().tailable( true ).itcount()); let newTop = getTop(db.c); let topOpsDelta = newTop.count - oldTop.count; print( "top ops delta: " + topOpsDelta); let topLatencyDelta = newTop.time - oldTop.time; print( "top latency delta (micros): " + topOpsDelta); }());
    • Query 2017-06-19

      The getMore path uses the AutoGetCollectionForReadCommand RAII helper, which on destruction updates Top and the appropriate per-collection latency histogram. It updates the stats by

      1. bumping the number of recorded reads, and
      2. recording the wall clock time between construction and destruction of the RAII object.

      For getMore on awaitData cursors, however, we release the AutoGetCollectionForReadCommand while blocking. This is to ensure that we do not hold collection locks while sleeping on a condition variable for potentially long periods of time. After waking up, the AutoGetCollectionForReadCommand is reacquired. The result is that we double-count each getMore operation in the Top diagnostic output. Furthermore, the time spent blocking for awaitData is not recorded. This is inconsistent with the global operation latency histogram reported by serverStatus(), which incorporates time spent blocking for awaitData.

            Assignee:
            david.storch@mongodb.com David Storch
            Reporter:
            david.storch@mongodb.com David Storch
            Votes:
            0 Vote for this issue
            Watchers:
            10 Start watching this issue

              Created:
              Updated:
              Resolved: