Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-61188

When storeImageInSideCollection=true, pre-image noop entries for collection with preImageRecordingEnabledForCollection=true are assigned wrong opTimes

    • Type: Icon: Bug Bug
    • Resolution: Fixed
    • Priority: Icon: Major - P3 Major - P3
    • 5.2.0, 5.1.2, 5.0.6
    • Affects Version/s: None
    • Component/s: None
    • Labels:
      None
    • Fully Compatible
    • ALL
    • v5.1, v5.0
    • Hide
      diff --git a/src/mongo/db/op_observer_impl.cpp b/src/mongo/db/op_observer_impl.cpp
      index c3b07dba589..cf3595ce556 100644
      --- a/src/mongo/db/op_observer_impl.cpp
      +++ b/src/mongo/db/op_observer_impl.cpp
      @@ -195,6 +195,7 @@ OpTimeBundle replLogUpdate(OperationContext* opCtx,
               if (storePreImageInOplogForRetryableWrite) {
                   opTimes.prePostImageOpTime = oplogLink.preImageOpTime;
               }
      +        LOGV2(10001, "replLogUpdate", "preImage noop"_attr = oplogLink.preImageOpTime);
           }
       
           // This case handles storing the post image for retryable findAndModify's.
      @@ -206,6 +207,7 @@ OpTimeBundle replLogUpdate(OperationContext* opCtx,
               oplogLink.postImageOpTime = logOperation(opCtx, &noopEntry);
               invariant(opTimes.prePostImageOpTime.isNull());
               opTimes.prePostImageOpTime = oplogLink.postImageOpTime;
      +        LOGV2(10002, "replLogUpdate", "postImage noop"_attr = oplogLink.postImageOpTime);
           }
       
           oplogEntry->setOpType(repl::OpTypeEnum::kUpdate);
      @@ -215,6 +217,7 @@ OpTimeBundle replLogUpdate(OperationContext* opCtx,
           // oplogLink could have been changed to include pre/postImageOpTime by the previous no-op write.
           repl::appendOplogEntryChainInfo(opCtx, oplogEntry, &oplogLink, args.updateArgs.stmtIds);
           if (args.updateArgs.oplogSlot) {
      +        LOGV2(10003, "replLogUpdate", "oplogSlot"_attr = args.updateArgs.oplogSlot);
               oplogEntry->setOpTime(*args.updateArgs.oplogSlot);
           }
           opTimes.writeOpTime = logOperation(opCtx, oplogEntry);
      diff --git a/src/mongo/db/op_observer_impl_test.cpp b/src/mongo/db/op_observer_impl_test.cpp
      index 653b16340b3..a2089b69357 100644
      --- a/src/mongo/db/op_observer_impl_test.cpp
      +++ b/src/mongo/db/op_observer_impl_test.cpp
      @@ -1764,11 +1764,23 @@ TEST_F(OpObserverTest, TestFundamentalOnUpdateOutputs) {
                   BSON("$set" << BSON("postImage" << true) << "$unset" << BSON("preImage" << 1));
               updateArgs.criteria = BSON("_id" << 0);
               updateArgs.storeDocOption = testCase.imageType;
      -        OplogUpdateEntryArgs update(std::move(updateArgs), nss, uuid);
       
               // Phase 2: Call the code we're testing.
               WriteUnitOfWork wuow(opCtx);
               AutoGetCollection locks(opCtx, nss, LockMode::MODE_IX);
      +        if (testCase.imageType != StoreDocOption::None &&
      +            testCase.retryableOptions != RetryableOptions::kNotRetryable &&
      +            updateArgs.storeImageInSideCollection) {
      +            // We reserve two oplog slots here, expecting the greater of the two (say TS) to be used
      +            // as the oplog timestamp. Tenant migrations and resharding will forge no-op image oplog
      +            // entries and set the timestamp for these synthetic entries to be TS - 1.
      +            auto oplogInfo = LocalOplogInfo::get(opCtx);
      +            auto slots = oplogInfo->getNextOpTimes(opCtx, 2);
      +            uassertStatusOK(opCtx->recoveryUnit()->setTimestamp(slots[1].getTimestamp()));
      +            updateArgs.oplogSlot = slots[1];
      +            LOGV2(10004, "Reserved", "slots"_attr = slots);
      +        }
      +        OplogUpdateEntryArgs update(std::move(updateArgs), nss, uuid);
               opObserver.onUpdate(opCtx, update);
               wuow.commit();
       
      
      Show
      diff --git a/src/mongo/db/op_observer_impl.cpp b/src/mongo/db/op_observer_impl.cpp index c3b07dba589..cf3595ce556 100644 --- a/src/mongo/db/op_observer_impl.cpp +++ b/src/mongo/db/op_observer_impl.cpp @@ -195,6 +195,7 @@ OpTimeBundle replLogUpdate(OperationContext* opCtx, if (storePreImageInOplogForRetryableWrite) { opTimes.prePostImageOpTime = oplogLink.preImageOpTime; } + LOGV2(10001, "replLogUpdate" , "preImage noop" _attr = oplogLink.preImageOpTime); } // This case handles storing the post image for retryable findAndModify's. @@ -206,6 +207,7 @@ OpTimeBundle replLogUpdate(OperationContext* opCtx, oplogLink.postImageOpTime = logOperation(opCtx, &noopEntry); invariant(opTimes.prePostImageOpTime.isNull()); opTimes.prePostImageOpTime = oplogLink.postImageOpTime; + LOGV2(10002, "replLogUpdate" , "postImage noop" _attr = oplogLink.postImageOpTime); } oplogEntry->setOpType(repl::OpTypeEnum::kUpdate); @@ -215,6 +217,7 @@ OpTimeBundle replLogUpdate(OperationContext* opCtx, // oplogLink could have been changed to include pre/postImageOpTime by the previous no-op write. repl::appendOplogEntryChainInfo(opCtx, oplogEntry, &oplogLink, args.updateArgs.stmtIds); if (args.updateArgs.oplogSlot) { + LOGV2(10003, "replLogUpdate" , "oplogSlot" _attr = args.updateArgs.oplogSlot); oplogEntry->setOpTime(*args.updateArgs.oplogSlot); } opTimes.writeOpTime = logOperation(opCtx, oplogEntry); diff --git a/src/mongo/db/op_observer_impl_test.cpp b/src/mongo/db/op_observer_impl_test.cpp index 653b16340b3..a2089b69357 100644 --- a/src/mongo/db/op_observer_impl_test.cpp +++ b/src/mongo/db/op_observer_impl_test.cpp @@ -1764,11 +1764,23 @@ TEST_F(OpObserverTest, TestFundamentalOnUpdateOutputs) { BSON( "$set" << BSON( "postImage" << true ) << "$unset" << BSON( "preImage" << 1)); updateArgs.criteria = BSON( "_id" << 0); updateArgs.storeDocOption = testCase.imageType; - OplogUpdateEntryArgs update(std::move(updateArgs), nss, uuid); // Phase 2: Call the code we're testing. WriteUnitOfWork wuow(opCtx); AutoGetCollection locks(opCtx, nss, LockMode::MODE_IX); + if (testCase.imageType != StoreDocOption::None && + testCase.retryableOptions != RetryableOptions::kNotRetryable && + updateArgs.storeImageInSideCollection) { + // We reserve two oplog slots here, expecting the greater of the two (say TS) to be used + // as the oplog timestamp. Tenant migrations and resharding will forge no-op image oplog + // entries and set the timestamp for these synthetic entries to be TS - 1. + auto oplogInfo = LocalOplogInfo::get(opCtx); + auto slots = oplogInfo->getNextOpTimes(opCtx, 2); + uassertStatusOK(opCtx->recoveryUnit()->setTimestamp(slots[1].getTimestamp())); + updateArgs.oplogSlot = slots[1]; + LOGV2(10004, "Reserved" , "slots" _attr = slots); + } + OplogUpdateEntryArgs update(std::move(updateArgs), nss, uuid); opObserver.onUpdate(opCtx, update); wuow.commit();
    • Replication 2021-11-15, Replication 2021-11-29

      When storeImageInSideCollection=true, we reserve two oplog slots, T-1 and T, for every findAndModify operation with a pre or post image:

      • T-1 is reserved for the potential forged noop entry for the pre or post image during tenant migration or resharding.
      • T is reserved for the update oplog entry itself.

      For a collection with preImageRecordingEnabledForCollection=false, we write the update oplog entry in the reserved slot T and write the pre or post image to a config.image_collection entry. However, for a collection with preImageRecordingEnabledForCollection=true, we currently write the pre-image to a noop oplog entry in some other oplog slot T' > T, and then write the update oplog entry in the reserved slot T. That latter is invalid so we hit this invariant in the ReplClientInfo. That is:

      1. For a findAndModify operation with preImage, we end up with the following oplog entries:
        • T-1: reserved for potential forged pre-image noop oplog entry
        • T: update oplog entry
        • T': pre-image noop oplog entry
      2. For a findAndModify operation with postImage, we end up with the following oplog entries:
        • T-1: reserved for potential forged post-image noop oplog entry
        • T: update oplog entry
        • T': pre-image noop oplog entry

      This invariant failure can be reproduced by applying the attached diffs (containing code taken from reserveOplogSlotsForRetryableFindAndModify()) and running the following test cases in OpObserverTest/TestFundamentalOnUpdateOutputs.

      2021-11-02T06:52:46.159Z I  TEST     [thread1] UpdateTestCase {"ImageType":"PreImage","PreImageRecording":true,"ChangeStreamPreAndPostImagesEnabled":false,"RetryableOptions":"Images in side collection","ExpectedOplogEntries":2}
      ...
      2021-11-02T06:52:46.159Z I  TEST     [thread1] Reserved {"slots":[{"ts":{"$timestamp":{"t":1635835966,"i":27}},"t":0},{"ts":{"$timestamp":{"t":1635835966,"i":28}},"t":0}]}
      2021-11-02T06:52:46.159Z I  REPL     [thread1] replLogUpdate {"preImage noop":{"ts":{"$timestamp":{"t":1635835966,"i":29}},"t":0}}
      2021-11-02T06:52:46.159Z I  REPL     [thread1] replLogUpdate {"oplogSlot":{"ts":{"$timestamp":{"t":1635835966,"i":28}},"t":0}}
      2021-11-02T06:52:46.159Z F  ASSERT   [thread1] Invariant failure {"expr":"ot >= _lastOp","file":"src/mongo/db/repl/repl_client_info.cpp","line":64}
      
      2021-11-02T08:05:39.296Z I  TEST     [main] UpdateTestCase {"ImageType":"PostImage","PreImageRecording":true,"ChangeStreamPreAndPostImagesEnabled":false,"RetryableOptions":"Images in side collection","ExpectedOplogEntries":2}
      ...
      2021-11-02T08:05:39.296Z I  TEST     [main] Reserved {"slots":[{"ts":{"$timestamp":{"t":1635840339,"i":42}},"t":0},{"ts":{"$timestamp":{"t":1635840339,"i":43}},"t":0}]}
      2021-11-02T08:05:39.296Z I  REPL     [main] replLogUpdate {"preImage noop":{"ts":{"$timestamp":{"t":1635840339,"i":44}},"t":0}}
      2021-11-02T08:05:39.296Z I  REPL     [main] replLogUpdate {"oplogSlot":{"ts":{"$timestamp":{"t":1635840339,"i":43}},"t":0}}
      2021-11-02T08:05:39.296Z F  ASSERT   [main] Invariant failure {"expr":"ot >= _lastOp","file":"src/mongo/db/repl/repl_client_info.cpp","line":64}
      

            Assignee:
            jason.chan@mongodb.com Jason Chan
            Reporter:
            cheahuychou.mao@mongodb.com Cheahuychou Mao
            Votes:
            0 Vote for this issue
            Watchers:
            14 Start watching this issue

              Created:
              Updated:
              Resolved: