Details

    • Type: Bug
    • Status: Closed
    • Priority: Critical - P2
    • Resolution: Fixed
    • Affects Version/s: 3.0.4
    • Fix Version/s: 3.0.6, 3.1.7
    • Component/s: Admin, WiredTiger
    • Labels:
    • Backwards Compatibility:
      Fully Compatible
    • Operating System:
      Linux
    • Backport Completed:

      Description

      MongoDB crash due to segfault

      2015-07-24T08:25:36.061+0000 F -        [conn1441936] Got signal: 11 (Segmentation fault).
       
       0xf7a0f9 0xf799c2 0xf79d1e 0x7f8105ed48d0 0x13090a5 0x134cef9 0xd867c7 0xaaf8e6 0xa29f8c 0xbec4f2 0xbec5dc 0x97c04d 0x97de88 0x9f40e4 0x9f506d 0x9f5d7b 0xbbabca 0xad0f00 0x82b88d 0xf2d22b 0x7f8105ecd0a4 0x7f8104980c2d
      ----- BEGIN BACKTRACE -----
      {"backtrace":[{"b":"400000","o":"B7A0F9"},{"b":"400000","o":"B799C2"},{"b":"400000","o":"B79D1E"},{"b":"7F8105EC5000","o":"F8D0"},{"b":"400000","o":"F090A5"},{"b":"400000","o":"F4CEF9"},{"b":"400000","o":"9867C7"},{"b":"400000","o":"6AF8E6"},{"b":"400000","o":"629F8C"},{"b":"400000","o":"7EC4F2"},{"b":"400000","o":"7EC5DC"},{"b":"400000","o":"57C04D"},{"b":"400000","o":"57DE88"},{"b":"400000","o":"5F40E4"},{"b":"400000","o":"5F506D"},{"b":"400000","o":"5F5D7B"},{"b":"400000","o":"7BABCA"},{"b":"400000","o":"6D0F00"},{"b":"400000","o":"42B88D"},{"b":"400000","o":"B2D22B"},{"b":"7F8105EC5000","o":"80A4"},{"b":"7F810489B000","o":"E5C2D"}],"processInfo":{ "mongodbVersion" : "3.0.4", "gitVersion" : "0481c958daeb2969800511e7475dc66986fa9ed5", "uname" : { "sysname" : "Linux", "release" : "3.2.0-4-amd64", "version" : "#1 SMP Debian 3.2.60-1+deb7u3", "machine" : "x86_64" }, "somap" : [ { "elfType" : 2, "b" : "400000", "buildId" : "EB7A0F7D9F202DB3ADD7637D20509D9DE82AC5A1" }, { "b" : "7FFF2D438000", "path" : "linux-vdso.so.1", "elfType" : 3, "buildId" : "804F357CF825E4716BA7DA60414184939D1D3432" }, { "b" : "7F8105EC5000", "path" : "/lib/x86_64-linux-gnu/libpthread.so.0", "elfType" : 3, "buildId" : "13FBE0ABFFDBC4E77822218A4FEAAECF2C530CCD" }, { "b" : "7F8105C65000", "path" : "/usr/lib/x86_64-linux-gnu/libssl.so.1.0.0", "elfType" : 3, "buildId" : "90B85CC62C41AE65576EA31156205F149B04FD31" }, { "b" : "7F810586D000", "path" : "/usr/lib/x86_64-linux-gnu/libcrypto.so.1.0.0", "elfType" : 3, "buildId" : "9CC15590792B6F822DBB2ACEF66CB9A89FAA0775" }, { "b" : "7F8105665000", "path" : "/lib/x86_64-linux-gnu/librt.so.1", "elfType" : 3, "buildId" : "5700D48F643EBE7756A77E4A0F6526DCFC2343F7" }, { "b" : "7F8105461000", "path" : "/lib/x86_64-linux-gnu/libdl.so.2", "elfType" : 3, "buildId" : "70AA589D1428E82B06ED23A77CBDA444D087B764" }, { "b" : "7F810515A000", "path" : "/usr/lib/x86_64-linux-gnu/libstdc++.so.6", "elfType" : 3, "buildId" : "8711429397A5AF8B6269B867D830EDF6E0225B8D" }, { 
"b" : "7F8104E59000", "path" : "/lib/x86_64-linux-gnu/libm.so.6", "elfType" : 3, "buildId" : "7D57EA20FD7C4A976288AAF88E42D6F2ADFE80F2" }, { "b" : "7F8104C43000", "path" : "/lib/x86_64-linux-gnu/libgcc_s.so.1", "elfType" : 3, "buildId" : "F980B1188708F8D8B5C35D185444AF4CB939AA1E" }, { "b" : "7F810489B000", "path" : "/lib/x86_64-linux-gnu/libc.so.6", "elfType" : 3, "buildId" : "8ADD0E23D1B03EF3902512D340D6590F3F83DA17" }, { "b" : "7F81060E2000", "path" : "/lib64/ld-linux-x86-64.so.2", "elfType" : 3, "buildId" : "AE8BAB7FF427EA2AD85632F8EC54350CAA1CF7DF" }, { "b" : "7F8104684000", "path" : "/lib/x86_64-linux-gnu/libz.so.1", "elfType" : 3, "buildId" : "1EFEB71FD4999C2307570D673A724EA4E1D85267" } ] }}
       mongod(_ZN5mongo15printStackTraceERSo+0x29) [0xf7a0f9]
       mongod(+0xB799C2) [0xf799c2]
       mongod(+0xB79D1E) [0xf79d1e]
       libpthread.so.0(+0xF8D0) [0x7f8105ed48d0]
       mongod(__wt_btcur_prev+0x1405) [0x13090a5]
       mongod(+0xF4CEF9) [0x134cef9]
       mongod(+0x9867C7) [0xd867c7]
       mongod(_ZN5mongo16BtreeIndexCursor15restorePositionEPNS_16OperationContextE+0x16) [0xaaf8e6]
       mongod(_ZN5mongo9IndexScan12restoreStateEPNS_16OperationContextE+0x4C) [0xa29f8c]
       mongod(_ZN5mongo12PlanExecutor12restoreStateEPNS_16OperationContextE+0x42) [0xbec4f2]
       mongod(_ZN5mongo12PlanExecutor18getNextSnapshottedEPNS_11SnapshottedINS_7BSONObjEEEPNS_8RecordIdE+0x6C) [0xbec5dc]
       mongod(_ZN5mongo16CmdFindAndModify7runImplEPNS_16OperationContextERKSsS4_RKNS_7BSONObjES7_S7_S7_bbbRNS_14BSONObjBuilderERSs+0x43D) [0x97c04d]
       mongod(_ZN5mongo16CmdFindAndModify3runEPNS_16OperationContextERKSsRNS_7BSONObjEiRSsRNS_14BSONObjBuilderEb+0x358) [0x97de88]
       mongod(_ZN5mongo12_execCommandEPNS_16OperationContextEPNS_7CommandERKSsRNS_7BSONObjEiRSsRNS_14BSONObjBuilderEb+0x34) [0x9f40e4]
       mongod(_ZN5mongo7Command11execCommandEPNS_16OperationContextEPS0_iPKcRNS_7BSONObjERNS_14BSONObjBuilderEb+0xC1D) [0x9f506d]
       mongod(_ZN5mongo12_runCommandsEPNS_16OperationContextEPKcRNS_7BSONObjERNS_11_BufBuilderINS_16TrivialAllocatorEEERNS_14BSONObjBuilderEbi+0x28B) [0x9f5d7b]
       mongod(_ZN5mongo8runQueryEPNS_16OperationContextERNS_7MessageERNS_12QueryMessageERKNS_15NamespaceStringERNS_5CurOpES3_+0x77A) [0xbbabca]
       mongod(_ZN5mongo16assembleResponseEPNS_16OperationContextERNS_7MessageERNS_10DbResponseERKNS_11HostAndPortE+0xB10) [0xad0f00]
       mongod(_ZN5mongo16MyMessageHandler7processERNS_7MessageEPNS_21AbstractMessagingPortEPNS_9LastErrorE+0xDD) [0x82b88d]
       mongod(_ZN5mongo17PortMessageServer17handleIncomingMsgEPv+0x34B) [0xf2d22b]
       libpthread.so.0(+0x80A4) [0x7f8105ecd0a4]
       libc.so.6(clone+0x6D) [0x7f8104980c2d]
      -----  END BACKTRACE  -----
      2015-07-24T08:31:14.169+0000 I CONTROL  ***** SERVER RESTARTED *****
      

      1. connection.1441936.txt
        15 kB
        Shankar Karuppiah
      2. startup.txt
        5 kB
        Shankar Karuppiah

        Issue Links

          Activity

          Hide
          shankar.k Shankar Karuppiah added a comment - - edited

          Hello Ramon,

          I have uploaded the log for that day, please confirm if you have received it.

          About the operations, based on the log for the connection that caused the crash, it came from one of our batch processors.

          Schema for the collection:

          {
          	_id : {
          		type : 'ObjectId',
          	},
          	sdt : { // data for processing
          		type : 'nested'
          	},
          	td : { // steps in batch processing
          		type : 'Array',
          		item : {
          			type : 'String'
          		}
          	}
          	apid : { type : 'String' } //batch processor id
          } 
          

          The collection has one index,

          { "sdt.eo": -1, "apid": 1 }

          besides the default index on _id

          Documents are added to the collection in batches of 50.

          Then each batch process will fetch one document at a time from the collection using the following command

          	this.collection.findAndModify({
          		'sdt.eo' : {
          			'$lt' : new Date()
          		},
          		'$or' : [
          			{
          				apid : {
          					$exists : false
          				}
          			},{
          				apid : processId
          			}
          		]
          	},{
          		'sdt.eo' : 1
          	},{
          		$set : { apid : processId }
          	}, callback);
          

          Each document goes through steps of processing; after each step, the state is updated using the following command

          	this.collection.update({
          		_id: new ObjectID(documentId)
          	}, {
          		$addToSet: {
          			td: task
          		}
          	}, callback);
          

          Once all the processing steps are completed, the document is removed using the following command

          	this.collection.remove({
          		_id : new ObjectID(documentId)
          	},  callback);
          

          On average each document will be processed within 300ms, and 100 documents are processed per second at peak

          Side note :
          I'm not sure if this is related, like you mentioned before there are a lot writeConflicts, though at most each document will be updated 20 times including insert and remove ops

          Show
          shankar.k Shankar Karuppiah added a comment - - edited Hello Ramon, I have upload the log for that day, please confirm if you have received it. About the operations, based on the log for the connection that caused the crash, it came from one of our batch processors. Schema for the collection: { _id : { type : 'ObjectId', }, sdt : { // data for processing type : 'nested' }, td : { // steps in batch processing type : 'Array', item : { type : 'String' } } apid : { type : 'String' } //batch processor id } The collection has one index, { "sdt.eo": -1, "apid": 1 } besides the default index on _id Documents are added to the collections in batch of 50. Then each batch process will fetch one document at a time from the collection using following command this.collection.findAndModify({ 'sdt.eo' : { '$lt' : new Date() }, '$or' : [ { apid : { $exists : false } },{ apid : processId } ] },{ 'sdt.eo' : 1 },{ $set : { apid : processId } }, callback); Each document goes through steps of processing, after each step, the state is update using the following command this.collection.update({ _id: new ObjectID(documentId) }, { $addToSet: { td: task } }, callback); Once all the processing step is completed the document is removed, using following command this.collection.remove({ _id : new ObjectID(documentId) }, callback); On average each document will be processed within 300ms, and 100 documents are processed per second at peak Side note : I'm not sure if this is related, like you mentioned before there are a lot writeConflicts , though at most each document will be updated 20 times including insert and remove ops
          Hide
          ramon.fernandez Ramon Fernandez added a comment -

          Shankar Karuppiah, thank you for uploading the log file, unfortunately it doesn't contain information about the segfault. If you manage to reproduce the problem and can send logs those may help understand what's happening. In the mean time we're working on a theory on our end, please continue to watch the ticket for updates.

          Thanks,
          Ramón.

          Show
          ramon.fernandez Ramon Fernandez added a comment - Shankar Karuppiah , thank you for uploading the log file, unfortunately it doesn't contain information about the segfault. If you manage to reproduce the problem and can send logs those may help understand what's happening. In the mean time we're working on a theory on our end, please continue to watch the ticket for updates. Thanks, Ramón.
          Hide
          xgen-internal-githook Githook User added a comment -

          Author:

          {u'username': u'michaelcahill', u'name': u'Michael Cahill', u'email': u'michael.cahill@mongodb.com'}

          Message: WT-2025 SERVER-19573 Change row-store inserts to avoid page locking.

          Also switch to always using an atomic operation to swap inserts into the skiplist. This fixes a potential race where readers may see a partially initialized WT_INSERT object, because the spinlock we used in the past for all inserts did not imply a write barrier.
          Branch: develop
          https://github.com/wiredtiger/wiredtiger/commit/dd80a4bc0ab5cf6ade97367b52bf0e6d55e19424

          Show
          xgen-internal-githook Githook User added a comment - Author: {u'username': u'michaelcahill', u'name': u'Michael Cahill', u'email': u'michael.cahill@mongodb.com'} Message: WT-2025 SERVER-19573 Change row-store inserts to avoid page locking. Also switch to always using an atomic operation to swap inserts into the skiplist. This fixes a potential race where readers may see a partially initialized WT_INSERT object, because the spinlock we used in the past for all inserts did not imply a write barrier. Branch: develop https://github.com/wiredtiger/wiredtiger/commit/dd80a4bc0ab5cf6ade97367b52bf0e6d55e19424
          Hide
          xgen-internal-githook Githook User added a comment -

          Author:

          {u'username': u'michaelcahill', u'name': u'Michael Cahill', u'email': u'michael.cahill@mongodb.com'}

          Message: Merge pull request #2094 from wiredtiger/row-insert-nolock

          WT-2025 SERVER-19573 Change row-store inserts to avoid page locking.
          Branch: develop
          https://github.com/wiredtiger/wiredtiger/commit/4545a8b14cf30b358fec0a20df1c9e6154fc7eb0

          Show
          xgen-internal-githook Githook User added a comment - Author: {u'username': u'michaelcahill', u'name': u'Michael Cahill', u'email': u'michael.cahill@mongodb.com'} Message: Merge pull request #2094 from wiredtiger/row-insert-nolock WT-2025 SERVER-19573 Change row-store inserts to avoid page locking. Branch: develop https://github.com/wiredtiger/wiredtiger/commit/4545a8b14cf30b358fec0a20df1c9e6154fc7eb0
          Hide
          xgen-internal-githook Githook User added a comment -

          Author:

          {u'username': u'michaelcahill', u'name': u'Michael Cahill', u'email': u'michael.cahill@mongodb.com'}

          Message: WT-2025 SERVER-19573 Change row-store inserts to avoid page locking.

          Merge pull request #2094 from wiredtiger/row-insert-nolock
          (cherry picked from commit 4545a8b14cf30b358fec0a20df1c9e6154fc7eb0)
          Branch: mongodb-3.0
          https://github.com/wiredtiger/wiredtiger/commit/0349d8aa5f07209fb9b53cfff8236abe8acedfa3

          Show
          xgen-internal-githook Githook User added a comment - Author: {u'username': u'michaelcahill', u'name': u'Michael Cahill', u'email': u'michael.cahill@mongodb.com'} Message: WT-2025 SERVER-19573 Change row-store inserts to avoid page locking. Merge pull request #2094 from wiredtiger/row-insert-nolock (cherry picked from commit 4545a8b14cf30b358fec0a20df1c9e6154fc7eb0) Branch: mongodb-3.0 https://github.com/wiredtiger/wiredtiger/commit/0349d8aa5f07209fb9b53cfff8236abe8acedfa3

            People

            • Votes:
              0 Vote for this issue
              Watchers:
              7 Start watching this issue

              Dates

              • Created:
                Updated:
                Resolved: