[SERVER-18640] Wiredtiger does not recover from unclean shutdown Created: 23/May/15  Updated: 26/Aug/15  Resolved: 29/May/15

Status: Closed
Project: Core Server
Component/s: WiredTiger
Affects Version/s: 3.0.3
Fix Version/s: None

Type: Bug Priority: Critical - P2
Reporter: Dharshan Rangegowda Assignee: Ramon Fernandez Marina
Resolution: Done Votes: 0
Labels: None
Remaining Estimate: Not Specified
Time Spent: Not Specified
Original Estimate: Not Specified

Attachments: Zip Archive unclean.zip    
Issue Links:
Related
related to SERVER-19815 Improved mongod --repair option for W... Closed
Operating System: ALL
Participants:

 Description   

My server was shut down uncleanly. Now the database does not start

2015-05-23T17:41:49.538+0000 I CONTROL  ***** SERVER RESTARTED *****
2015-05-23T17:41:49.596+0000 W -        [initandlisten] Detected unclean shutdown - /mongodb_data/mongod.lock is not empty.
2015-05-23T17:41:49.596+0000 W STORAGE  [initandlisten] Recovering data from the last clean checkpoint.
2015-05-23T17:41:49.596+0000 I STORAGE  [initandlisten] wiredtiger_open config: create,cache_size=1G,session_max=20000,eviction=(threads_max=4),statistics=(fast),log=(enabled=true,archive=tr
ue,path=journal,compressor=snappy),checkpoint=(wait=60,log_size=2GB),statistics_log=(wait=0),
2015-05-23T17:41:49.858+0000 E STORAGE  [initandlisten] WiredTiger (2) [1432402909:858118][1876:0x7f257de42c20], file:collection-2--5585092568808516308.wt, session.open_cursor: /mongodb_data
/collection-2--5585092568808516308.wt: No such file or directory
2015-05-23T17:41:49.858+0000 E STORAGE  [initandlisten] no cursor for uri: table:collection-2--5585092568808516308
2015-05-23T17:41:49.858+0000 I -        [initandlisten] Invariant failure c src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp 1032
2015-05-23T17:41:49.871+0000 I CONTROL  [initandlisten]
 0xf6a889 0xf08321 0xeec092 0xd76f89 0xd770ac 0xd779e3 0xd77ce9 0xd71b88 0xcf1d36 0xcf4e0c 0xd70aa6 0xa8104d 0x7f3e62 0x7f93c4 0x3f3201ed5d 0x7f1bbd
----- BEGIN BACKTRACE -----
{"backtrace":[{"b":"400000","o":"B6A889"},{"b":"400000","o":"B08321"},{"b":"400000","o":"AEC092"},{"b":"400000","o":"976F89"},{"b":"400000","o":"9770AC"},{"b":"400000","o":"9779E3"},{"b":"40
0000","o":"977CE9"},{"b":"400000","o":"971B88"},{"b":"400000","o":"8F1D36"},{"b":"400000","o":"8F4E0C"},{"b":"400000","o":"970AA6"},{"b":"400000","o":"68104D"},{"b":"400000","o":"3F3E62"},{"
b":"400000","o":"3F93C4"},{"b":"3F32000000","o":"1ED5D"},{"b":"400000","o":"3F1BBD"}],"processInfo":{ "mongodbVersion" : "3.0.3", "gitVersion" : "b40106b36eecd1b4407eb1ad1af6bc60593c6105", "
uname" : { "sysname" : "Linux", "release" : "2.6.32-431.29.2.el6.x86_64", "version" : "#1 SMP Tue Sep 9 21:36:05 UTC 2014", "machine" : "x86_64" }, "somap" : [ { "elfType" : 2, "b" : "400000
", "buildId" : "108A63CA14A4BD5E599BAC10885DBD3A85DA5439" }, { "b" : "7FFFFC6B8000", "elfType" : 3, "buildId" : "5474F0D8DAF3D6177E2C4B06F3892745CB43B4D5" }, { "path" : "/lib64/libpthread.so
.0", "elfType" : 3, "buildId" : "B8DFF8E53D9F2B80C3C382E83EC17C828B536A39" }, { "path" : "/usr/lib64/libssl.so.10", "elfType" : 3, "buildId" : "934508308DAF0D5C61E9997463F0D8B0A3F096BA" }, {
 "path" : "/usr/lib64/libcrypto.so.10", "elfType" : 3, "buildId" : "A4329A30669C783FA8DEEB7D1EA83749A8FA14E1" }, { "path" : "/lib64/librt.so.1", "elfType" : 3, "buildId" : "583411D8786F86A1D
6B8741C502831E6122445A7" }, { "path" : "/lib64/libdl.so.2", "elfType" : 3, "buildId" : "454F8FC6CC6502C6401E5F9E221564D80665D277" }, { "path" : "/usr/lib64/libstdc++.so.6", "elfType" : 3, "b
uildId" : "ED99110E629209C5CA6C0ED704F2C5CE3171513A" }, { "path" : "/lib64/libm.so.6", "elfType" : 3, "buildId" : "C9A87F6A29ED1D3CB18F539845A45FE3A9877FF1" }, { "path" : "/lib64/libgcc_s.so
.1", "elfType" : 3, "buildId" : "A44499D29B114A5366CD72DD4883958495AC1C1D" }, { "path" : "/lib64/libc.so.6", "elfType" : 3, "buildId" : "6EFE254F4564519BBB80889534FAC3D61C18C387" }, { "path"
 : "/lib64/ld-linux-x86-64.so.2", "elfType" : 3, "buildId" : "6F8E59B70E469F3A924A268911FF8FD0C37E7460" }, { "path" : "/lib64/libgssapi_krb5.so.2", "elfType" : 3, "buildId" : "54BA6B78A92203
44E77463947215E42F0EABCC62" }, { "path" : "/lib64/libkrb5.so.3", "elfType" : 3, "buildId" : "6797403AA5F8FAD8ADFF683478B45F528CE4FB0E" }, { "path" : "/lib64/libcom_err.so.2", "elfType" : 3,
"buildId" : "8CE28F280150E62296240E70ECAC64E4A57AB826" }, { "path" : "/lib64/libk5crypto.so.3", "elfType" : 3, "buildId" : "05733977F4E41652B86070B27A0CFC2C1EA7719D" }, { "path" : "/lib64/li
bz.so.1", "elfType" : 3, "buildId" : "5FA8E5038EC04A774AF72A9BB62DC86E1049C4D6" }, { "path" : "/lib64/libkrb5support.so.0", "elfType" : 3, "buildId" : "E3FA235F3BA3F776A01A18ECA737C9890F4459
23" }, { "path" : "/lib64/libkeyutils.so.1", "elfType" : 3, "buildId" : "AF374BAFB7F5B139A0B431D3F06D82014AFF3251" }, { "path" : "/lib64/libresolv.so.2", "elfType" : 3, "buildId" : "58B69647
8044E028A5970D48A4ED50E164B43B36" }, { "path" : "/lib64/libselinux.so.1", "elfType" : 3, "buildId" : "E6798A06BEE17CF102BBA44FD512FF8B805CEAF1" } ] }}
 mongod(_ZN5mongo15printStackTraceERSo+0x29) [0xf6a889]
 mongod(_ZN5mongo10logContextEPKc+0xE1) [0xf08321]
 mongod(_ZN5mongo15invariantFailedEPKcS1_j+0xB2) [0xeec092]
 mongod(_ZN5mongo21WiredTigerRecordStore8Iterator7_locateERKNS_8RecordIdEb+0x159) [0xd76f89]
 mongod(_ZN5mongo21WiredTigerRecordStore8IteratorC2ERKS0_PNS_16OperationContextERKNS_8RecordIdERKNS_20CollectionScanParams9DirectionEb+0x9C) [0xd770ac]
 mongod(_ZNK5mongo21WiredTigerRecordStore11getIteratorEPNS_16OperationContextERKNS_8RecordIdERKNS_20CollectionScanParams9DirectionE+0x43) [0xd779e3]
 mongod(_ZN5mongo21WiredTigerRecordStoreC1EPNS_16OperationContextERKNS_10StringDataES5_bllPNS_28CappedDocumentDeleteCallbackEPNS_20WiredTigerSizeStorerE+0x2B9) [0xd77ce9]
 mongod(_ZN5mongo18WiredTigerKVEngine14getRecordStoreEPNS_16OperationContextERKNS_10StringDataES5_RKNS_17CollectionOptionsE+0xA8) [0xd71b88]
 mongod(_ZN5mongo22KVDatabaseCatalogEntry14initCollectionEPNS_16OperationContextERKSsb+0x276) [0xcf1d36]
 mongod(_ZN5mongo15KVStorageEngineC1EPNS_8KVEngineERKNS_22KVStorageEngineOptionsE+0x69C) [0xcf4e0c]
 mongod(+0x970AA6) [0xd70aa6]
 mongod(_ZN5mongo23GlobalEnvironmentMongoD22setGlobalStorageEngineERKSs+0x30D) [0xa8104d]
 mongod(_ZN5mongo13initAndListenEi+0x422) [0x7f3e62]
 mongod(main+0x134) [0x7f93c4]
 libc.so.6(__libc_start_main+0xFD) [0x3f3201ed5d]
 mongod(+0x3F1BBD) [0x7f1bbd]
-----  END BACKTRACE  -----
2015-05-23T17:41:49.871+0000 I -        [initandlisten]
 
***aborting after invariant() failure



 Comments   
Comment by Ramon Fernandez Marina [ 26/Aug/15 ]

dixenon, currently there's no automated way to recover from this scenario. We've created SERVER-19815 to improve this situation – feel free to vote for SERVER-19815 and watch it for updates.

Regards,
Ramón.

Comment by Denys [ 02/Aug/15 ]

Hi Ramon

Despite the reason why files are missing is there any way to restore other databases from the data directory?

Thanks,
Denys.

Comment by Ramon Fernandez Marina [ 29/May/15 ]

dharshanr@scalegrid.net, as per Michael's explanation above we believe this issue is not a bug in MongoDB, but something related to the storage subsystem and how fsync() calls are handled. If the virtualized storage does not properly implement SCSI commands or flushes things properly that could easily explain why some files are missing despite WiredTiger calling fsync() on file creation.

I'm going to resolve this issue, but if you have some additional information that points to a problem in MongoDB please comment back and we can re-open the ticket.

Regards,
Ramón.

Comment by Michael Cahill (Inactive) [ 29/May/15 ]

Hi Darshan,

WiredTiger calls fsync on the file when it is first created, plus fsync on the directory just after the file is created. If the filesystem and storage system are working properly, these calls should guarantee that the files will exist with a valid header after a crash.

Repair does not automatically drop collections if the underlying files are missing, so in this case there is not much repair can do because the expected files don't exist.

Comment by Dharshan Rangegowda [ 28/May/15 ]

Hi Michael,

It is ext4 running on a RAID 0. Fairly standard options. I will provide more details soon

When is the fysnc call made? does it make it immediatly or when there is memory pressure on the instance?

Also why was the repair not able to recover the remaining data?

Comment by Michael Cahill (Inactive) [ 28/May/15 ]

Hi dharshanr@scalegrid.net, can you please tell us more about the filesystem you are using, including mount options?

At the moment, it looks to us as if either the fsync calls that WiredTiger makes are not being respected by the filesystem across a crash and restart, or that some files were somehow removed after they were created.

Comment by Dharshan Rangegowda [ 26/May/15 ]

Hi Ramon,

The mongod process is the only one that accesses this directory -so there was no restore.

So you mean the databases recorded the presence of the file but did not complete the write operation? Is there a way to force mongod with wiredtiger to flush fully to disk?

The only thing that happened was that the servers were stopped from outside. The server is a VM in Azure and the disk is a RAID0 disk with two underlying disks.

Comment by Ramon Fernandez Marina [ 26/May/15 ]

The error message above says that file collection-2--5585092568808516308.wt, which is mentioned in the WiredTiger metadata, is not present in your data path – which seems to be confirmed by the fact that the file is not present in the path you sent.

Since WiredTiger fsync's the database directory when files are created, this scenario can be triggered when the storage layer is not providing enough durability guarantees (i.e.: the fsync is ignored and MongoDB crashes before the file is written). Another possibility restore operations that affected the files in dbpath. Can you please provide more information as to the type and configuration of the storage layer and clarify whether some other process was accessing this dbpath?

Comment by Ramon Fernandez Marina [ 25/May/15 ]

Hi dharshanr@scalegrid.net, thanks for opening this ticket and uploading the data files. I think this could be a duplicate of SERVER-18275, we'll know for sure after analyzing the data files. Stay tuned.

Comment by Dharshan Rangegowda [ 23/May/15 ]

I tried to do a repair and it failed as well

[root@SG-azwtrs3-1341 mongodb_data]# mongod --dbpath /mongodb_data --repair --repairpath /mongodb_data/repair --storageEngine wiredTiger
2015-05-23T20:05:54.312+0000 I STORAGE  [initandlisten] wiredtiger_open config: create,cache_size=1G,session_max=20000,eviction=(threads_max=4),statistics=(fast),checkpoint=(wait=60,log_size
=2GB),statistics_log=(wait=0),
2015-05-23T20:05:54.689+0000 I STORAGE  [initandlisten] Repairing size cache
2015-05-23T20:05:54.690+0000 I STORAGE  [initandlisten] WiredTiger progress session.verify 2
2015-05-23T20:05:54.690+0000 I STORAGE  [initandlisten] Verify succeeded on uri table:sizeStorer. Not salvaging.
2015-05-23T20:05:54.690+0000 I STORAGE  [initandlisten] Repairing catalog metadata
2015-05-23T20:05:54.692+0000 I STORAGE  [initandlisten] WiredTiger progress session.verify 2
2015-05-23T20:05:54.692+0000 I STORAGE  [initandlisten] Verify succeeded on uri table:_mdb_catalog. Not salvaging.
2015-05-23T20:05:54.822+0000 I CONTROL  [initandlisten] MongoDB starting : pid=2917 port=27017 dbpath=/mongodb_data 64-bit host=SG-azwtrs3-1341
2015-05-23T20:05:54.822+0000 I CONTROL  [initandlisten] ** WARNING: You are running this process as the root user, which is not recommended.
2015-05-23T20:05:54.822+0000 I CONTROL  [initandlisten]
2015-05-23T20:05:54.822+0000 I CONTROL  [initandlisten]
2015-05-23T20:05:54.822+0000 I CONTROL  [initandlisten] ** WARNING: /sys/kernel/mm/transparent_hugepage/defrag is 'always'.
2015-05-23T20:05:54.822+0000 I CONTROL  [initandlisten] **        We suggest setting it to 'never'
2015-05-23T20:05:54.822+0000 I CONTROL  [initandlisten]
2015-05-23T20:05:54.822+0000 I CONTROL  [initandlisten] ** WARNING: soft rlimits too low. rlimits set to 13281 processes, 65536 files. Number of processes should be at least 32768 : 0.5 time
s number of files.
2015-05-23T20:05:54.822+0000 I CONTROL  [initandlisten]
2015-05-23T20:05:54.822+0000 I CONTROL  [initandlisten] db version v3.0.3
2015-05-23T20:05:54.822+0000 I CONTROL  [initandlisten] git version: b40106b36eecd1b4407eb1ad1af6bc60593c6105
2015-05-23T20:05:54.822+0000 I CONTROL  [initandlisten] OpenSSL version: OpenSSL 1.0.1e-fips 11 Feb 2013
2015-05-23T20:05:54.822+0000 I CONTROL  [initandlisten] build info: Linux ip-10-182-86-231 2.6.32-220.el6.x86_64 #1 SMP Wed Nov 9 08:03:13 EST 2011 x86_64 BOOST_LIB_VERSION=1_49
2015-05-23T20:05:54.822+0000 I CONTROL  [initandlisten] allocator: tcmalloc
2015-05-23T20:05:54.822+0000 I CONTROL  [initandlisten] options: { repair: true, storage: { dbPath: "/mongodb_data", engine: "wiredTiger", repairPath: "/mongodb_data/repair" } }
2015-05-23T20:05:54.822+0000 I STORAGE  [initandlisten] repairDatabase local
2015-05-23T20:05:54.822+0000 I STORAGE  [initandlisten] Repairing collection local.me
2015-05-23T20:05:54.823+0000 I STORAGE  [initandlisten] WiredTiger progress session.verify 2
2015-05-23T20:05:54.823+0000 I STORAGE  [initandlisten] Verify succeeded on uri table:collection-0--8071772262508041921. Not salvaging.
2015-05-23T20:05:55.006+0000 I INDEX    [initandlisten] build index on: local.me properties: { v: 1, key: { _id: 1 }, name: "_id_", ns: "local.me" }
2015-05-23T20:05:55.006+0000 I INDEX    [initandlisten]          building index using bulk method
2015-05-23T20:05:55.008+0000 I STORAGE  [initandlisten] Repairing collection local.oplog.rs
2015-05-23T20:05:55.008+0000 E STORAGE  [initandlisten] WiredTiger (2) [1432411555:8889][2917:0x7ff540f58c20], file:collection-2--5585092568808516308.wt, session.verify: /mongodb_data/collec
tion-2--5585092568808516308.wt: No such file or directory
2015-05-23T20:05:55.008+0000 I STORAGE  [initandlisten] Verify failed on uri table:collection-2--5585092568808516308. Running a salvage operation.
2015-05-23T20:05:55.009+0000 E STORAGE  [initandlisten] WiredTiger (2) [1432411555:9336][2917:0x7ff540f58c20], file:collection-2--5585092568808516308.wt, session.salvage: /mongodb_data/colle
ction-2--5585092568808516308.wt: No such file or directory
2015-05-23T20:05:55.009+0000 I -        [initandlisten] Invariant failure rs.get() src/mongo/db/catalog/database.cpp 186
2015-05-23T20:05:55.143+0000 I CONTROL  [initandlisten]
 0xf6a889 0xf08321 0xeec092 0x911291 0x913073 0x915d50 0xc01900 0x7f4484 0x7f93c4 0x3f3201ed5d 0x7f1bbd
----- BEGIN BACKTRACE -----
{"backtrace":[{"b":"400000","o":"B6A889"},{"b":"400000","o":"B08321"},{"b":"400000","o":"AEC092"},{"b":"400000","o":"511291"},{"b":"400000","o":"513073"},{"b":"400000","o":"515D50"},{"b":"40
0000","o":"801900"},{"b":"400000","o":"3F4484"},{"b":"400000","o":"3F93C4"},{"b":"3F32000000","o":"1ED5D"},{"b":"400000","o":"3F1BBD"}],"processInfo":{ "mongodbVersion" : "3.0.3", "gitVersio
n" : "b40106b36eecd1b4407eb1ad1af6bc60593c6105", "uname" : { "sysname" : "Linux", "release" : "2.6.32-431.29.2.el6.x86_64", "version" : "#1 SMP Tue Sep 9 21:36:05 UTC 2014", "machine" : "x86
_64" }, "somap" : [ { "elfType" : 2, "b" : "400000", "buildId" : "108A63CA14A4BD5E599BAC10885DBD3A85DA5439" }, { "b" : "7FFF1E0AD000", "elfType" : 3, "buildId" : "5474F0D8DAF3D6177E2C4B06F38
92745CB43B4D5" }, { "path" : "/lib64/libpthread.so.0", "elfType" : 3, "buildId" : "B8DFF8E53D9F2B80C3C382E83EC17C828B536A39" }, { "path" : "/usr/lib64/libssl.so.10", "elfType" : 3, "buildId"
 : "934508308DAF0D5C61E9997463F0D8B0A3F096BA" }, { "path" : "/usr/lib64/libcrypto.so.10", "elfType" : 3, "buildId" : "A4329A30669C783FA8DEEB7D1EA83749A8FA14E1" }, { "path" : "/lib64/librt.so
.1", "elfType" : 3, "buildId" : "583411D8786F86A1D6B8741C502831E6122445A7" }, { "path" : "/lib64/libdl.so.2", "elfType" : 3, "buildId" : "454F8FC6CC6502C6401E5F9E221564D80665D277" }, { "path
" : "/usr/lib64/libstdc++.so.6", "elfType" : 3, "buildId" : "ED99110E629209C5CA6C0ED704F2C5CE3171513A" }, { "path" : "/lib64/libm.so.6", "elfType" : 3, "buildId" : "C9A87F6A29ED1D3CB18F53984
5A45FE3A9877FF1" }, { "path" : "/lib64/libgcc_s.so.1", "elfType" : 3, "buildId" : "A44499D29B114A5366CD72DD4883958495AC1C1D" }, { "path" : "/lib64/libc.so.6", "elfType" : 3, "buildId" : "6EF
E254F4564519BBB80889534FAC3D61C18C387" }, { "path" : "/lib64/ld-linux-x86-64.so.2", "elfType" : 3, "buildId" : "6F8E59B70E469F3A924A268911FF8FD0C37E7460" }, { "path" : "/lib64/libgssapi_krb5
.so.2", "elfType" : 3, "buildId" : "54BA6B78A9220344E77463947215E42F0EABCC62" }, { "path" : "/lib64/libkrb5.so.3", "elfType" : 3, "buildId" : "6797403AA5F8FAD8ADFF683478B45F528CE4FB0E" }, {
"path" : "/lib64/libcom_err.so.2", "elfType" : 3, "buildId" : "8CE28F280150E62296240E70ECAC64E4A57AB826" }, { "path" : "/lib64/libk5crypto.so.3", "elfType" : 3, "buildId" : "05733977F4E41652
B86070B27A0CFC2C1EA7719D" }, { "path" : "/lib64/libz.so.1", "elfType" : 3, "buildId" : "5FA8E5038EC04A774AF72A9BB62DC86E1049C4D6" }, { "path" : "/lib64/libkrb5support.so.0", "elfType" : 3, "
buildId" : "E3FA235F3BA3F776A01A18ECA737C9890F445923" }, { "path" : "/lib64/libkeyutils.so.1", "elfType" : 3, "buildId" : "AF374BAFB7F5B139A0B431D3F06D82014AFF3251" }, { "path" : "/lib64/lib
resolv.so.2", "elfType" : 3, "buildId" : "58B696478044E028A5970D48A4ED50E164B43B36" }, { "path" : "/lib64/libselinux.so.1", "elfType" : 3, "buildId" : "E6798A06BEE17CF102BBA44FD512FF8B805CEA
F1" } ] }}
 mongod(_ZN5mongo15printStackTraceERSo+0x29) [0xf6a889]
 mongod(_ZN5mongo10logContextEPKc+0xE1) [0xf08321]
 mongod(_ZN5mongo15invariantFailedEPKcS1_j+0xB2) [0xeec092]
 mongod(_ZN5mongo8Database30_getOrCreateCollectionInstanceEPNS_16OperationContextERKNS_10StringDataE+0xE1) [0x911291]
 mongod(_ZN5mongo8DatabaseC1EPNS_16OperationContextERKNS_10StringDataEPNS_20DatabaseCatalogEntryE+0x1E3) [0x913073]
 mongod(_ZN5mongo14DatabaseHolder6openDbEPNS_16OperationContextERKNS_10StringDataEPb+0x150) [0x915d50]
 mongod(_ZN5mongo14repairDatabaseEPNS_16OperationContextEPNS_13StorageEngineERKSsbb+0x730) [0xc01900]
 mongod(_ZN5mongo13initAndListenEi+0xA44) [0x7f4484]
 mongod(main+0x134) [0x7f93c4]
 libc.so.6(__libc_start_main+0xFD) [0x3f3201ed5d]
 mongod(+0x3F1BBD) [0x7f1bbd]
-----  END BACKTRACE  -----
2015-05-23T20:05:55.144+0000 I -        [initandlisten]
 
***aborting after invariant() failure

Comment by Dharshan Rangegowda [ 23/May/15 ]

A zip of the data files is attached.

Generated at Thu Feb 08 03:48:19 UTC 2024 using Jira 9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66.