[SERVER-67486] "command failed because of stale config" failure in catalog shard POC Created: 23/Jun/22  Updated: 29/Oct/23  Resolved: 06/Jul/22

Status: Closed
Project: Core Server
Component/s: None
Affects Version/s: None
Fix Version/s: None

Type: Task Priority: Major - P3
Reporter: Andrew Shuvalov (Inactive) Assignee: Kshitij Gupta
Resolution: Fixed Votes: 0
Labels: sharding-nyc-subteam2, sharding-nyc-subteam2-catalog-poc
Remaining Estimate: Not Specified
Time Spent: Not Specified
Original Estimate: Not Specified

Issue Links:
Depends
is depended on by SERVER-65662 Investigate fail point timeout in cat... Closed
Backwards Compatibility: Fully Compatible
Participants:

 Description   

[js_test:agg_mongos_secondaryok] uncaught exception: Error: assert failed : aggregate command failed: {
[js_test:agg_mongos_secondaryok] 	"ok" : 0,
[js_test:agg_mongos_secondaryok] 	"errmsg" : "command failed because of stale config :: caused by :: sharding status of collection test.user is not currently available for description and needs to be recovered from the config server",
[js_test:agg_mongos_secondaryok] 	"code" : 13388,
[js_test:agg_mongos_secondaryok] 	"codeName" : "StaleConfig",
[js_test:agg_mongos_secondaryok] 	"ns" : "test.user",
[js_test:agg_mongos_secondaryok] 	"vReceived" : {
[js_test:agg_mongos_secondaryok] 		"e" : ObjectId("000000000000000000000000"),
[js_test:agg_mongos_secondaryok] 		"t" : Timestamp(0, 0),
[js_test:agg_mongos_secondaryok] 		"v" : Timestamp(0, 0)
[js_test:agg_mongos_secondaryok] 	},
[js_test:agg_mongos_secondaryok] 	"shardId" : "agg_mongos_secondaryok-rs0",
[js_test:agg_mongos_secondaryok] 	"$clusterTime" : {
[js_test:agg_mongos_secondaryok] 		"clusterTime" : Timestamp(1655995468, 11),
[js_test:agg_mongos_secondaryok] 		"signature" : {
[js_test:agg_mongos_secondaryok] 			"hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
[js_test:agg_mongos_secondaryok] 			"keyId" : NumberLong(0)
[js_test:agg_mongos_secondaryok] 		}
[js_test:agg_mongos_secondaryok] 	},
[js_test:agg_mongos_secondaryok] 	"operationTime" : Timestamp(1655995468, 11)
[js_test:agg_mongos_secondaryok] } :
[js_test:agg_mongos_secondaryok] doassert@src/mongo/shell/assert.js:20:14
[js_test:agg_mongos_secondaryok] assert@src/mongo/shell/assert.js:151:17
[js_test:agg_mongos_secondaryok] doTest@jstests/sharding/agg_mongos_secondaryok.js:33:11
[js_test:agg_mongos_secondaryok] @jstests/sharding/agg_mongos_secondaryok.js:44:7
[js_test:agg_mongos_secondaryok] @jstests/sharding/agg_mongos_secondaryok.js:48:3
[js_test:agg_mongos_secondaryok] failed to load: jstests/sharding/agg_mongos_secondaryok.js

Similar failure in:

jstests/sharding/agg_out_rc_available.js
jstests/sharding/balance_repl.js
jstests/sharding/change_streams_primary_shard_unaware.js
jstests/sharding/change_streams_update_lookup_shard_metadata_missing.js
jstests/sharding/count_secondaryok.js



 Comments   
Comment by Kelsey Schubert [ 23/Aug/22 ]

Thanks for updating!

Comment by Kelsey Schubert [ 22/Aug/22 ]

kshitij.gupta@mongodb.com, I think this should have no fixversion because it was committed to a non-release branch. Can you confirm/update?

Comment by Andrew Witten (Inactive) [ 28/Jun/22 ]

jstests/sharding/agg_out_rc_available.js

 

jstests/sharding/collstats_returns_orphan_count.js also fails with this error.

The output of line 44 (db.runCommand({'collStats': noColl.getFullName()})) is:

[js_test:collstats_returns_orphan_count] [jsTest] collstats res: {
[js_test:collstats_returns_orphan_count] [jsTest] 	"ok" : 0,
[js_test:collstats_returns_orphan_count] [jsTest] 	"errmsg" : "got stale shardVersion response from shard collstats_returns_orphan_count-rs0 at host ip-10-122-1-232:20020 :: caused by :: sharding status of collection db.db.unusedColl is not currently known and needs to be recovered",
[js_test:collstats_returns_orphan_count] [jsTest] 	"code" : 13388,
[js_test:collstats_returns_orphan_count] [jsTest] 	"codeName" : "StaleConfig",
[js_test:collstats_returns_orphan_count] [jsTest] 	"ns" : "db.db.unusedColl",
[js_test:collstats_returns_orphan_count] [jsTest] 	"vReceived" : {
[js_test:collstats_returns_orphan_count] [jsTest] 		"e" : ObjectId("000000000000000000000000"),
[js_test:collstats_returns_orphan_count] [jsTest] 		"t" : Timestamp(0, 0),
[js_test:collstats_returns_orphan_count] [jsTest] 		"v" : Timestamp(0, 0)
[js_test:collstats_returns_orphan_count] [jsTest] 	},
[js_test:collstats_returns_orphan_count] [jsTest] 	"shardId" : "collstats_returns_orphan_count-rs0",
[js_test:collstats_returns_orphan_count] [jsTest] 	"$clusterTime" : {
[js_test:collstats_returns_orphan_count] [jsTest] 		"clusterTime" : Timestamp(1656431248, 24),
[js_test:collstats_returns_orphan_count] [jsTest] 		"signature" : {
[js_test:collstats_returns_orphan_count] [jsTest] 			"hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
[js_test:collstats_returns_orphan_count] [jsTest] 			"keyId" : NumberLong(0)
[js_test:collstats_returns_orphan_count] [jsTest] 		}
[js_test:collstats_returns_orphan_count] [jsTest] 	},
[js_test:collstats_returns_orphan_count] [jsTest] 	"operationTime" : Timestamp(1656431248, 24)
[js_test:collstats_returns_orphan_count] [jsTest] }

Comment by Andrew Shuvalov (Inactive) [ 23/Jun/22 ]

jstests/sharding/change_streams_update_lookup_shard_metadata_missing.js
jstests/sharding/commands_that_write_accept_wc_shards.js
... and so on

Comment by Andrew Shuvalov (Inactive) [ 23/Jun/22 ]

I think the stale config is also the root cause of:
jstests/sharding/change_streams_primary_shard_unaware.js
 
The failure is slightly different:

[js_test:change_streams_primary_shard_unaware] uncaught exception: Error: command failed: {
[js_test:change_streams_primary_shard_unaware] 	"ok" : 0,
[js_test:change_streams_primary_shard_unaware] 	"errmsg" : "sharding status of collection config.system.sessions is not currently available for description and needs to be recovered from the config server",
[js_test:change_streams_primary_shard_unaware] 	"code" : 13388,
[js_test:change_streams_primary_shard_unaware] 	"codeName" : "StaleConfig",
[js_test:change_streams_primary_shard_unaware] 	"ns" : "config.system.sessions",
[js_test:change_streams_primary_shard_unaware] 	"vReceived" : {
[js_test:change_streams_primary_shard_unaware] 		"e" : ObjectId("62b490f222bbd4a48a52309e"),
[js_test:change_streams_primary_shard_unaware] 		"t" : Timestamp(1656000754, 23),
[js_test:change_streams_primary_shard_unaware] 		"v" : Timestamp(1, 0)
[js_test:change_streams_primary_shard_unaware] 	},
[js_test:change_streams_primary_shard_unaware] 	"shardId" : "change_streams_primary_shard_unaware-rs0",
[js_test:change_streams_primary_shard_unaware] 	"$clusterTime" : {
[js_test:change_streams_primary_shard_unaware] 		"clusterTime" : Timestamp(1656000805, 5),
[js_test:change_streams_primary_shard_unaware] 		"signature" : {
[js_test:change_streams_primary_shard_unaware] 			"hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
[js_test:change_streams_primary_shard_unaware] 			"keyId" : NumberLong(0)
[js_test:change_streams_primary_shard_unaware] 		}
[js_test:change_streams_primary_shard_unaware] 	},
[js_test:change_streams_primary_shard_unaware] 	"operationTime" : Timestamp(1656000805, 5)
[js_test:change_streams_primary_shard_unaware] } with original command request: {
[js_test:change_streams_primary_shard_unaware] 	"query" : {
[js_test:change_streams_primary_shard_unaware] 		"aggregate" : "system.sessions",
[js_test:change_streams_primary_shard_unaware] 		"pipeline" : [
[js_test:change_streams_primary_shard_unaware] 			{
[js_test:change_streams_primary_shard_unaware] 				"$indexStats" : {
[js_test:change_streams_primary_shard_unaware] 
[js_test:change_streams_primary_shard_unaware] 				}
[js_test:change_streams_primary_shard_unaware] 			},
[js_test:change_streams_primary_shard_unaware] 			{
[js_test:change_streams_primary_shard_unaware] 				"$group" : {
[js_test:change_streams_primary_shard_unaware] 					"_id" : "$shard",
[js_test:change_streams_primary_shard_unaware] 					"indexes" : {
[js_test:change_streams_primary_shard_unaware] 						"$push" : {
[js_test:change_streams_primary_shard_unaware] 							"spec" : "$spec"
[js_test:change_streams_primary_shard_unaware] 						}
[js_test:change_streams_primary_shard_unaware] 					}
[js_test:change_streams_primary_shard_unaware] 				}
[js_test:change_streams_primary_shard_unaware] 			},
[js_test:change_streams_primary_shard_unaware] 			{
[js_test:change_streams_primary_shard_unaware] 				"$project" : {
[js_test:change_streams_primary_shard_unaware] 					"_id" : 0,
[js_test:change_streams_primary_shard_unaware] 					"shard" : "$_id",
[js_test:change_streams_primary_shard_unaware] 					"indexes" : 1
[js_test:change_streams_primary_shard_unaware] 				}
[js_test:change_streams_primary_shard_unaware] 			}
[js_test:change_streams_primary_shard_unaware] 		],
[js_test:change_streams_primary_shard_unaware] 		"readConcern" : {
[js_test:change_streams_primary_shard_unaware] 			"level" : "local"
[js_test:change_streams_primary_shard_unaware] 		},
[js_test:change_streams_primary_shard_unaware] 		"cursor" : {
[js_test:change_streams_primary_shard_unaware] 
[js_test:change_streams_primary_shard_unaware] 		},
[js_test:change_streams_primary_shard_unaware] 		"lsid" : {
[js_test:change_streams_primary_shard_unaware] 			"id" : UUID("ab1a3207-a8bf-494d-b9d3-e6f3c262fdf7")
[js_test:change_streams_primary_shard_unaware] 		},
[js_test:change_streams_primary_shard_unaware] 		"$clusterTime" : {
[js_test:change_streams_primary_shard_unaware] 			"clusterTime" : Timestamp(1656000805, 5),
[js_test:change_streams_primary_shard_unaware] 			"signature" : {
[js_test:change_streams_primary_shard_unaware] 				"hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
[js_test:change_streams_primary_shard_unaware] 				"keyId" : NumberLong(0)
[js_test:change_streams_primary_shard_unaware] 			}
[js_test:change_streams_primary_shard_unaware] 		}
[js_test:change_streams_primary_shard_unaware] 	},
[js_test:change_streams_primary_shard_unaware] 	"$readPreference" : {
[js_test:change_streams_primary_shard_unaware] 		"mode" : "primary"
[js_test:change_streams_primary_shard_unaware] 	}
[js_test:change_streams_primary_shard_unaware] } on connection: connection to ip-10-122-2-167:20022 : aggregate failed :
[js_test:change_streams_primary_shard_unaware] _getErrorWithCode@src/mongo/shell/utils.js:24:13
[js_test:change_streams_primary_shard_unaware] doassert@src/mongo/shell/assert.js:18:14
[js_test:change_streams_primary_shard_unaware] _assertCommandWorked@src/mongo/shell/assert.js:760:25
[js_test:change_streams_primary_shard_unaware] assert.commandWorked@src/mongo/shell/assert.js:852:16
[js_test:change_streams_primary_shard_unaware] DB.prototype._runAggregate@src/mongo/shell/db.js:286:12
[js_test:change_streams_primary_shard_unaware] DBCollection.prototype.aggregate@src/mongo/shell/collection.js:974:21
[js_test:change_streams_primary_shard_unaware] getPerShardIndexes@jstests/sharding/libs/sharded_index_util.js:52:14
[js_test:change_streams_primary_shard_unaware] makeGetIndexDocsFunc/<@jstests/libs/override_methods/check_indexes_consistent_across_cluster.js:40:45
[js_test:change_streams_primary_shard_unaware] ShardingTest.prototype.checkIndexesConsistentAcrossCluster@jstests/libs/override_methods/check_indexes_consistent_across_cluster.js:65:42
[js_test:change_streams_primary_shard_unaware] ShardingTest/this.stop@src/mongo/shell/shardingtest.js:384:14
[js_test:change_streams_primary_shard_unaware] @jstests/sharding/change_streams_primary_shard_unaware.js:179:4
[js_test:change_streams_primary_shard_unaware] @jstests/sharding/change_streams_primary_shard_unaware.js:180:3
[js_test:change_streams_primary_shard_unaware] failed to load: jstests/sharding/change_streams_primary_shard_unaware.js

Comment by Andrew Shuvalov (Inactive) [ 23/Jun/22 ]

Also similar in:
 
jstests/sharding/authmr.js
jstests/sharding/balance_repl.js

 

Comment by Andrew Shuvalov (Inactive) [ 23/Jun/22 ]

This seems to be a similar failure:

[js_test:agg_out_rc_available] uncaught exception: Error: command failed: {
[js_test:agg_out_rc_available] 	"ok" : 0,
[js_test:agg_out_rc_available] 	"errmsg" : "PlanExecutor error during aggregation :: caused by :: got stale shardVersion response from shard agg_out_rc_available-rs0 at host ip-10-122-2-167:20020 :: caused by :: sharding status of collection test.output_coll is not currently available for description and needs to be recovered from the config server",
[js_test:agg_out_rc_available] 	"code" : 13388,
[js_test:agg_out_rc_available] 	"codeName" : "StaleConfig",
[js_test:agg_out_rc_available] 	"ns" : "test.output_coll",
[js_test:agg_out_rc_available] 	"vReceived" : {
[js_test:agg_out_rc_available] 		"e" : ObjectId("00000000ffffffffffffffff"),
[js_test:agg_out_rc_available] 		"t" : Timestamp(4294967295, 4294967295),
[js_test:agg_out_rc_available] 		"v" : Timestamp(0, 0)
[js_test:agg_out_rc_available] 	},
[js_test:agg_out_rc_available] 	"shardId" : "agg_out_rc_available-rs0",
[js_test:agg_out_rc_available] 	"$clusterTime" : {
[js_test:agg_out_rc_available] 		"clusterTime" : Timestamp(1655995824, 19),
[js_test:agg_out_rc_available] 		"signature" : {
[js_test:agg_out_rc_available] 			"hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
[js_test:agg_out_rc_available] 			"keyId" : NumberLong(0)
[js_test:agg_out_rc_available] 		}
[js_test:agg_out_rc_available] 	},
[js_test:agg_out_rc_available] 	"operationTime" : Timestamp(1655995824, 18)
[js_test:agg_out_rc_available] } with original command request: {
[js_test:agg_out_rc_available] 	"aggregate" : "input_coll",
[js_test:agg_out_rc_available] 	"pipeline" : [
[js_test:agg_out_rc_available] 		{
[js_test:agg_out_rc_available] 			"$out" : "output_coll"
[js_test:agg_out_rc_available] 		}
[js_test:agg_out_rc_available] 	],
[js_test:agg_out_rc_available] 	"cursor" : {
[js_test:agg_out_rc_available] 
[js_test:agg_out_rc_available] 	},
[js_test:agg_out_rc_available] 	"readConcern" : {
[js_test:agg_out_rc_available] 		"level" : "available"
[js_test:agg_out_rc_available] 	},
[js_test:agg_out_rc_available] 	"lsid" : {
[js_test:agg_out_rc_available] 		"id" : UUID("1568a3d2-760d-459b-8932-0964b753928c")
[js_test:agg_out_rc_available] 	},
[js_test:agg_out_rc_available] 	"$clusterTime" : {
[js_test:agg_out_rc_available] 		"clusterTime" : Timestamp(1655995823, 47),
[js_test:agg_out_rc_available] 		"signature" : {
[js_test:agg_out_rc_available] 			"hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
[js_test:agg_out_rc_available] 			"keyId" : NumberLong(0)
[js_test:agg_out_rc_available] 		}
[js_test:agg_out_rc_available] 	}
[js_test:agg_out_rc_available] } on connection: connection to ip-10-122-2-167:20021 :
[js_test:agg_out_rc_available] _getErrorWithCode@src/mongo/shell/utils.js:24:13
[js_test:agg_out_rc_available] doassert@src/mongo/shell/assert.js:18:14
[js_test:agg_out_rc_available] _assertCommandWorked@src/mongo/shell/assert.js:760:25
[js_test:agg_out_rc_available] assert.commandWorked@src/mongo/shell/assert.js:852:16
[js_test:agg_out_rc_available] @jstests/sharding/agg_out_rc_available.js:26:8
[js_test:agg_out_rc_available] @jstests/sharding/agg_out_rc_available.js:43:3
[js_test:agg_out_rc_available] s20021| {"t":{"$date":"2022-06-23T14:50:24.151+00:00"},"s":"D1", "c":"ASSERT",   "id":23074,   "ctx":"conn6","msg":"User assertion","attr":{"error":"StaleConfig{ ns: \"test.output_coll\", vReceived: { e: ObjectId('00000000ffffffffffffffff'), t: Timestamp(4294967295, 4294967295), v: Timestamp(0, 0) }, shardId: \"agg_out_rc_available-rs0\" }: PlanExecutor error during aggregation :: caused by :: got stale shardVersion response from shard agg_out_rc_available-rs0 at host ip-10-122-2-167:20020 :: caused by :: sharding status of collection test.output_coll is not currently available for description and needs to be recovered from the config server","file":"src/mongo/util/future_impl.h","line":1152}}
[js_test:agg_out_rc_available] failed to load: jstests/sharding/agg_out_rc_available.js

Generated at Thu Feb 08 06:08:15 UTC 2024 using Jira 9.7.1#970001-sha1:2222b88b221c4928ef0de3161136cc90c8356a66.