|
kevin.pulo oh nice catch! I agree - I just looked at the logs in the failure you posted and am seeing a race between a _cloneCollectionIndexesAndOptions getting a ShardNotFound and updating the shard registry:
// shard0 starts _cloneCollectionOptionsFromPrimaryShard which will try to target shard1
|
[ShardedClusterFixture:job0:shard0:node1] 2020-09-16T07:47:05.746+0000 | 2020-09-16T07:47:05.743+00:00 D2 COMMAND 21965 [conn49] "About to run the command","attr":{"db":"test","commandArgs":{"_cloneCollectionOptionsFromPrimaryShard":"test.fts_diacriticsensitive","primaryShard":"shard-rs1","writeConcern":{"w":"majority","wtimeout":60000},"lsid":{"id":{"$uuid":"193c858a-fad8-406f-ae66-5f900007fcaf"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"$clusterTime":{"clusterTime":{"$timestamp":{"t":1600242425,"i":9}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"$configTime":{"$timestamp":{"t":1600242425,"i":9}},"$topologyTime":{"$timestamp":{"t":1600242409,"i":14}},"$client":{"application":{"name":"MongoDB Shell"},"driver":{"name":"MongoDB Internal Client","version":"4.5.0-3164-g0bf3695-patch-5f61b9fe0ae6067c98def4fb"},"os":{"type":"Linux","name":"Red Hat Enterprise Linux Server release 6.2 (Santiago)","architecture":"x86_64","version":"Kernel 2.6.32-220.el6.x86_64"},"mongos":{"host":"ip-10-122-28-132:20007","client":"127.0.0.1:44550","version":"4.5.0-3164-g0bf3695-patch-5f61b9fe0ae6067c98def4fb"}},"$configServerState":{"opTime":{"ts":{"$timestamp":{"t":1600242425,"i":9}},"t":1}},"$db":"test"}}
|
|
// shard0 starts the RSM for shard1 with all 3 nodes
|
[ShardedClusterFixture:job0:shard0:node1] 2020-09-16T07:47:05.746+0000 | 2020-09-16T07:47:05.744+00:00 I NETWORK 4603701 [ShardRegistry-0] "Starting Replica Set Monitor","attr":{"protocol":"streamable","uri":"shard-rs1/localhost:20004,localhost:20005,localhost:20006"}
|
|
// shard0 hears back from a secondary and marks the others as unknown
|
[ShardedClusterFixture:job0:shard0:node1] 2020-09-16T07:47:05.766+0000 | 2020-09-16T07:47:05.746+00:00 I NETWORK 4333213 [ReplicaSetMonitor-TaskExecutor] "RSM Topology Change","attr":{"replicaSet":"shard-rs1","newTopologyDescription":"{ id: \"0a9b4938-3c06-4cd8-951f-ef855136c62c\", topologyType: \"ReplicaSetNoPrimary\", servers: { localhost:20004: { address: \"localhost:20004\", type: \"Unknown\", minWireVersion: 0, maxWireVersion: 0, lastUpdateTime: new Date(-9223372036854775808), hosts: {}, arbiters: {}, passives: {} }, localhost:20005: { address: \"localhost:20005\", type: \"Unknown\", minWireVersion: 0, maxWireVersion: 0, lastUpdateTime: new Date(-9223372036854775808), hosts: {}, arbiters: {}, passives: {} }, localhost:20006: { address: \"localhost:20006\", topologyVersion: { processId: ObjectId('5f61c2e2f6d5680adfaa44f8'), counter: 5 }, roundTripTime: 826, lastWriteDate: new Date(1600242425000), opTime: { ts: Timestamp(1600242425, 7), t: 1 }, type: \"RSSecondary\", minWireVersion: 10, maxWireVersion: 10, me: \"localhost:20006\", setName: \"shard-rs1\", setVersion: 3, primary: \"localhost:20004\", lastUpdateTime: new Date(1600242425746), logicalSessionTimeoutMinutes: 30, hosts: { 0: \"localhost:20004\", 1: \"localhost:20005\", 2: \"localhost:20006\" }, arbiters: {}, passives: {} } }, logicalSessionTimeoutMinutes: 30, setName: \"shard-rs1\", compatible: true }","previousTopologyDescription":"{ id: \"0a9b4938-3c06-4cd8-951f-ef855136c62c\", topologyType: \"Unknown\", servers: { localhost:20004: { address: \"localhost:20004\", type: \"Unknown\", minWireVersion: 0, maxWireVersion: 0, lastUpdateTime: new Date(-9223372036854775808), hosts: {}, arbiters: {}, passives: {} }, localhost:20005: { address: \"localhost:20005\", type: \"Unknown\", minWireVersion: 0, maxWireVersion: 0, lastUpdateTime: new Date(-9223372036854775808), hosts: {}, arbiters: {}, passives: {} }, localhost:20006: { address: \"localhost:20006\", type: \"Unknown\", minWireVersion: 0, maxWireVersion: 0, lastUpdateTime: new Date(-9223372036854775808), hosts: {}, arbiters: {}, passives: {} } }, compatible: true }"}
|
|
// shard0 updates the shard registry with this info (the host its trying to contact port 20004 is removed)
|
[ShardedClusterFixture:job0:shard0:node1] 2020-09-16T07:47:05.766+0000 | 2020-09-16T07:47:05.747+00:00 I SHARDING 22732 [ShardRegistry-0] "Updating shard connection string on shard registry","attr":{"shardId":"shard-rs1","newShardConnectionString":"shard-rs1/localhost:20006","oldShardConnectionString":"shard-rs1/localhost:20004,localhost:20005,localhost:20006"}
|
|
// shard0 hears back from 20004 who is primary, and we've scheduled an update to the shard registry which has not happened yet
|
[ShardedClusterFixture:job0:shard0:node1] 2020-09-16T07:47:05.766+0000 | 2020-09-16T07:47:05.747+00:00 I NETWORK 4333213 [ReplicaSetMonitor-TaskExecutor] "RSM Topology Change","attr":{"replicaSet":"shard-rs1","newTopologyDescription":"{ id: \"0a9b4938-3c06-4cd8-951f-ef855136c62c\", topologyType: \"ReplicaSetWithPrimary\", servers: { localhost:20004: { address: \"localhost:20004\", topologyVersion: { processId: ObjectId('5f61c2e2d5e7470bb28fa484'), counter: 10 }, roundTripTime: 878, lastWriteDate: new Date(1600242425000), opTime: { ts: Timestamp(1600242425, 8), t: 1 }, type: \"RSPrimary\", minWireVersion: 10, maxWireVersion: 10, me: \"localhost:20004\", setName: \"shard-rs1\", setVersion: 3, electionId: ObjectId('7fffffff0000000000000001'), primary: \"localhost:20004\", lastUpdateTime: new Date(1600242425747), logicalSessionTimeoutMinutes: 30, hosts: { 0: \"localhost:20004\", 1: \"localhost:20005\", 2: \"localhost:20006\" }, arbiters: {}, passives: {} }, localhost:20005: { address: \"localhost:20005\", type: \"Unknown\", minWireVersion: 0, maxWireVersion: 0, lastUpdateTime: new Date(-9223372036854775808), hosts: {}, arbiters: {}, passives: {} }, localhost:20006: { address: \"localhost:20006\", topologyVersion: { processId: ObjectId('5f61c2e2f6d5680adfaa44f8'), counter: 5 }, roundTripTime: 826, lastWriteDate: new Date(1600242425000), opTime: { ts: Timestamp(1600242425, 7), t: 1 }, type: \"RSSecondary\", minWireVersion: 10, maxWireVersion: 10, me: \"localhost:20006\", setName: \"shard-rs1\", setVersion: 3, primary: \"localhost:20004\", lastUpdateTime: new Date(1600242425746), logicalSessionTimeoutMinutes: 30, hosts: { 0: \"localhost:20004\", 1: \"localhost:20005\", 2: \"localhost:20006\" }, arbiters: {}, passives: {} } }, logicalSessionTimeoutMinutes: 30, setName: \"shard-rs1\", compatible: true, maxSetVersion: 3, maxElectionId: ObjectId('7fffffff0000000000000001') }","previousTopologyDescription":"{ id: \"0a9b4938-3c06-4cd8-951f-ef855136c62c\", topologyType: \"ReplicaSetNoPrimary\", servers: { localhost:20004: { address: \"localhost:20004\", type: \"Unknown\", minWireVersion: 0, maxWireVersion: 0, lastUpdateTime: new Date(-9223372036854775808), hosts: {}, arbiters: {}, passives: {} }, localhost:20005: { address: \"localhost:20005\", type: \"Unknown\", minWireVersion: 0, maxWireVersion: 0, lastUpdateTime: new Date(-9223372036854775808), hosts: {}, arbiters: {}, passives: {} }, localhost:20006: { address: \"localhost:20006\", topologyVersion: { processId: ObjectId('5f61c2e2f6d5680adfaa44f8'), counter: 5 }, roundTripTime: 826, lastWriteDate: new Date(1600242425000), opTime: { ts: Timestamp(1600242425, 7), t: 1 }, type: \"RSSecondary\", minWireVersion: 10, maxWireVersion: 10, me: \"localhost:20006\", setName: \"shard-rs1\", setVersion: 3, primary: \"localhost:20004\", lastUpdateTime: new Date(1600242425746), logicalSessionTimeoutMinutes: 30, hosts: { 0: \"localhost:20004\", 1: \"localhost:20005\", 2: \"localhost:20006\" }, arbiters: {}, passives: {} } }, logicalSessionTimeoutMinutes: 30, setName: \"shard-rs1\", compatible: true }"}
|
|
// Update the shard registry
|
[ShardedClusterFixture:job0:shard0:node1] 2020-09-16T07:47:05.766+0000 | 2020-09-16T07:47:05.747+00:00 I SHARDING 471691 [ReplicaSetMonitor-TaskExecutor] "Updating the shard registry with confirmed replica set","attr":{"connectionString":"shard-rs1/localhost:20004,localhost:20005,localhost:20006"}
|
|
[ShardedClusterFixture:job0:shard0:node1] 2020-09-16T07:47:05.767+0000 | 2020-09-16T07:47:05.748+00:00 I SHARDING 22732 [ShardRegistry-0] "Updating shard connection string on shard registry","attr":{"shardId":"shard-rs1","newShardConnectionString":"shard-rs1/localhost:20004,localhost:20005,localhost:20006","oldShardConnectionString":"shard-rs1/localhost:20006"}
|
|
// fail with shard not found
|
[ShardedClusterFixture:job0:shard0:node1] 2020-09-16T07:47:05.767+0000 | 2020-09-16T07:47:05.749+00:00 D1 COMMAND 21962 [conn49] "Assertion while executing command","attr":{"command":"_cloneCollectionOptionsFromPrimaryShard","db":"test","commandArgs":{"_cloneCollectionOptionsFromPrimaryShard":"test.fts_diacriticsensitive","primaryShard":"shard-rs1","writeConcern":{"w":"majority","wtimeout":60000},"lsid":{"id":{"$uuid":"193c858a-fad8-406f-ae66-5f900007fcaf"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"$clusterTime":{"clusterTime":{"$timestamp":{"t":1600242425,"i":9}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"$configTime":{"$timestamp":{"t":1600242425,"i":9}},"$topologyTime":{"$timestamp":{"t":1600242409,"i":14}},"$client":{"application":{"name":"MongoDB Shell"},"driver":{"name":"MongoDB Internal Client","version":"4.5.0-3164-g0bf3695-patch-5f61b9fe0ae6067c98def4fb"},"os":{"type":"Linux","name":"Red Hat Enterprise Linux Server release 6.2 (Santiago)","architecture":"x86_64","version":"Kernel 2.6.32-220.el6.x86_64"},"mongos":{"host":"ip-10-122-28-132:20007","client":"127.0.0.1:44550","version":"4.5.0-3164-g0bf3695-patch-5f61b9fe0ae6067c98def4fb"}},"$configServerState":{"opTime":{"ts":{"$timestamp":{"t":1600242425,"i":9}},"t":1}},"$db":"test"},"error":"ShardNotFound: No shard found for host: localhost:20004"}
|
The time for the logs for updating the shard registry and the command throwing ShardNotFound are at the same time - 2020-09-16T07:47:05.767. I think we'll want to make this update synchronous again!
|