|
Highlighed logs
// Cursor 5131255404875629212 was established on j1:cl0:s1:n2 (localhost:20256)
|
[j1:cl0:s1:n2] | 2023-04-12T09:40:38.900+00:00 I COMMAND 51803 [conn56] "Slow query","attr":{"type":"command","ns":"test.ed_db_cursor_mi","appName":"mongosync, version: 1.2.0-122-g9698ec32, id: shard-rs0, cluster: src, commit: 9698ec32, go version: go1.19.7, compiler: gc","command":{"aggregate":"ed_db_cursor_mi","pipeline":[{"$sample":{"size":0}},{"$project":{"_id":true}}],"allowDiskUse":true,"cursor":{"batchSize":0},"maxTimeMS":300000,"readConcern":{"level":"majority"},"collectionUUID":{"$uuid":"0959e90f-fb90-4529-9dc7-c74216bebe08"},"needsMerge":true,"let":{"CLUSTER_TIME":{"$timestamp":{"t":1681292437,"i":46}},"NOW":{"$date":"2023-04-12T09:40:38.406Z"}},"fromMongos":true,"collation":{"locale":"simple"},"$_generateV2ResumeTokens":false,"writeConcern":{"w":"majority","wtimeout":0,"provenance":"implicitDefault"},"shardVersion":{"t":{"$timestamp":{"t":1681292415,"i":66}},"e":{"$oid":"64367c7fecf82c44e5182052"},"v":{"$timestamp":{"t":1,"i":3}}},"clientOperationKey":{"$uuid":"a6ceb25a-c433-4fb8-ad6a-8fe859a88da7"},"lsid":{"id":{"$uuid":"95ef134b-562d-4f70-b298-31d63f814792"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"maxTimeMSOpOnly":300010,"$readPreference":{"mode":"nearest","hedge":{"enabled":true}},"$clusterTime":{"clusterTime":{"$timestamp":{"t":1681292437,"i":46}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"$configTime":{"$timestamp":{"t":1681292437,"i":45}},"$topologyTime":{"$timestamp":{"t":1681292368,"i":6}},"$client":{"driver":{"name":"mongo-go-driver","version":"v1.11.3"},"os":{"type":"linux","architecture":"amd64"},"platform":"go1.19.7","application":{"name":"mongosync, version: 1.2.0-122-g9698ec32, id: shard-rs0, cluster: src, commit: 9698ec32, go version: go1.19.7, compiler: gc"},"mongos":{"host":"ip-10-128-27-197.ec2.internal:20257","client":"127.0.0.1:45484","version":"6.0.5-103-g9cdf8d1"}},"mayBypassWriteBlocking":false,"$db":"test"},"planSummary":"COLLSCAN","cursorid":5131255404875629212,"keysExamined":0,"docsExamined":0,"numYields":0,"nreturned":0,"queryHash":"EF4BF651","queryFramework":"classic","reslen":307,"locks":{"FeatureCompatibilityVersion":{"acquireCount":{"r":1}},"Global":{"acquireCount":{"r":1}},"Mutex":{"acquireCount":{"r":3}}},"readConcern":{"level":"majority","provenance":"clientSupplied"},"writeConcern":{"w":"majority","wtimeout":0,"provenance":"implicitDefault"},"storage":{},"remote":"127.0.0.1:39506","protocol":"op_msg","durationMillis":493}
|
|
// j1:cl0:s0:n0 executes $mergeCursors or 5131255404875629212 (above) and 2274519845428743609 (localhost:20252).
|
// It fails dues to NotPrimaryOrSecondary from 20252 because it is in the middle of a rollback.
|
[j1:cl0:s0:n0] | 2023-04-12T09:40:38.902+00:00 W COMMAND 23799 [conn89] "Aggregate command executor error","attr":{"error":{"code":13436,"codeName":"NotPrimaryOrSecondary","errmsg":"Error on remote shard localhost:20252 :: caused by :: node is not in primary or recovering state"},"stats":{},"cmd":{"aggregate":"ed_db_cursor_mi","pipeline":[{"$mergeCursors":{"lsid":{"id":{"$uuid":"95ef134b-562d-4f70-b298-31d63f814792"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"sort":{"$rand":-1},"compareWholeSortKey":false,"remotes":[{"shardId":"shard-rs0","hostAndPort":"localhost:20252","cursorResponse":{"cursor":{"id":2274519845428743609,"ns":"test.ed_db_cursor_mi","firstBatch":[]},"ok":1}},{"shardId":"shard-rs1","hostAndPort":"localhost:20256","cursorResponse":{"cursor":{"id":5131255404875629212,"ns":"test.ed_db_cursor_mi","firstBatch":[]},"ok":1}}],"tailableMode":"normal","nss":"test.ed_db_cursor_mi","allowPartialResults":false,"recordRemoteOpWaitTime":true}},{"$replaceRoot":{"newRoot":{"$arrayToObject":[[{"k":{"$const":"_id"},"v":{"$toHashedIndexKey":"$_id"}}]]}}},{"$bucketAuto":{"groupBy":"$$ROOT","buckets":4,"output":{"count":{"$sum":{"$const":1}}}}}],"allowDiskUse":true,"cursor":{"batchSize":101},"maxTimeMS":300000,"readConcern":{"level":"majority"},"collectionUUID":{"$uuid":"0959e90f-fb90-4529-9dc7-c74216bebe08"},"fromMongos":true,"let":{"CLUSTER_TIME":{"$timestamp":{"t":1681292437,"i":46}},"NOW":{"$date":"2023-04-12T09:40:38.406Z"}},"collation":{"locale":"simple"},"shardVersion":{"t":{"$timestamp":{"t":4294967295,"i":4294967295}},"e":{"$oid":"00000000ffffffffffffffff"},"v":{"$timestamp":{"t":0,"i":0}}},"writeConcern":{"w":"majority","wtimeout":0,"provenance":"implicitDefault"},"lsid":{"id":{"$uuid":"95ef134b-562d-4f70-b298-31d63f814792"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"maxTimeMSOpOnly":299515,"$readPreference":{"mode":"nearest","hedge":{"enabled":true}},"$clusterTime":{"clusterTime":{"$timestamp":{"t":1681292437,"i":46}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"$configTime":{"$timestamp":{"t":1681292437,"i":45}},"$topologyTime":{"$timestamp":{"t":1681292368,"i":6}},"$client":{"driver":{"name":"mongo-go-driver","version":"v1.11.3"},"os":{"type":"linux","architecture":"amd64"},"platform":"go1.19.7","application":{"name":"mongosync, version: 1.2.0-122-g9698ec32, id: shard-rs0, cluster: src, commit: 9698ec32, go version: go1.19.7, compiler: gc"},"mongos":{"host":"ip-10-128-27-197.ec2.internal:20257","client":"127.0.0.1:45484","version":"6.0.5-103-g9cdf8d1"}},"mayBypassWriteBlocking":false,"$db":"test"}}
|
|
// The error is propagated back to mongos.
|
[j1:cl0:s] | 2023-04-12T09:40:38.902+00:00 I NETWORK 4712102 [conn58] "Host failed in replica set","attr":{"replicaSet":"shard-rs0","host":"localhost:20251","error":{"code":13436,"codeName":"NotPrimaryOrSecondary","errmsg":"PlanExecutor error during aggregation :: caused by :: Error on remote shard localhost:20252 :: caused by :: node is not in primary or recovering state"},"action":{"dropConnections":false,"requestImmediateCheck":true,"outcome":{"host":"localhost:20251","success":false,"errorMessage":"NotPrimaryOrSecondary: PlanExecutor error during aggregation :: caused by :: Error on remote shard localhost:20252 :: caused by :: node is not in primary or recovering state"}}}
|
|
// Mongos appears to retry the same $mergeCursors. But it finds CursorNotFound (cursor 5131255404875629212, localhost:20256)
|
// Did the previous $mergeCursors failure cause the merging shard to kill the cursors?
|
[j1:cl0:s0:n1] | 2023-04-12T09:40:41.514+00:00 W COMMAND 23799 [conn54] "Aggregate command executor error","attr":{"error":{"code":43,"codeName":"CursorNotFound","errmsg":"Error on remote shard localhost:20256 :: caused by :: cursor id 5131255404875629212 not found"},"stats":{},"cmd":{"aggregate":"ed_db_cursor_mi","pipeline":[{"$mergeCursors":{"lsid":{"id":{"$uuid":"95ef134b-562d-4f70-b298-31d63f814792"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"sort":{"$rand":-1},"compareWholeSortKey":false,"remotes":[{"shardId":"shard-rs0","hostAndPort":"localhost:20252","cursorResponse":{"cursor":{"id":2274519845428743609,"ns":"test.ed_db_cursor_mi","firstBatch":[]},"ok":1}},{"shardId":"shard-rs1","hostAndPort":"localhost:20256","cursorResponse":{"cursor":{"id":5131255404875629212,"ns":"test.ed_db_cursor_mi","firstBatch":[]},"ok":1}}],"tailableMode":"normal","nss":"test.ed_db_cursor_mi","allowPartialResults":false,"recordRemoteOpWaitTime":true}},{"$replaceRoot":{"newRoot":{"$arrayToObject":[[{"k":{"$const":"_id"},"v":{"$toHashedIndexKey":"$_id"}}]]}}},{"$bucketAuto":{"groupBy":"$$ROOT","buckets":4,"output":{"count":{"$sum":{"$const":1}}}}}],"allowDiskUse":true,"cursor":{"batchSize":101},"maxTimeMS":300000,"readConcern":{"level":"majority"},"collectionUUID":{"$uuid":"0959e90f-fb90-4529-9dc7-c74216bebe08"},"fromMongos":true,"let":{"CLUSTER_TIME":{"$timestamp":{"t":1681292437,"i":46}},"NOW":{"$date":"2023-04-12T09:40:38.406Z"}},"collation":{"locale":"simple"},"shardVersion":{"t":{"$timestamp":{"t":4294967295,"i":4294967295}},"e":{"$oid":"00000000ffffffffffffffff"},"v":{"$timestamp":{"t":0,"i":0}}},"writeConcern":{"w":"majority","wtimeout":0,"provenance":"implicitDefault"},"lsid":{"id":{"$uuid":"95ef134b-562d-4f70-b298-31d63f814792"},"uid":{"$binary":{"base64":"47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=","subType":"0"}}},"maxTimeMSOpOnly":296902,"$readPreference":{"mode":"nearest","hedge":{"enabled":true}},"$clusterTime":{"clusterTime":{"$timestamp":{"t":1681292439,"i":5}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"$configTime":{"$timestamp":{"t":1681292439,"i":2}},"$topologyTime":{"$timestamp":{"t":1681292368,"i":6}},"$client":{"driver":{"name":"mongo-go-driver","version":"v1.11.3"},"os":{"type":"linux","architecture":"amd64"},"platform":"go1.19.7","application":{"name":"mongosync, version: 1.2.0-122-g9698ec32, id: shard-rs0, cluster: src, commit: 9698ec32, go version: go1.19.7, compiler: gc"},"mongos":{"host":"ip-10-128-27-197.ec2.internal:20257","client":"127.0.0.1:45484","version":"6.0.5-103-g9cdf8d1"}},"mayBypassWriteBlocking":false,"$db":"test"}}
|
|
// CursorNotFound is returned to the client, rather than the NotPrimaryOrSecondary that would have been returned in a replica set environment.
|
[j1:cl0:s] | 2023-04-12T09:40:41.515+00:00 I COMMAND 51803 [conn58] "Slow query","attr":{"type":"command","ns":"test.ed_db_cursor_mi","appName":"mongosync, version: 1.2.0-122-g9698ec32, id: shard-rs0, cluster: src, commit: 9698ec32, go version: go1.19.7, compiler: gc","command":{"aggregate":"ed_db_cursor_mi","collectionUUID":{"$uuid":"0959e90f-fb90-4529-9dc7-c74216bebe08"},"pipeline":[{"$sample":{"size":0}},{"$replaceWith":{"$arrayToObject":[[{"k":"_id","v":{"$toHashedIndexKey":"$_id"}}]]}},{"$bucketAuto":{"groupBy":"$$ROOT","buckets":4}}],"allowDiskUse":true,"cursor":{},"readConcern":{"level":"majority"},"lsid":{"id":{"$uuid":"95ef134b-562d-4f70-b298-31d63f814792"}},"$clusterTime":{"clusterTime":{"$timestamp":{"t":1681292437,"i":46}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"maxTimeMS":300000,"$db":"test","$readPreference":{"mode":"nearest"}},"nShards":2,"numYields":0,"ok":0,"errMsg":"PlanExecutor error during aggregation :: caused by :: Error on remote shard localhost:20256 :: caused by :: cursor id 5131255404875629212 not found","errName":"CursorNotFound","errCode":43,"reslen":362,"readConcern":{"level":"majority","provenance":"clientSupplied"},"remote":"127.0.0.1:45484","protocol":"op_msg","durationMillis":3109}
|
I suspect that the mongos is retrying a $mergeCursors after a failure, but the underlaying cursors might have been killed by the previous attempt. Thus CursorNotFound is returned to the client, which did not expect that. (In a replica set, it would have received NotPrimaryOrSecondary, which it is prepared to handle. Reassigning to query execution.
|