-
Type: Bug
-
Resolution: Cannot Reproduce
-
Priority: Major - P3
-
None
-
Affects Version/s: 2.1.1, 2.2.1
-
Component/s: None
-
Labels:None
-
Environment:scalaVersion := "2.11.6"
SparkVersion :="2.1.0" / " 2.2.0"/ "2.3.0"
I am trying to read a huge complex document from MongoDB into spark data frame. When I convert this db to json, It works. But If I directly read from MongoDB I am getting the following error : *com.mongodb.spark.exceptions.MongoTypeConversionException: Cannot cast STRING into a DoubleType (value: BsonString{value='NaN'}*
Able to read into DF and do all the processing. Getting error when I try to show it or write to a json/csv.
at mongo$.main(mongo.scala:14) -- df.show() 18/04/03 12:08:24 INFO DAGScheduler: ShuffleMapStage 1 (show at mongo.scala:14) failed in 15.240 s due to Job aborted due to stage failure: Task 109 in stage 1.0 failed 1 times, most recent failure: Lost task 109.0 in stage 1.0 (TID 110, localhost, executor driver): com.mongodb.spark.exceptions.MongoTypeConversionException: Cannot cast STRING into a DoubleType (value: BsonString\{value='NaN'\}) at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:83) at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:39) at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:37) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186) at scala.collection.TraversableLike$class.map(TraversableLike.scala:245) at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186) at com.mongodb.spark.sql.MapFunctions$.documentToRow(MapFunctions.scala:37) at com.mongodb.spark.sql.MapFunctions$.castToStructType(MapFunctions.scala:112) at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:77) at com.mongodb.spark.sql.MapFunctions$$anonfun$com$mongodb$spark$sql$MapFunctions$$convertToDataType$2.apply(MapFunctions.scala:67) at com.mongodb.spark.sql.MapFunctions$$anonfun$com$mongodb$spark$sql$MapFunctions$$convertToDataType$2.apply(MapFunctions.scala:67) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.Iterator$class.foreach(Iterator.scala:750) at scala.collection.AbstractIterator.foreach(Iterator.scala:1202) at scala.collection.IterableLike$class.foreach(IterableLike.scala:72) at scala.collection.AbstractIterable.foreach(Iterable.scala:54) at scala.collection.TraversableLike$class.map(TraversableLike.scala:245) at scala.collection.AbstractTraversable.map(Traversable.scala:104) at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:67) at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:39) at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:37) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186) at scala.collection.TraversableLike$class.map(TraversableLike.scala:245) at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186) at com.mongodb.spark.sql.MapFunctions$.documentToRow(MapFunctions.scala:37) at com.mongodb.spark.sql.MapFunctions$.castToStructType(MapFunctions.scala:112) at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:77) at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:39) at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:37) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186) at scala.collection.TraversableLike$class.map(TraversableLike.scala:245) at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186) at com.mongodb.spark.sql.MapFunctions$.documentToRow(MapFunctions.scala:37) at com.mongodb.spark.sql.MongoRelation$$anonfun$buildScan$2.apply(MongoRelation.scala:45) at com.mongodb.spark.sql.MongoRelation$$anonfun$buildScan$2.apply(MongoRelation.scala:45) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at scala.collection.Iterator$$anon$10.next(Iterator.scala:354) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:149) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53) at org.apache.spark.scheduler.Task.run(Task.scala:109) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:748) Driver stacktrace: 18/04/03 12:08:24 INFO DAGScheduler: Job 1 failed: show at mongo.scala:14, took 15.569032 s Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 109 in stage 1.0 failed 1 times, most recent failure: Lost task 109.0 in stage 1.0 (TID 110, localhost, executor driver): com.mongodb.spark.exceptions.MongoTypeConversionException: Cannot cast STRING into a DoubleType (value: BsonString{value='NaN'}) at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:83) at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:39) at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:37) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186) at scala.collection.TraversableLike$class.map(TraversableLike.scala:245) at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186) at com.mongodb.spark.sql.MapFunctions$.documentToRow(MapFunctions.scala:37) at com.mongodb.spark.sql.MapFunctions$.castToStructType(MapFunctions.scala:112) at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:77) at com.mongodb.spark.sql.MapFunctions$$anonfun$com$mongodb$spark$sql$MapFunctions$$convertToDataType$2.apply(MapFunctions.scala:67) at com.mongodb.spark.sql.MapFunctions$$anonfun$com$mongodb$spark$sql$MapFunctions$$convertToDataType$2.apply(MapFunctions.scala:67) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.Iterator$class.foreach(Iterator.scala:750) at scala.collection.AbstractIterator.foreach(Iterator.scala:1202) at scala.collection.IterableLike$class.foreach(IterableLike.scala:72) at scala.collection.AbstractIterable.foreach(Iterable.scala:54) at scala.collection.TraversableLike$class.map(TraversableLike.scala:245) at scala.collection.AbstractTraversable.map(Traversable.scala:104) at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:67) at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:39) at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:37) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186) at scala.collection.TraversableLike$class.map(TraversableLike.scala:245) at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186) at com.mongodb.spark.sql.MapFunctions$.documentToRow(MapFunctions.scala:37) at com.mongodb.spark.sql.MapFunctions$.castToStructType(MapFunctions.scala:112) at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:77) at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:39) at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:37) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186) at scala.collection.TraversableLike$class.map(TraversableLike.scala:245) at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186) at com.mongodb.spark.sql.MapFunctions$.documentToRow(MapFunctions.scala:37) at com.mongodb.spark.sql.MongoRelation$$anonfun$buildScan$2.apply(MongoRelation.scala:45) at com.mongodb.spark.sql.MongoRelation$$anonfun$buildScan$2.apply(MongoRelation.scala:45) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at scala.collection.Iterator$$anon$10.next(Iterator.scala:354) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:149) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53) at org.apache.spark.scheduler.Task.run(Task.scala:109) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:748) Driver stacktrace: at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831) at scala.Option.foreach(Option.scala:257) at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758) at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067) at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:363) at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38) at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3272) at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484) at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484) at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3253) at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77) at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3252) at org.apache.spark.sql.Dataset.head(Dataset.scala:2484) at org.apache.spark.sql.Dataset.take(Dataset.scala:2698) at org.apache.spark.sql.Dataset.showString(Dataset.scala:254) at org.apache.spark.sql.Dataset.show(Dataset.scala:723) at org.apache.spark.sql.Dataset.show(Dataset.scala:682) at org.apache.spark.sql.Dataset.show(Dataset.scala:691) at mongo$.main(mongo.scala:14) at mongo.main(mongo.scala) Caused by: com.mongodb.spark.exceptions.MongoTypeConversionException: Cannot cast STRING into a DoubleType (value: BsonString{value='NaN'}) at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:83) at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:39) at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:37) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186) at scala.collection.TraversableLike$class.map(TraversableLike.scala:245) at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186) at com.mongodb.spark.sql.MapFunctions$.documentToRow(MapFunctions.scala:37) at com.mongodb.spark.sql.MapFunctions$.castToStructType(MapFunctions.scala:112) at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:77) at com.mongodb.spark.sql.MapFunctions$$anonfun$com$mongodb$spark$sql$MapFunctions$$convertToDataType$2.apply(MapFunctions.scala:67) at com.mongodb.spark.sql.MapFunctions$$anonfun$com$mongodb$spark$sql$MapFunctions$$convertToDataType$2.apply(MapFunctions.scala:67) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.Iterator$class.foreach(Iterator.scala:750) at scala.collection.AbstractIterator.foreach(Iterator.scala:1202) at scala.collection.IterableLike$class.foreach(IterableLike.scala:72) at scala.collection.AbstractIterable.foreach(Iterable.scala:54) at scala.collection.TraversableLike$class.map(TraversableLike.scala:245) at scala.collection.AbstractTraversable.map(Traversable.scala:104) at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:67) at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:39) at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:37) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186) at scala.collection.TraversableLike$class.map(TraversableLike.scala:245) at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186) at com.mongodb.spark.sql.MapFunctions$.documentToRow(MapFunctions.scala:37) at com.mongodb.spark.sql.MapFunctions$.castToStructType(MapFunctions.scala:112) at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:77) at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:39) at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:37) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186) at scala.collection.TraversableLike$class.map(TraversableLike.scala:245) at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186) at com.mongodb.spark.sql.MapFunctions$.documentToRow(MapFunctions.scala:37) at com.mongodb.spark.sql.MongoRelation$$anonfun$buildScan$2.apply(MongoRelation.scala:45) at com.mongodb.spark.sql.MongoRelation$$anonfun$buildScan$2.apply(MongoRelation.scala:45) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at scala.collection.Iterator$$anon$10.next(Iterator.scala:354) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:149) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53) at org.apache.spark.scheduler.Task.run(Task.scala:109) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:748) 18/04/03 12:08:24 INFO SparkContext: Invoking stop() from shutdown hook