Spark Connector / SPARK-172

Unable to show or write a Spark DataFrame read via the MongoDB Spark Connector

    • Type: Bug
    • Resolution: Cannot Reproduce
    • Priority: Major - P3
    • Fix Version/s: None
    • Affects Version/s: 2.1.1, 2.2.1
    • Component/s: None
    • Labels: None
    • Environment:
      scalaVersion := "2.11.6"
      sparkVersion := "2.1.0" / "2.2.0" / "2.3.0"
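      For reference, a build.sbt along these lines would match the reported environment (a sketch; the actual build file was not provided, and the connector version is taken from the Affects Version/s field):

      {code:scala}
      // Hypothetical build.sbt reconstructing the reported environment.
      scalaVersion := "2.11.6"

      val sparkVersion = "2.2.0" // the issue was reported against 2.1.0, 2.2.0 and 2.3.0

      libraryDependencies ++= Seq(
        "org.apache.spark"  %% "spark-sql"             % sparkVersion,
        "org.mongodb.spark" %% "mongo-spark-connector" % "2.2.1"
      )
      {code}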

      Description:

      I am trying to read a huge, complex document from MongoDB into a Spark DataFrame. If I first convert the data to JSON and read that, everything works; reading directly from MongoDB fails with the following error: com.mongodb.spark.exceptions.MongoTypeConversionException: Cannot cast STRING into a DoubleType (value: BsonString{value='NaN'})
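      For context, a minimal program shaped like the report's mongo.scala might look as follows (a sketch: the URI, database, and collection names are placeholders, and the mixed-type field is assumed, not taken from the report):

      {code:scala}
      import org.apache.spark.sql.SparkSession
      import com.mongodb.spark.MongoSpark

      object mongo {
        def main(args: Array[String]): Unit = {
          val spark = SparkSession.builder()
            .master("local[*]")
            .appName("mongo-read")
            // Placeholder URI; the real database/collection were not named in the report.
            .config("spark.mongodb.input.uri", "mongodb://localhost:27017/test.bigCollection")
            .getOrCreate()

          // Schema inference only samples the collection, so this succeeds even
          // if some unsampled documents hold BsonString("NaN") in a double field.
          val df = MongoSpark.load(spark)

          df.show() // the full scan reaches the mismatched documents and aborts
        }
      }
      {code}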

      I am able to read the collection into a DataFrame and run all of my transformations; the job fails only when an action such as show() or a write to JSON/CSV is triggered. (Schema inference succeeds because it only samples the collection, so the mismatched documents are not encountered until an action forces a full scan.)
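      A possible workaround (a sketch, not part of the original report; the field name below is hypothetical, since the report does not say which field holds 'NaN') is to bypass sampling-based inference with an explicit schema that reads the affected field as a string and casts it afterwards, e.g. in spark-shell:

      {code:scala}
      import org.apache.spark.sql.SparkSession
      import org.apache.spark.sql.functions.col
      import org.apache.spark.sql.types._

      val spark = SparkSession.builder()
        .master("local[*]")
        .config("spark.mongodb.input.uri", "mongodb://localhost:27017/test.bigCollection")
        .getOrCreate()

      // Hypothetical schema: "measurement" stands in for the unnamed field that
      // sometimes holds the string "NaN" instead of a double.
      val schema = StructType(Seq(
        StructField("_id", StringType),
        StructField("measurement", StringType) // read as string, cast below
      ))

      val df = spark.read
        .format("com.mongodb.spark.sql.DefaultSource")
        .schema(schema)
        .load()

      // Spark parses the string "NaN" to Double.NaN and unparseable strings to
      // null, so the scan no longer aborts on the mixed-type field.
      val cleaned = df.withColumn("measurement", col("measurement").cast(DoubleType))
      cleaned.show()
      {code}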

      The failure occurs at mongo$.main(mongo.scala:14), which is the df.show() call:
      18/04/03 12:08:24 INFO DAGScheduler: ShuffleMapStage 1 (show at mongo.scala:14) failed in 15.240 s due to Job aborted due to stage failure: Task 109 in stage 1.0 failed 1 times, most recent failure: Lost task 109.0 in stage 1.0 (TID 110, localhost, executor driver): com.mongodb.spark.exceptions.MongoTypeConversionException: Cannot cast STRING into a DoubleType (value: BsonString{value='NaN'})
      	at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:83)
      	at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:39)
      	at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:37)
      	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245)
      	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245)
      	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
      	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
      	at scala.collection.TraversableLike$class.map(TraversableLike.scala:245)
      	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
      	at com.mongodb.spark.sql.MapFunctions$.documentToRow(MapFunctions.scala:37)
      	at com.mongodb.spark.sql.MapFunctions$.castToStructType(MapFunctions.scala:112)
      	at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:77)
      	at com.mongodb.spark.sql.MapFunctions$$anonfun$com$mongodb$spark$sql$MapFunctions$$convertToDataType$2.apply(MapFunctions.scala:67)
      	at com.mongodb.spark.sql.MapFunctions$$anonfun$com$mongodb$spark$sql$MapFunctions$$convertToDataType$2.apply(MapFunctions.scala:67)
      	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245)
      	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245)
      	at scala.collection.Iterator$class.foreach(Iterator.scala:750)
      	at scala.collection.AbstractIterator.foreach(Iterator.scala:1202)
      	at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
      	at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
      	at scala.collection.TraversableLike$class.map(TraversableLike.scala:245)
      	at scala.collection.AbstractTraversable.map(Traversable.scala:104)
      	at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:67)
      	at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:39)
      	at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:37)
      	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245)
      	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245)
      	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
      	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
      	at scala.collection.TraversableLike$class.map(TraversableLike.scala:245)
      	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
      	at com.mongodb.spark.sql.MapFunctions$.documentToRow(MapFunctions.scala:37)
      	at com.mongodb.spark.sql.MapFunctions$.castToStructType(MapFunctions.scala:112)
      	at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:77)
      	at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:39)
      	at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:37)
      	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245)
      	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245)
      	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
      	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
      	at scala.collection.TraversableLike$class.map(TraversableLike.scala:245)
      	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
      	at com.mongodb.spark.sql.MapFunctions$.documentToRow(MapFunctions.scala:37)
      	at com.mongodb.spark.sql.MongoRelation$$anonfun$buildScan$2.apply(MongoRelation.scala:45)
      	at com.mongodb.spark.sql.MongoRelation$$anonfun$buildScan$2.apply(MongoRelation.scala:45)
      	at scala.collection.Iterator$$anon$11.next(Iterator.scala:370)
      	at scala.collection.Iterator$$anon$11.next(Iterator.scala:370)
      	at scala.collection.Iterator$$anon$11.next(Iterator.scala:370)
      	at scala.collection.Iterator$$anon$11.next(Iterator.scala:370)
      	at scala.collection.Iterator$$anon$10.next(Iterator.scala:354)
      	at scala.collection.Iterator$$anon$11.next(Iterator.scala:370)
      	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:149)
      	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
      	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
      	at org.apache.spark.scheduler.Task.run(Task.scala:109)
      	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
      	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
      	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
      	at java.lang.Thread.run(Thread.java:748)
      
      Driver stacktrace:
      18/04/03 12:08:24 INFO DAGScheduler: Job 1 failed: show at mongo.scala:14, took 15.569032 s
      Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 109 in stage 1.0 failed 1 times, most recent failure: Lost task 109.0 in stage 1.0 (TID 110, localhost, executor driver): com.mongodb.spark.exceptions.MongoTypeConversionException: Cannot cast STRING into a DoubleType (value: BsonString{value='NaN'})
      	... (frames identical to the task failure stack trace above)
      
      Driver stacktrace:
      	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
      	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
      	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
      	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
      	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
      	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
      	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
      	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
      	at scala.Option.foreach(Option.scala:257)
      	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
      	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
      	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
      	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
      	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
      	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
      	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
      	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
      	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
      	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:363)
      	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
      	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3272)
      	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484)
      	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484)
      	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3253)
      	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
      	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3252)
      	at org.apache.spark.sql.Dataset.head(Dataset.scala:2484)
      	at org.apache.spark.sql.Dataset.take(Dataset.scala:2698)
      	at org.apache.spark.sql.Dataset.showString(Dataset.scala:254)
      	at org.apache.spark.sql.Dataset.show(Dataset.scala:723)
      	at org.apache.spark.sql.Dataset.show(Dataset.scala:682)
      	at org.apache.spark.sql.Dataset.show(Dataset.scala:691)
      	at mongo$.main(mongo.scala:14)
      	at mongo.main(mongo.scala)
      Caused by: com.mongodb.spark.exceptions.MongoTypeConversionException: Cannot cast STRING into a DoubleType (value: BsonString{value='NaN'})
      	... (frames identical to the task failure stack trace above)
      18/04/03 12:08:24 INFO SparkContext: Invoking stop() from shutdown hook
      

            Assignee:
            Ross Lawley (ross@mongodb.com)
            Reporter:
            Sathish KR (sathishr)
            Votes:
            0
            Watchers:
            2

              Created:
              Updated:
              Resolved: