1 year ago

#373451

Enes

Java Heap Space Error in sparklyr & tidymodels

Hi,

I am trying to model a large dataset (2,000,000+ rows, ~150 columns) using tidymodels with the Spark engine. However, I am receiving the following error: java.lang.OutOfMemoryError: Java heap space

The output below shows 10 rows from the dataset. If I could provide it in a better format for reproducibility, please let me know.

You can find the code, the full error output, and my session info below.

Thank you very much for your help.
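For context, I connect to a local Spark with the default configuration. If the fix is simply to give the driver JVM more heap, I assume it would be set through spark_config() before connecting, along these lines (an untested sketch; "8G" is an arbitrary value I picked, and I am not certain these are the right keys for local mode):

# Untested sketch: in local mode everything runs inside the driver JVM,
# so the driver heap would have to be raised before spark_connect().
conf <- spark_config()
conf[["sparklyr.shell.driver-memory"]] <- "8G"

sc <- spark_connect(master = "local", version = "3.1.2", config = conf)

The full code I actually ran is below.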

rm(list = ls())
library(sparklyr)
library(tidyverse)
library(tidymodels)
setwd("~/Desktop/USERS/ENES/DATA")

sc <- spark_connect(master = "local", version = "3.1.2")

df <- spark_read_csv(sc = sc, name = "df", path = "BandCombination_MainTIFF_Pixels_v02.csv", overwrite = TRUE)

df <- df %>% 
  mutate(Urun = case_when(Urun == "misir tohumluk" ~ "misir",
                          Urun == "misir tohumluk bet" ~ "misir",
                          Urun == "karpuz hasat" ~ "karpuz",
                          Urun == "vegetation" ~ "vejetasyon",
                          Urun == "misir yeni ekilmis" ~ "misir",
                          Urun == "orchards" ~ "orchard",
                          TRUE ~ Urun)) 

train_df <- df %>% 
  filter(layer == "TrainSet") %>% 
  select(Urun, "18_Mart_2019_Blue":"27_Temmuz_2019_NDVI_MOC2")

test_df <- df %>% 
  filter(layer == "TestSet") %>% 
  select(Urun, "18_Mart_2019_Blue":"27_Temmuz_2019_NDVI_MOC2")

> train_df
# Source: spark<?> [?? x 145]
   Urun  `18_Mart_2019_Blue` `18_Mart_2019_Gr…` `18_Mart_2019_…` `18_Mart_2019_…` `18_Mart_2019_…` `18_Mart_2019_…` `18_Mart_2019_…` `18_Mart_2019_…`
   <chr>               <int>              <int>            <int>            <int>            <dbl>            <dbl>            <dbl>            <dbl>
 1 aniz                  330                447              383             4197            0.833             2.96            0.137            0.978
 2 aniz                  339                447              403             4197            0.825             2.75            0.161            0.970
 3 aniz                  339                464              403             4229            0.826             2.26            0.224            0.943
 4 aniz                  349                475              403             4245            0.827             2.26            0.224            0.943
 5 aniz                  373                470              416             4133            0.817             3.08            0.127            0.969
 6 aniz                  349                464              396             4245            0.829             3.13            0.123            0.989
 7 aniz                  339                442              370             4277            0.841             2.92            0.141            0.955
 8 aniz                  349                447              396             4245            0.829             2.67            0.181            0.883
 9 aniz                  339                447              396             4229            0.829             1.72            0.373            0.581
10 aniz                  334                447              396             4245            0.829             2.46            0.205            0.857
# … with more rows, and 136 more variables: `2526_Mart_2019_Blue` <int>, `2526_Mart_2019_Green` <int>, `2526_Mart_2019_Red` <int>,
#   `2526_Mart_2019_NIR` <int>, `2526_Mart_2019_NDVI` <dbl>, `2526_Mart_2019_NDVI_Entropy` <dbl>, `2526_Mart_2019_NDVI_ASM` <dbl>,
#   `2526_Mart_2019_NDVI_MOC2` <dbl>, `5_Nisan_2019_Blue` <int>, `5_Nisan_2019_Green` <int>, `5_Nisan_2019_Red` <int>, `5_Nisan_2019_NIR` <int>,
#   `5_Nisan_2019_NDVI` <dbl>, `5_Nisan_2019_NDVI_Entropy` <dbl>, `5_Nisan_2019_NDVI_ASM` <dbl>, `5_Nisan_2019_NDVI_MOC2` <dbl>,
#   `22_Nisan_2019_Blue` <int>, `22_Nisan_2019_Green` <int>, `22_Nisan_2019_Red` <int>, `22_Nisan_2019_NIR` <int>, `22_Nisan_2019_NDVI` <dbl>,
#   `22_Nisan_2019_NDVI_Entropy` <dbl>, `22_Nisan_2019_NDVI_ASM` <dbl>, `22_Nisan_2019_NDVI_MOC2` <dbl>, `2930_Nisan_2019_Blue` <int>,
#   `2930_Nisan_2019_Green` <int>, `2930_Nisan_2019_Red` <int>, `2930_Nisan_2019_NIR` <int>, `2930_Nisan_2019_NDVI` <dbl>, …
rf_fit <- rand_forest(trees = 500) %>% 
  set_mode("classification") %>% 
  set_engine("spark") %>% 
  fit(Urun ~., data = train_df)

This is the error I received:

Error: org.apache.spark.SparkException: Job aborted due to stage failure: Task 5 in stage 18.0 failed 1 times, most recent failure: Lost task 5.0 in stage 18.0 (TID 375) (localhost executor driver): java.lang.OutOfMemoryError: Java heap space
    at org.apache.spark.ml.tree.impl.DTStatsAggregator.<init>(DTStatsAggregator.scala:77)
    at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$22(RandomForest.scala:651)
    at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$22$adapted(RandomForest.scala:647)
    at org.apache.spark.ml.tree.impl.RandomForest$$$Lambda$4683/0x0000000840f77840.apply(Unknown Source)
    at scala.Array$.tabulate(Array.scala:334)
    at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$21(RandomForest.scala:647)
    at org.apache.spark.ml.tree.impl.RandomForest$$$Lambda$4664/0x0000000840f2e040.apply(Unknown Source)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
    at org.apache.spark.rdd.RDD$$Lambda$2697/0x0000000841706440.apply(Unknown Source)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
    at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
    at org.apache.spark.scheduler.Task.run(Task.scala:131)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
    at org.apache.spark.executor.Executor$TaskRunner$$Lambda$2504/0x000000084161a040.apply(Unknown Source)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
    at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
    at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
    at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
    at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
    at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
    at scala.Option.foreach(Option.scala:407)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2261)
    at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
    at org.apache.spark.rdd.PairRDDFunctions.$anonfun$collectAsMap$1(PairRDDFunctions.scala:737)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
    at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:736)
    at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:663)
    at org.apache.spark.ml.tree.impl.RandomForest$.runBagged(RandomForest.scala:208)
    at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:302)
    at org.apache.spark.ml.classification.RandomForestClassifier.$anonfun$train$1(RandomForestClassifier.scala:161)
    at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
    at scala.util.Try$.apply(Try.scala:213)
    at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
    at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:138)
    at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:46)
    at org.apache.spark.ml.Predictor.fit(Predictor.scala:151)
    at org.apache.spark.ml.Predictor.fit(Predictor.scala:115)
    at org.apache.spark.ml.Pipeline.$anonfun$fit$5(Pipeline.scala:151)
    at org.apache.spark.ml.MLEvents.withFitEvent(events.scala:130)
    at org.apache.spark.ml.MLEvents.withFitEvent$(events.scala:123)
    at org.apache.spark.ml.util.Instrumentation.withFitEvent(Instrumentation.scala:42)
    at org.apache.spark.ml.Pipeline.$anonfun$fit$4(Pipeline.scala:151)
    at scala.collection.Iterator.foreach(Iterator.scala:941)
    at scala.collection.Iterator.foreach$(Iterator.scala:941)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
    at org.apache.spark.ml.Pipeline.$anonfun$fit$2(Pipeline.scala:147)
    at org.apache.spark.ml.MLEvents.withFitEvent(events.scala:130)
    at org.apache.spark.ml.MLEvents.withFitEvent$(events.scala:123)
    at org.apache.spark.ml.util.Instrumentation.withFitEvent(Instrumentation.scala:42)
    at org.apache.spark.ml.Pipeline.$anonfun$fit$1(Pipeline.scala:133)
    at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
    at scala.util.Try$.apply(Try.scala:213)
    at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
    at org.apache.spark.ml.Pipeline.fit(Pipeline.scala:133)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:566)
    at sparklyr.Invoke.invoke(invoke.scala:161)
    at sparklyr.StreamHandler.handleMethodCall(stream.scala:141)
    at sparklyr.StreamHandler.read(stream.scala:62)
    at sparklyr.BackendHandler.$anonfun$channelRead0$1(handler.scala:60)
    at scala.util.control.Breaks.breakable(Breaks.scala:42)
    at sparklyr.BackendHandler.channelRead0(handler.scala:41)
    at sparklyr.BackendHandler.channelRead0(handler.scala:14)
    at io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:99)
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
    at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
    at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:103)
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandle
Error: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
This stopped SparkContext was created at:

org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:939)
java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
java.base/java.lang.reflect.Method.invoke(Method.java:566)
sparklyr.Invoke.invoke(invoke.scala:161)
sparklyr.StreamHandler.handleMethodCall(stream.scala:141)
sparklyr.StreamHandler.read(stream.scala:62)
sparklyr.BackendHandler.$anonfun$channelRead0$1(handler.scala:60)
scala.util.control.Breaks.breakable(Breaks.scala:42)
sparklyr.BackendHandler.channelRead0(handler.scala:41)
sparklyr.BackendHandler.channelRead0(handler.scala:14)
io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:99)
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:103)
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)

The currently active SparkContext was created at:

org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:939)
java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
java.base/java.lang.reflect.Method.invoke(Method.java:566)
sparklyr.Invoke.invoke(invoke.scala:161)
sparklyr.StreamHandler.handleMethodCall(stream.scala:141)
sparklyr.StreamHandler.read(stream.scala:62)
sparklyr.BackendHandler.$anonfun$channelRead0$1(handler.scala:60)
scala.util.control.Breaks.breakable(Breaks.scala:42)
sparklyr.BackendHandler.channelRead0(handler.scala:41)
sparklyr.BackendHandler.channelRead0(handler.scala:14)
io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:99)
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:103)
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
         
    at org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:118)
    at org.apache.spark.SparkContext.cancelAllJobs(SparkContext.scala:2381)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:566)
    at sparklyr.Invoke.invoke(invoke.scala:161)
    at sparklyr.StreamHandler.handleMethodCall(stream.scala:141)
    at sparklyr.StreamHandler.read(stream.scala:62)
    at sparklyr.BackendHandler.$anonfun$channelRead0$1(handler.scala:60)
    at scala.util.control.Breaks.breakable(Breaks.scala:42)
    at sparklyr.BackendHandler.channelRead0(handler.scala:41)
    at sparklyr.BackendHandler.channelRead0(handler.scala:14)
    at io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:99)
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
    at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
    at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:103)
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
    at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
    at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:324)
    at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:296)
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
    at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
    at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410)
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
    at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919)
    at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:163)
    at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:714)
    at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:650)
    at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:576)
    at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493)
    at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989)
    at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
    at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
    at java.base/java.lang.Thread.run(Thread.java:829)
Timing stopped at: 1.604 0.338 134.9
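
From the stack trace, the failure starts in DTStatsAggregator inside RandomForest.findBestSplits, so I also wondered whether the per-node statistics aggregation limit (Spark ML's maxMemoryInMB, 256 MB by default) plays a role. If parsnip forwards engine arguments to sparklyr's ml_random_forest(), I assume it could be raised roughly like this (untested sketch; 1024 is a guess, and I have not confirmed that the spark engine accepts this argument):

# Untested sketch: pass a larger maxMemoryInMB through the engine, assuming
# set_engine() forwards extra arguments to sparklyr::ml_random_forest().
rf_fit <- rand_forest(trees = 500) %>% 
  set_mode("classification") %>% 
  set_engine("spark", max_memory_in_mb = 1024) %>% 
  fit(Urun ~ ., data = train_df)

My session info: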
> devtools::session_info()
─ Session info ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 setting  value                       
 version  R version 3.6.3 (2020-02-29)
 os       Ubuntu 18.04.6 LTS          
 system   x86_64, linux-gnu           
 ui       RStudio                     
 language (EN)                        
 collate  en_US.UTF-8                 
 ctype    en_US.UTF-8                 
 tz       Europe            
 date     2022-04-04                  

Tags: r, apache-spark, sparkr, sparklyr, tidymodels
