1 year ago
#373451
Enes
Java Heap Space Error in SparklyR & Tidymodels
Hi,
I am trying to model a large dataset (2,000,000+ rows, ~150 columns) using tidymodels with the Spark engine (via sparklyr). However, I am receiving the following error: java.lang.OutOfMemoryError: Java heap space
The output below shows the first 10 rows of the dataset. If I can provide the data in a better format for reproducibility, please let me know.
You can find the full output, the code, and session info below.
Thank you very much for your help.
# Load the Spark bindings and the tidy/modelling toolkits used below.
library(sparklyr)
library(tidyverse)
library(tidymodels)

# `rm(list = ls())` and `setwd()` were dropped on purpose: both silently change
# the state of whoever sources this script. The input file is addressed through
# an explicit path instead.
data_dir <- path.expand("~/Desktop/USERS/ENES/DATA")
csv_path <- file.path(data_dir, "BandCombination_MainTIFF_Pixels_v02.csv")

# With master = "local" all tasks run inside the driver JVM (the failing task
# is reported on "localhost executor driver"), so the driver heap is the only
# memory Spark can use. The default heap is too small for a 500-tree forest on
# ~2M rows x ~145 columns and ends in `java.lang.OutOfMemoryError: Java heap
# space`. Raise it explicitly; tune "8G" to what the machine can spare.
# NOTE: this is a JVM launch option, so it only takes effect on a fresh
# connection -- call spark_disconnect(sc) first if a session is already open.
spark_conf <- spark_config()
spark_conf$`sparklyr.shell.driver-memory` <- "8G"

sc <- spark_connect(master = "local", version = "3.1.2", config = spark_conf)

# Register the CSV as Spark table "df" (replacing any existing table with that
# name) and keep a lazy reference to it.
df <- spark_read_csv(
  sc = sc,
  name = "df",
  path = csv_path,
  overwrite = TRUE
)

# Collapse crop-label variants into their canonical class; every label not
# listed here is kept unchanged.
df <- df %>%
  mutate(
    Urun = case_when(
      Urun %in% c(
        "misir tohumluk",
        "misir tohumluk bet",
        "misir yeni ekilmis"
      ) ~ "misir",
      Urun == "karpuz hasat" ~ "karpuz",
      Urun == "vegetation" ~ "vejetasyon",
      Urun == "orchards" ~ "orchard",
      TRUE ~ Urun
    )
  )

# Split on the `layer` column and keep the label plus the contiguous run of
# band/index feature columns.
train_df <- df %>%
  filter(layer == "TrainSet") %>%
  select(Urun, "18_Mart_2019_Blue":"27_Temmuz_2019_NDVI_MOC2")

test_df <- df %>%
  filter(layer == "TestSet") %>%
  select(Urun, "18_Mart_2019_Blue":"27_Temmuz_2019_NDVI_MOC2")
> train_df
# Source: spark<?> [?? x 145]
Urun `18_Mart_2019_Blue` `18_Mart_2019_Gr…` `18_Mart_2019_…` `18_Mart_2019_…` `18_Mart_2019_…` `18_Mart_2019_…` `18_Mart_2019_…` `18_Mart_2019_…`
<chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 aniz 330 447 383 4197 0.833 2.96 0.137 0.978
2 aniz 339 447 403 4197 0.825 2.75 0.161 0.970
3 aniz 339 464 403 4229 0.826 2.26 0.224 0.943
4 aniz 349 475 403 4245 0.827 2.26 0.224 0.943
5 aniz 373 470 416 4133 0.817 3.08 0.127 0.969
6 aniz 349 464 396 4245 0.829 3.13 0.123 0.989
7 aniz 339 442 370 4277 0.841 2.92 0.141 0.955
8 aniz 349 447 396 4245 0.829 2.67 0.181 0.883
9 aniz 339 447 396 4229 0.829 1.72 0.373 0.581
10 aniz 334 447 396 4245 0.829 2.46 0.205 0.857
# … with more rows, and 136 more variables: `2526_Mart_2019_Blue` <int>, `2526_Mart_2019_Green` <int>, `2526_Mart_2019_Red` <int>,
# `2526_Mart_2019_NIR` <int>, `2526_Mart_2019_NDVI` <dbl>, `2526_Mart_2019_NDVI_Entropy` <dbl>, `2526_Mart_2019_NDVI_ASM` <dbl>,
# `2526_Mart_2019_NDVI_MOC2` <dbl>, `5_Nisan_2019_Blue` <int>, `5_Nisan_2019_Green` <int>, `5_Nisan_2019_Red` <int>, `5_Nisan_2019_NIR` <int>,
# `5_Nisan_2019_NDVI` <dbl>, `5_Nisan_2019_NDVI_Entropy` <dbl>, `5_Nisan_2019_NDVI_ASM` <dbl>, `5_Nisan_2019_NDVI_MOC2` <dbl>,
# `22_Nisan_2019_Blue` <int>, `22_Nisan_2019_Green` <int>, `22_Nisan_2019_Red` <int>, `22_Nisan_2019_NIR` <int>, `22_Nisan_2019_NDVI` <dbl>,
# `22_Nisan_2019_NDVI_Entropy` <dbl>, `22_Nisan_2019_NDVI_ASM` <dbl>, `22_Nisan_2019_NDVI_MOC2` <dbl>, `2930_Nisan_2019_Blue` <int>,
# `2930_Nisan_2019_Green` <int>, `2930_Nisan_2019_Red` <int>, `2930_Nisan_2019_NIR` <int>, `2930_Nisan_2019_NDVI` <dbl>, …
>
# Random-forest classifier (500 trees) trained by Spark MLlib through the
# parsnip "spark" engine; `Urun` is the label, all other columns of `train_df`
# are predictors.
rf_fit <- rand_forest(mode = "classification", trees = 500) %>%
  set_engine(engine = "spark") %>%
  fit(formula = Urun ~ ., data = train_df)
This is the error I received:
Error: org.apache.spark.SparkException: Job aborted due to stage failure: Task 5 in stage 18.0 failed 1 times, most recent failure: Lost task 5.0 in stage 18.0 (TID 375) (localhost executor driver): java.lang.OutOfMemoryError: Java heap space
at org.apache.spark.ml.tree.impl.DTStatsAggregator.<init>(DTStatsAggregator.scala:77)
at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$22(RandomForest.scala:651)
at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$22$adapted(RandomForest.scala:647)
at org.apache.spark.ml.tree.impl.RandomForest$$$Lambda$4683/0x0000000840f77840.apply(Unknown Source)
at scala.Array$.tabulate(Array.scala:334)
at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$21(RandomForest.scala:647)
at org.apache.spark.ml.tree.impl.RandomForest$$$Lambda$4664/0x0000000840f2e040.apply(Unknown Source)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
at org.apache.spark.rdd.RDD$$Lambda$2697/0x0000000841706440.apply(Unknown Source)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.executor.Executor$TaskRunner$$Lambda$2504/0x000000084161a040.apply(Unknown Source)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:829)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2261)
at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
at org.apache.spark.rdd.PairRDDFunctions.$anonfun$collectAsMap$1(PairRDDFunctions.scala:737)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:736)
at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:663)
at org.apache.spark.ml.tree.impl.RandomForest$.runBagged(RandomForest.scala:208)
at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:302)
at org.apache.spark.ml.classification.RandomForestClassifier.$anonfun$train$1(RandomForestClassifier.scala:161)
at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
at scala.util.Try$.apply(Try.scala:213)
at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:138)
at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:46)
at org.apache.spark.ml.Predictor.fit(Predictor.scala:151)
at org.apache.spark.ml.Predictor.fit(Predictor.scala:115)
at org.apache.spark.ml.Pipeline.$anonfun$fit$5(Pipeline.scala:151)
at org.apache.spark.ml.MLEvents.withFitEvent(events.scala:130)
at org.apache.spark.ml.MLEvents.withFitEvent$(events.scala:123)
at org.apache.spark.ml.util.Instrumentation.withFitEvent(Instrumentation.scala:42)
at org.apache.spark.ml.Pipeline.$anonfun$fit$4(Pipeline.scala:151)
at scala.collection.Iterator.foreach(Iterator.scala:941)
at scala.collection.Iterator.foreach$(Iterator.scala:941)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
at org.apache.spark.ml.Pipeline.$anonfun$fit$2(Pipeline.scala:147)
at org.apache.spark.ml.MLEvents.withFitEvent(events.scala:130)
at org.apache.spark.ml.MLEvents.withFitEvent$(events.scala:123)
at org.apache.spark.ml.util.Instrumentation.withFitEvent(Instrumentation.scala:42)
at org.apache.spark.ml.Pipeline.$anonfun$fit$1(Pipeline.scala:133)
at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
at scala.util.Try$.apply(Try.scala:213)
at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
at org.apache.spark.ml.Pipeline.fit(Pipeline.scala:133)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at sparklyr.Invoke.invoke(invoke.scala:161)
at sparklyr.StreamHandler.handleMethodCall(stream.scala:141)
at sparklyr.StreamHandler.read(stream.scala:62)
at sparklyr.BackendHandler.$anonfun$channelRead0$1(handler.scala:60)
at scala.util.control.Breaks.breakable(Breaks.scala:42)
at sparklyr.BackendHandler.channelRead0(handler.scala:41)
at sparklyr.BackendHandler.channelRead0(handler.scala:14)
at io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:99)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:103)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandle
Error: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
This stopped SparkContext was created at:
org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:939)
java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
java.base/java.lang.reflect.Method.invoke(Method.java:566)
sparklyr.Invoke.invoke(invoke.scala:161)
sparklyr.StreamHandler.handleMethodCall(stream.scala:141)
sparklyr.StreamHandler.read(stream.scala:62)
sparklyr.BackendHandler.$anonfun$channelRead0$1(handler.scala:60)
scala.util.control.Breaks.breakable(Breaks.scala:42)
sparklyr.BackendHandler.channelRead0(handler.scala:41)
sparklyr.BackendHandler.channelRead0(handler.scala:14)
io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:99)
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:103)
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
The currently active SparkContext was created at:
org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:939)
java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
java.base/java.lang.reflect.Method.invoke(Method.java:566)
sparklyr.Invoke.invoke(invoke.scala:161)
sparklyr.StreamHandler.handleMethodCall(stream.scala:141)
sparklyr.StreamHandler.read(stream.scala:62)
sparklyr.BackendHandler.$anonfun$channelRead0$1(handler.scala:60)
scala.util.control.Breaks.breakable(Breaks.scala:42)
sparklyr.BackendHandler.channelRead0(handler.scala:41)
sparklyr.BackendHandler.channelRead0(handler.scala:14)
io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:99)
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:103)
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:118)
at org.apache.spark.SparkContext.cancelAllJobs(SparkContext.scala:2381)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at sparklyr.Invoke.invoke(invoke.scala:161)
at sparklyr.StreamHandler.handleMethodCall(stream.scala:141)
at sparklyr.StreamHandler.read(stream.scala:62)
at sparklyr.BackendHandler.$anonfun$channelRead0$1(handler.scala:60)
at scala.util.control.Breaks.breakable(Breaks.scala:42)
at sparklyr.BackendHandler.channelRead0(handler.scala:41)
at sparklyr.BackendHandler.channelRead0(handler.scala:14)
at io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:99)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:103)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:324)
at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:296)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919)
at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:163)
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:714)
at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:650)
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:576)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493)
at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989)
at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
at java.base/java.lang.Thread.run(Thread.java:829)
Timing stopped at: 1.604 0.338 134.9
> devtools::session_info()
─ Session info ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
setting value
version R version 3.6.3 (2020-02-29)
os Ubuntu 18.04.6 LTS
system x86_64, linux-gnu
ui RStudio
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz Europe
date 2022-04-04
r
apache-spark
sparkr
sparklyr
tidymodels
0 Answers
Your Answer