1 year ago
#360092
Raphaël
Py4JJavaError: An error occurred while calling o51.transform
I am currently in distress. I am trying to run a spark code to classify pictures with a CNN and for this use the spark-deep-learning packages from Databricks. I followed their tutorial page and managed to upload the pictures, make the train and test dataframes, import the sparkdl package (path directory to the jar in my .bashrc). Basically everything is in my jupyter notebook as the code below:
import findspark
import sparkdl
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from sparkdl import DeepImageFeaturizer
findspark.init('/home/raphael/projet8/spark-2.4.8-bin-hadoop2.7')
sc = SparkContext()
spark = SparkSession(sc)
img_dir = "/home/raphael/projet8/archive/fruits-360_dataset/fruits-360/Training/"
strawberry_df = spark.read.format("image").load(img_dir+"/Strawberry").withColumn("label", lit(1))
banana_df = spark.read.format("image").load(img_dir + "/Banana").withColumn("label", lit(0))
strawberry_train, strawberry_test = strawberry_df.randomSplit([.8, .2], seed=0)
banana_train, banana_test = banana_df.randomSplit([.8, .2], seed=0)
train_df = strawberry_train.unionAll(banana_train).cache()
test_df = strawberry_test.unionAll(banana_test).cache()
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
p = Pipeline(stages=[featurizer, lr])
p_model = p.fit(train_df)
Each step seems to run correctly but it crashes at the last one with a long succession of errors:
Py4JJavaError Traceback (most recent call last)
<ipython-input-34-5ce1a14f5a67> in <module>()
----> 1 p_model = p.fit(train_df)
/home/raphael/projet8/spark-2.4.8-bin-hadoop2.7/python/pyspark/ml/base.py in fit(self, dataset, params)
130 return self.copy(params)._fit(dataset)
131 else:
--> 132 return self._fit(dataset)
133 else:
134 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
/home/raphael/projet8/spark-2.4.8-bin-hadoop2.7/python/pyspark/ml/pipeline.py in _fit(self, dataset)
105 if isinstance(stage, Transformer):
106 transformers.append(stage)
--> 107 dataset = stage.transform(dataset)
108 else: # must be an Estimator
109 model = stage.fit(dataset)
/home/raphael/projet8/spark-2.4.8-bin-hadoop2.7/python/pyspark/ml/base.py in transform(self, dataset, params)
171 return self.copy(params)._transform(dataset)
172 else:
--> 173 return self._transform(dataset)
174 else:
175 raise ValueError("Params must be a param map but got %s." % type(params))
/home/raphael/projet8/spark-2.4.8-bin-hadoop2.7/python/pyspark/ml/wrapper.py in _transform(self, dataset)
310 def _transform(self, dataset):
311 self._transfer_params_to_java()
--> 312 return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx)
313
314
/home/raphael/projet8/spark-2.4.8-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/home/raphael/projet8/spark-2.4.8-bin-hadoop2.7/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/home/raphael/projet8/spark-2.4.8-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o51.transform.
: java.lang.NoClassDefFoundError: org/tensorframes/ShapeDescription
at com.databricks.sparkdl.DeepImageFeaturizer.transform(DeepImageFeaturizer.scala:124)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
All my researches (and there have been a bunch) turned up nothing. I'm stuck and don't see the trick. I humbly ask for your help. How can I solve this ?
My configuration: Ubuntu 18.04.6, Python 3.6.9, Spark 2.4.8 (Hadoop 2.7), Spark-deep-learning-1.3.0 (spark 2.4 and scala 2.11), Hadoop 2.7.7, Scala 2.11.12, Tensorflow 1.14.0, Keras 2.3.1, Tensorframes 0.2.9, Pandas 1.1.5, Py4j 0.10.7, Java-8-openjdk-amd64.
python
java
apache-spark
pyspark
py4j
0 Answers
Your Answer