1 year ago

#360092

test-img

Raphaël

Py4JJavaError: An error occurred while calling o51.transform

I am currently in distress. I am trying to run Spark code to classify pictures with a CNN, using the spark-deep-learning package from Databricks. I followed their tutorial page and managed to upload the pictures, build the train and test dataframes, and import the sparkdl package (the path to the jar is in my .bashrc). Everything is in my Jupyter notebook, as in the code below:

# Classify fruit images with transfer learning: featurize each image with a
# pre-trained InceptionV3 network (spark-deep-learning's DeepImageFeaturizer)
# and fit a logistic-regression classifier on the resulting feature vectors.
#
# IMPORTANT: findspark.init() must run BEFORE any pyspark/sparkdl import.
# Its whole purpose is to add the Spark distribution to sys.path; calling it
# after the imports (as the original code did) has no effect on how pyspark
# was resolved and can leave the session pointed at the wrong installation.
import findspark

findspark.init('/home/raphael/projet8/spark-2.4.8-bin-hadoop2.7')

import sparkdl

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

from sparkdl import DeepImageFeaturizer

# NOTE(review): the reported NoClassDefFoundError for
# org/tensorframes/ShapeDescription means the tensorframes jar is not on the
# JVM classpath at runtime. Having it in .bashrc is not enough — the jar must
# be passed to Spark itself, e.g. via spark.jars / spark.jars.packages
# (--packages databricks:tensorframes:0.2.9-s_2.11) when the SparkContext is
# created. Verify the jar actually reaches the driver/executor classpath.
sc = SparkContext()
spark = SparkSession(sc)

# Root of the fruits-360 training set; one sub-directory per fruit class.
img_dir = "/home/raphael/projet8/archive/fruits-360_dataset/fruits-360/Training/"

# Load the two classes as image DataFrames and attach binary labels:
# strawberry = 1, banana = 0.
strawberry_df = spark.read.format("image").load(img_dir + "/Strawberry").withColumn("label", lit(1))
banana_df = spark.read.format("image").load(img_dir + "/Banana").withColumn("label", lit(0))

# 80/20 train/test split per class, seeded for reproducibility.
strawberry_train, strawberry_test = strawberry_df.randomSplit([.8, .2], seed=0)
banana_train, banana_test = banana_df.randomSplit([.8, .2], seed=0)

# Recombine the per-class splits; cache because the featurizer will make
# multiple passes over the data.
train_df = strawberry_train.unionAll(banana_train).cache()
test_df = strawberry_test.unionAll(banana_test).cache()

# Pipeline: InceptionV3 feature extraction -> elastic-net logistic regression.
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")

p = Pipeline(stages=[featurizer, lr])

p_model = p.fit(train_df)

Each step seems to run correctly but it crashes at the last one with a long succession of errors:

Py4JJavaError                             Traceback (most recent call last)
<ipython-input-34-5ce1a14f5a67> in <module>()
----> 1 p_model = p.fit(train_df)

/home/raphael/projet8/spark-2.4.8-bin-hadoop2.7/python/pyspark/ml/base.py in fit(self, dataset, params)
    130                 return self.copy(params)._fit(dataset)
    131             else:
--> 132                 return self._fit(dataset)
    133         else:
    134             raise ValueError("Params must be either a param map or a list/tuple of param maps, "

/home/raphael/projet8/spark-2.4.8-bin-hadoop2.7/python/pyspark/ml/pipeline.py in _fit(self, dataset)
    105                 if isinstance(stage, Transformer):
    106                     transformers.append(stage)
--> 107                     dataset = stage.transform(dataset)
    108                 else:  # must be an Estimator
    109                     model = stage.fit(dataset)

/home/raphael/projet8/spark-2.4.8-bin-hadoop2.7/python/pyspark/ml/base.py in transform(self, dataset, params)
    171                 return self.copy(params)._transform(dataset)
    172             else:
--> 173                 return self._transform(dataset)
    174         else:
    175             raise ValueError("Params must be a param map but got %s." % type(params))

/home/raphael/projet8/spark-2.4.8-bin-hadoop2.7/python/pyspark/ml/wrapper.py in _transform(self, dataset)
    310     def _transform(self, dataset):
    311         self._transfer_params_to_java()
--> 312         return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx)
    313 
    314 

/home/raphael/projet8/spark-2.4.8-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1255         answer = self.gateway_client.send_command(command)
   1256         return_value = get_return_value(
-> 1257             answer, self.gateway_client, self.target_id, self.name)
   1258 
   1259         for temp_arg in temp_args:

/home/raphael/projet8/spark-2.4.8-bin-hadoop2.7/python/pyspark/sql/utils.py in deco(*a, **kw)
     61     def deco(*a, **kw):
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:
     65             s = e.java_exception.toString()

/home/raphael/projet8/spark-2.4.8-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
--> 328                     format(target_id, ".", name), value)
    329             else:
    330                 raise Py4JError(

Py4JJavaError: An error occurred while calling o51.transform.
: java.lang.NoClassDefFoundError: org/tensorframes/ShapeDescription
at com.databricks.sparkdl.DeepImageFeaturizer.transform(DeepImageFeaturizer.scala:124)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:238)
    at java.lang.Thread.run(Thread.java:748)

All my research (and there has been a lot of it) has turned up nothing. I'm stuck and can't see the trick. I humbly ask for your help — how can I solve this?

My configuration: Ubuntu 18.04.6, Python 3.6.9, Spark 2.4.8 (Hadoop 2.7), Spark-deep-learning-1.3.0 (spark 2.4 and scala 2.11), Hadoop 2.7.7, Scala 2.11.12, Tensorflow 1.14.0, Keras 2.3.1, Tensorframes 0.2.9, Pandas 1.1.5, Py4j 0.10.7, Java-8-openjdk-amd64.

python

java

apache-spark

pyspark

py4j

0 Answers

Your Answer

Accepted video resources