Error in simple Spark application
I'm running a simple Spark application, 'word vector'. Here is the code (it comes from the Spark website):
import org.apache.spark._
import org.apache.spark.rdd._
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}

object SimpleApp {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Word2Vector")
    val sc = new SparkContext(conf)
    val input = sc.textFile("text8").map(line => line.split(" ").toSeq)
    val word2vec = new Word2Vec()
    val model = word2vec.fit(input)
    val synonyms = model.findSynonyms("china", 40)
    for ((synonym, cosineSimilarity) <- synonyms) {
      println(s"$synonym $cosineSimilarity")
    }
    // save and load model
    model.save(sc, "myModelPath")
  }
}
When I run it, it gives me the following error message:
Exception in thread "main" org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://gxydevvm:8020/user/hadoop/YOUR_SPARK_HOME/README.md
    at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:285)
    at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:228)
    at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:313)
    at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:207)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
    at scala.Option.getOrElse(Option.scala:120)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
    at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:32)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
    at scala.Option.getOrElse(Option.scala:120)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
    at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:32)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
    at scala.Option.getOrElse(Option.scala:120)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1781)
    at org.apache.spark.rdd.RDD.count(RDD.scala:1099)
    at org.apache.spark.api.java.JavaRDDLike$class.count(JavaRDDLike.scala:442)
    at org.apache.spark.api.java.AbstractJavaRDDLike.count(JavaRDDLike.scala:47)
    at SimpleApp.main(SimpleApp.java:13)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:606)
    at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:665)
    at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:170)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:193)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:112)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
What is the problem? And where is this address /user/hadoop/YOUR_SPARK_HOME/README.md coming from?
This is related to your default Spark configuration. Take a look (or use grep) in the conf directory of your Spark home directory. There you should find a spark-env.sh file, which may contain a reference to that strange file. In fact, Spark is trying to load the file from HDFS (which is kind of standard if you run Spark on a cluster: input and output should be reachable by the master and the worker slaves). If you use Spark locally, you have to configure the Spark context using the setMaster method. Here is my version (a smaller local-only sketch follows after it):
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}

object SparkDemo {

  def log[A](key: String)(job: => A) = {
    val start = System.currentTimeMillis
    val output = job
    println("===> %s in %s seconds".format(key, (System.currentTimeMillis - start) / 1000.0))
    output
  }

  def main(args: Array[String]): Unit = {

    val modelName = "w2vModel"

    val sc = new SparkContext(
      new SparkConf()
        .setAppName("SparkDemo")
        .set("spark.executor.memory", "4g")
        .set("spark.driver.maxResultSize", "16g")
        .setMaster("spark://192.168.1.53:7077") // IP of the Spark master
        // .setMaster("local[2]") // does not work... workers lose contact with the master after 120s
    )

    // take it from the target folder if you are unsure how the jar is named
    // one-liner to compile / run: sbt package && sbt run
    sc.addJar("./target/scala-2.10/sparkling_2.10-0.1.jar")

    val input = sc.textFile("./text8").map(line => line.split(" ").toSeq)

    val word2vec = new Word2Vec()

    val model = log("compute model") {
      word2vec.fit(input)
    }

    log("save model") {
      model.save(sc, modelName)
    }

    val synonyms = model.findSynonyms("china", 40)
    for ((synonym, cosineSimilarity) <- synonyms) {
      println(s"$synonym $cosineSimilarity")
    }

    val model2 = log("reload model") {
      Word2VecModel.load(sc, modelName)
    }
  }
}
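If you only want to run the example locally, without a standalone master or HDFS, the following minimal sketch shows the same idea with a local[*] master. The input path /tmp/text8 and the model name w2vLocalModel are made up for illustration; the explicit file:// scheme keeps Spark from resolving the path against HDFS when a Hadoop configuration is on the classpath.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}

object LocalWord2VecDemo {
  def main(args: Array[String]): Unit = {
    // local[*] runs Spark inside this JVM using all local cores, so no cluster is needed
    val conf = new SparkConf()
      .setAppName("LocalWord2VecDemo")
      .setMaster("local[*]")
    val sc = new SparkContext(conf)

    // hypothetical local path; the file:// URI forces the local filesystem instead of fs.defaultFS
    val input = sc.textFile("file:///tmp/text8").map(_.split(" ").toSeq)

    val model = new Word2Vec().fit(input)
    model.save(sc, "w2vLocalModel")

    // reload the model and query it, just to check the save/load round trip
    val reloaded = Word2VecModel.load(sc, "w2vLocalModel")
    reloaded.findSynonyms("china", 10).foreach { case (word, similarity) =>
      println(s"$word $similarity")
    }

    sc.stop()
  }
}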