Error in simple spark application -


I'm running a simple Spark application, 'word vector'. Here is the code (it is taken from the Spark website):

import org.apache.spark._
import org.apache.spark.rdd._
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}

/** Trains a Word2Vec model on the `text8` corpus, prints the 40 nearest
  * neighbours of "china", and saves the trained model.
  *
  * No master is set on the SparkConf here, so it has to be supplied from
  * outside (e.g. by spark-submit). The relative input path "text8" is
  * resolved by Spark/Hadoop against the configured default file system.
  */
object SimpleApp {

  /** Entry point.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("word2vector")
    val sc = new SparkContext(conf)

    // Each input line becomes one sequence of whitespace-separated tokens.
    val input = sc.textFile("text8").map(line => line.split(" ").toSeq)

    val word2vec = new Word2Vec()
    val model = word2vec.fit(input)

    val synonyms = model.findSynonyms("china", 40)
    for ((synonym, cosineSimilarity) <- synonyms) {
      println(s"$synonym $cosineSimilarity")
    }

    // Save the model; it can be reloaded later with Word2VecModel.load(sc, path).
    model.save(sc, "mymodelpath")
  }
}

When I run it, it gives me the following error message:

 exception in thread "main" org.apache.hadoop.mapred.invalidinputexception: input path not exist: hdfs://gxydevvm:8020/user/hadoop/your_spark_home/readme.md     @ org.apache.hadoop.mapred.fileinputformat.singlethreadedliststatus(fileinputformat.java:285)     @ org.apache.hadoop.mapred.fileinputformat.liststatus(fileinputformat.java:228)     @ org.apache.hadoop.mapred.fileinputformat.getsplits(fileinputformat.java:313)     @ org.apache.spark.rdd.hadooprdd.getpartitions(hadooprdd.scala:207)     @ org.apache.spark.rdd.rdd$$anonfun$partitions$2.apply(rdd.scala:219)     @ org.apache.spark.rdd.rdd$$anonfun$partitions$2.apply(rdd.scala:217)     @ scala.option.getorelse(option.scala:120)     @ org.apache.spark.rdd.rdd.partitions(rdd.scala:217)     @ org.apache.spark.rdd.mappartitionsrdd.getpartitions(mappartitionsrdd.scala:32)     @ org.apache.spark.rdd.rdd$$anonfun$partitions$2.apply(rdd.scala:219)     @ org.apache.spark.rdd.rdd$$anonfun$partitions$2.apply(rdd.scala:217)     @ scala.option.getorelse(option.scala:120)     @ org.apache.spark.rdd.rdd.partitions(rdd.scala:217)     @ org.apache.spark.rdd.mappartitionsrdd.getpartitions(mappartitionsrdd.scala:32)     @ org.apache.spark.rdd.rdd$$anonfun$partitions$2.apply(rdd.scala:219)     @ org.apache.spark.rdd.rdd$$anonfun$partitions$2.apply(rdd.scala:217)     @ scala.option.getorelse(option.scala:120)     @ org.apache.spark.rdd.rdd.partitions(rdd.scala:217)     @ org.apache.spark.sparkcontext.runjob(sparkcontext.scala:1781)     @ org.apache.spark.rdd.rdd.count(rdd.scala:1099)     @ org.apache.spark.api.java.javarddlike$class.count(javarddlike.scala:442)     @ org.apache.spark.api.java.abstractjavarddlike.count(javarddlike.scala:47)     @ simpleapp.main(simpleapp.java:13)     @ sun.reflect.nativemethodaccessorimpl.invoke0(native method)     @ sun.reflect.nativemethodaccessorimpl.invoke(nativemethodaccessorimpl.java:57)     @ sun.reflect.delegatingmethodaccessorimpl.invoke(delegatingmethodaccessorimpl.java:43)     @ 
java.lang.reflect.method.invoke(method.java:606)     @ org.apache.spark.deploy.sparksubmit$.org$apache$spark$deploy$sparksubmit$$runmain(sparksubmit.scala:665)     @ org.apache.spark.deploy.sparksubmit$.dorunmain$1(sparksubmit.scala:170)     @ org.apache.spark.deploy.sparksubmit$.submit(sparksubmit.scala:193)     @ org.apache.spark.deploy.sparksubmit$.main(sparksubmit.scala:112)     @ org.apache.spark.deploy.sparksubmit.main(sparksubmit.scala) 

What is the problem? Where is the address /user/hadoop/your_spark_home/readme.md coming from?

This is related to the default Spark configuration. Take a look (or use grep) in the conf directory of your Spark home directory. You should find a spark-env.sh file, which may contain a reference to the strange file. In fact, Spark is trying to load the file from HDFS (which is kind of standard if you run Spark on a cluster: input and output should be reachable by the master and by the worker slaves). If you use Spark locally, you have to configure the Spark context using the setMaster method. Here is my version:

/** Word2Vec demo that connects to an explicit Spark master, times each
  * stage, saves the trained model and reloads it.
  *
  * Needs SparkConf, SparkContext, Word2Vec and Word2VecModel in scope
  * (org.apache.spark._ and org.apache.spark.mllib.feature._).
  */
object SparkDemo {

  /** Runs `job`, prints how long it took, and returns its result.
    *
    * @param key label printed in front of the elapsed time
    * @param job by-name computation to time; evaluated exactly once
    * @tparam A result type of `job`
    * @return whatever `job` evaluates to
    */
  def log[A](key: String)(job: => A): A = {
    val start = System.currentTimeMillis
    val output = job
    println("===> %s in %s seconds"
      .format(key, (System.currentTimeMillis - start) / 1000.0))
    output
  }

  /** Entry point.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {

    val modelName = "w2vmodel"

    val sc = new SparkContext(
      new SparkConf()
        .setAppName("sparkdemo")
        .set("spark.executor.memory", "4g")
        .set("spark.driver.maxresultsize", "16g")
        .setMaster("spark://192.168.1.53:7077") // IP of the Spark master.
        // .setMaster("local[2]") // did not work for the author: workers lose
        //                        // contact with the master after 120s
    )

    // Look in the target folder if unsure how the jar is named.
    // One-liner to compile / run: sbt package && sbt run
    sc.addJar("./target/scala-2.10/sparkling_2.10-0.1.jar")

    // Each input line becomes one sequence of whitespace-separated tokens.
    val input = sc.textFile("./text8").map(line => line.split(" ").toSeq)

    val word2vec = new Word2Vec()

    val model = log("compute model") { word2vec.fit(input) }
    log("save model") { model.save(sc, modelName) }

    val synonyms = model.findSynonyms("china", 40)
    for ((synonym, cosineSimilarity) <- synonyms) {
      println(s"$synonym $cosineSimilarity")
    }

    // Reload the model that was just saved, to show the round trip.
    val model2 = log("reload model") { Word2VecModel.load(sc, modelName) }
  }
}

Comments

Popular posts from this blog

python - pip install -U PySide error -

arrays - C++ error: a brace-enclosed initializer is not allowed here before ‘{’ token -

apache - setting document root in antoher partition on ubuntu -