2014-03-25 81 views
11

How do I run a simple Spark application from the Eclipse/IntelliJ IDE?

To ease development of my map-reduce tasks before actually deploying them to Hadoop, I test with a simple map-reducer I wrote:

object mapreduce { 
  import scala.collection.JavaConversions._ 

  val intermediate = new java.util.HashMap[String, java.util.List[Int]] 
                                                  //> intermediate : java.util.HashMap[String,java.util.List[Int]] = {} 
  val result = new java.util.ArrayList[Int]       //> result : java.util.ArrayList[Int] = [] 

  // Collect a value under its key in the intermediate map.
  def emitIntermediate(key: String, value: Int) { 
    if (!intermediate.containsKey(key)) { 
      intermediate.put(key, new java.util.ArrayList) 
    } 
    intermediate.get(key).add(value) 
  }                                               //> emitIntermediate: (key: String, value: Int)Unit 

  // Record a final reduced value.
  def emit(value: Int) { 
    println("value is " + value) 
    result.add(value) 
  }                                               //> emit: (value: Int)Unit 

  // Run the mapper over every input line, then the reducer over every
  // intermediate key, and finally print the collected results.
  def execute(data: java.util.List[String], mapper: String => Unit, reducer: (String, java.util.List[Int]) => Unit) { 
    for (line <- data) { 
      mapper(line) 
    } 

    for (keyVal <- intermediate) { 
      reducer(keyVal._1, intermediate.get(keyVal._1)) 
    } 

    for (item <- result) { 
      println(item) 
    } 
  }                                               //> execute: (data: java.util.List[String], mapper: String => Unit, reducer: (St 
                                                  //| ring, java.util.List[Int]) => Unit)Unit 

  // Parse a JSON record of the form ["key", "some text"] and emit (word, 1) pairs.
  def mapper(record: String) { 
    val jsonAttributes = com.nebhale.jsonpath.JsonPath.read("$", record, classOf[java.util.ArrayList[String]]) 
    println("jsonAttributes are " + jsonAttributes) 
    val key = jsonAttributes.get(0) 
    val value = jsonAttributes.get(1) 

    println("key is " + key) 
    val delims = "[ ]+" 
    val words = value.split(delims) 
    for (w <- words) { 
      emitIntermediate(w, 1) 
    } 
  }                                               //> mapper: (record: String)Unit 

  // Sum the counts collected for a key and emit the total.
  def reducer(key: String, listOfValues: java.util.List[Int]) = { 
    var total = 0 
    for (value <- listOfValues) { 
      total += value 
    } 

    emit(total) 
  }                                               //> reducer: (key: String, listOfValues: java.util.List[Int])Unit 

  val dataToProcess = new java.util.ArrayList[String] 
                                                  //> dataToProcess : java.util.ArrayList[String] = [] 
  dataToProcess.add("[\"test1\" , \"test1 here is another test1 test1 \"]") 
                                                  //> res0: Boolean = true 
  dataToProcess.add("[\"test2\" , \"test2 here is another test2 test1 \"]") 
                                                  //> res1: Boolean = true 

  execute(dataToProcess, mapper, reducer)         //> jsonAttributes are [test1, test1 here is another test1 test1 ] 
                                                  //| key is test1 
                                                  //| jsonAttributes are [test2, test2 here is another test2 test1 ] 
                                                  //| key is test2 
                                                  //| value is 2 
                                                  //| value is 2 
                                                  //| value is 4 
                                                  //| value is 2 
                                                  //| value is 2 
                                                  //| 2 
                                                  //| 2 
                                                  //| 4 
                                                  //| 2 
                                                  //| 2 

  for (keyValue <- intermediate) { 
    println(keyValue._1 + "->" + keyValue._2.size) //> another->2 
                                                  //| is->2 
                                                  //| test1->4 
                                                  //| here->2 
                                                  //| test2->2 
  } 

} 

This lets me run my MapReduce tasks in my Eclipse IDE on Windows before deploying to an actual Hadoop cluster. I would like to do something similar with Spark: write Spark code in Eclipse and test it before deploying to a Spark cluster. Is this possible with Spark? Since Spark runs on top of Hadoop, does that mean I cannot run Spark without first installing Hadoop? In other words, can I run code like the following using just the Spark libraries?

import org.apache.spark.SparkContext 
import org.apache.spark.SparkContext._ 

object SimpleApp { 
  def main(args: Array[String]) { 
    val logFile = "$YOUR_SPARK_HOME/README.md" // Should be some file on your system 
    val sc = new SparkContext("local", "Simple App", "YOUR_SPARK_HOME", 
      List("target/scala-2.10/simple-project_2.10-1.0.jar")) 
    val logData = sc.textFile(logFile, 2).cache() 
    val numAs = logData.filter(line => line.contains("a")).count() 
    val numBs = logData.filter(line => line.contains("b")).count() 
    println("Lines with a: %s, Lines with b: %s".format(numAs, numBs)) 
  } 
} 

(taken from https://spark.apache.org/docs/0.9.0/quick-start.html#a-standalone-app-in-scala)

If so, would I need to include the Spark libraries in my project?

+1

[spark.apache.org](https://spark.apache.org/downloads.html) indicates spark-core_2.10, version 0.9.0-incubating. I would start with it and its dependencies. You can find them [here](http://mvnrepository.com/artifact/org.apache.spark/spark-core_2.10/0.9.0-incubating) or [here](http://search.maven.org/#browse|-183575761). If you didn't create your project with a dependency-management plugin that generates the Eclipse project for you, you will have to download the dependencies yourself. One of them appears to be hadoop-client. – n0741337

Answer

2

Add the following to your build.sbt: `libraryDependencies += "org.apache.spark" %% "spark-core" % "0.9.1"`, and make sure your scalaVersion is set (e.g. `scalaVersion := "2.10.3"`).
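For reference, a minimal build.sbt along those lines might look like this (the project name and versions are only illustrative):

// build.sbt — minimal sketch; name and versions are illustrative
name := "simple-project"

version := "1.0"

scalaVersion := "2.10.3"

libraryDependencies += "org.apache.spark" %% "spark-core" % "0.9.1"

Running sbt compile from the project root should then pull in spark-core and its transitive dependencies (one of which is hadoop-client, as noted in the comment above).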

Also, if you are just running the program locally, you can skip the last two arguments to SparkContext, like so: `val sc = new SparkContext("local", "Simple App")`.
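Putting both suggestions together, a local-only version of the quick-start example might look like this (the file path is just a placeholder for any text file on your machine):

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

// Sketch of the quick-start app run entirely in local mode,
// so no jar list or SPARK_HOME argument is needed.
object SimpleAppLocal {
  def main(args: Array[String]) {
    val logFile = "/path/to/README.md" // placeholder: any text file on your machine
    val sc = new SparkContext("local", "Simple App")
    val logData = sc.textFile(logFile, 2).cache()
    val numAs = logData.filter(line => line.contains("a")).count()
    val numBs = logData.filter(line => line.contains("b")).count()
    println("Lines with a: %s, Lines with b: %s".format(numAs, numBs))
  }
}

With the "local" master this can be run directly from Eclipse or IntelliJ as an ordinary main method, without a Hadoop installation.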

Finally, Spark can run on Hadoop, but it can also run in standalone mode. See: https://spark.apache.org/docs/0.9.1/spark-standalone.html

+0

You will also need to regenerate the Eclipse project using sbteclipse, and probably refresh the project inside Eclipse. –
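For example, sbteclipse is added in project/plugins.sbt and then invoked from sbt (the plugin version shown here is only illustrative):

// project/plugins.sbt — sketch; version is illustrative
addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.5.0")

Running sbt eclipse afterwards regenerates the .classpath and .project files so Eclipse picks up the new spark-core dependency.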

+0

@IulianDragos That's a good point. Thanks. –

+5

Does this work directly? Don't you need to package a jar and submit it with spark-submit? – Neil
