,以减轻我的地图的发展减少对Hadoop的运行之前,实际部署的任务的Hadoop我测试用一个简单的地图减速器我写的任务:如何从Eclipse/Intellij IDE运行简单的Spark应用程序?
object mapreduce {
import scala.collection.JavaConversions._
val intermediate = new java.util.HashMap[String, java.util.List[Int]]
//> intermediate : java.util.HashMap[String,java.util.List[Int]] = {}
val result = new java.util.ArrayList[Int] //> result : java.util.ArrayList[Int] = []
def emitIntermediate(key: String, value: Int) {
if (!intermediate.containsKey(key)) {
intermediate.put(key, new java.util.ArrayList)
}
intermediate.get(key).add(value)
} //> emitIntermediate: (key: String, value: Int)Unit
def emit(value: Int) {
println("value is " + value)
result.add(value)
} //> emit: (value: Int)Unit
def execute(data: java.util.List[String], mapper: String => Unit, reducer: (String, java.util.List[Int]) => Unit) {
for (line <- data) {
mapper(line)
}
for (keyVal <- intermediate) {
reducer(keyVal._1, intermediate.get(keyVal._1))
}
for (item <- result) {
println(item)
}
} //> execute: (data: java.util.List[String], mapper: String => Unit, reducer: (St
//| ring, java.util.List[Int]) => Unit)Unit
def mapper(record: String) {
var jsonAttributes = com.nebhale.jsonpath.JsonPath.read("$", record, classOf[java.util.ArrayList[String]])
println("jsonAttributes are " + jsonAttributes)
var key = jsonAttributes.get(0)
var value = jsonAttributes.get(1)
println("key is " + key)
var delims = "[ ]+";
var words = value.split(delims);
for (w <- words) {
emitIntermediate(w, 1)
}
} //> mapper: (record: String)Unit
def reducer(key: String, listOfValues: java.util.List[Int]) = {
var total = 0
for (value <- listOfValues) {
total += value;
}
emit(total)
} //> reducer: (key: String, listOfValues: java.util.List[Int])Unit
var dataToProcess = new java.util.ArrayList[String]
//> dataToProcess : java.util.ArrayList[String] = []
dataToProcess.add("[\"test1\" , \"test1 here is another test1 test1 \"]")
//> res0: Boolean = true
dataToProcess.add("[\"test2\" , \"test2 here is another test2 test1 \"]")
//> res1: Boolean = true
execute(dataToProcess, mapper, reducer) //> jsonAttributes are [test1, test1 here is another test1 test1 ]
//| key is test1
//| jsonAttributes are [test2, test2 here is another test2 test1 ]
//| key is test2
//| value is 2
//| value is 2
//| value is 4
//| value is 2
//| value is 2
//| 2
//| 2
//| 4
//| 2
//| 2
for (keyValue <- intermediate) {
println(keyValue._1 + "->"+keyValue._2.size)//> another->2
//| is->2
//| test1->4
//| here->2
//| test2->2
}
}
这让我对我的Eclipse中运行我的MapReduce任务的在部署到实际Hadoop集群之前,Windows上的IDE。我想为Spark执行类似的操作,或者有能力在Eclipse中编写Spark代码以在部署到Spark集群之前进行测试。这可能与Spark?由于Spark在Hadoop之上运行,这是否意味着我不能在未安装Hadoop的情况下运行Spark?换句话说,我可以使用Spark库来运行代码吗? :
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
object SimpleApp {
def main(args: Array[String]) {
val logFile = "$YOUR_SPARK_HOME/README.md" // Should be some file on your system
val sc = new SparkContext("local", "Simple App", "YOUR_SPARK_HOME",
List("target/scala-2.10/simple-project_2.10-1.0.jar"))
val logData = sc.textFile(logFile, 2).cache()
val numAs = logData.filter(line => line.contains("a")).count()
val numBs = logData.filter(line => line.contains("b")).count()
println("Lines with a: %s, Lines with b: %s".format(numAs, numBs))
}
}
从https://spark.apache.org/docs/0.9.0/quick-start.html#a-standalone-app-in-scala
采取若然是我需要我的项目中包括星火库?
[spark.apache.org](https://spark.apache.org/downloads.html)表示火花core_2.10,版本0.9.0,温育。我会开始与它的依赖关系。你可以找到[这里](http://mvnrepository.com/artifact/org.apache.spark/spark-core_2.10/0.9.0-incubating)或者[这里](http://search.maven.org /#浏览| -183575761)。如果您没有使用依赖管理插件创建您的项目来创建您的eclipse项目,则必须自行下载依赖关系。其中一个似乎是hadoop-client。 – n0741337