我想在百万宋数据集上使用LinearRegressionWithSGD,并且我的模型返回NaN作为权重,0.0作为截距。这个错误可能是什么问题?我在独立模式下使用Spark 1.40。LinearRegressionWithSGD()返回NaN
的样本数据:http://www.filedropper.com/part-00000
这里是我的全码:
//导入依赖
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.GeneralizedLinearAlgorithm
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
//定义RDD
val data =
sc.textFile("/home/naveen/Projects/millionSong/YearPredictionMSD.txt")
//转换为标记点
def parsePoint (line: String): LabeledPoint = {
val x = line.split(",")
val head = x.head.toDouble
val tail = Vectors.dense(x.tail.map(x => x.toDouble))
return LabeledPoint(head,tail)
}
//查找范围
val parsedDataInit = data.map(x => parsePoint(x))
val onlyLabels = parsedDataInit.map(x => x.label)
val minYear = onlyLabels.min()
val maxYear = onlyLabels.max()
//移标签
val parsedData = parsedDataInit.map(x => LabeledPoint(x.label-minYear
, x.features))
//培训,验证和测试设置
val splits = parsedData.randomSplit(Array(0.8, 0.1, 0.1), seed = 123)
val parsedTrainData = splits(0).cache()
val parsedValData = splits(1).cache()
val parsedTestData = splits(2).cache()
val nTrain = parsedTrainData.count()
val nVal = parsedValData.count()
val nTest = parsedTestData.count()
// RMSE
def squaredError(label: Double, prediction: Double): Double = {
return scala.math.pow(label - prediction,2)
}
def calcRMSE(labelsAndPreds: RDD[List[Double]]): Double = {
return scala.math.sqrt(labelsAndPreds.map(x =>
squaredError(x(0),x(1))).mean())
}
val numIterations = 100
val stepSize = 1.0
val regParam = 0.01
val regType = "L2"
val algorithm = new LinearRegressionWithSGD()
algorithm.optimizer
.setNumIterations(numIterations)
.setStepSize(stepSize)
.setRegParam(regParam)
val model = algorithm.run(parsedTrainData)
给出您的数据样本,以便我们可以重现错误 – eliasah