2017-06-14 30 views
2

给定一列包含 NaN 条目的稠密向量(DenseVector),我想计算各列之间的相关性。有没有办法在不拆开向量来清理数据的情况下做到这一点?如何在含有空值的列上计算 Spark 相关性?

#pyspark 
from pyspark.sql import SparkSession 
from pyspark.ml.linalg import Vectors 
from pyspark.mllib.linalg import Vectors as MlVectors # (
from pyspark.mllib.stat import Statistics 

def get_data():
    """Build the demo DataFrame used to reproduce the NaN correlation problem.

    Returns a DataFrame with two columns: ``features`` (a dense vector of
    three components) and ``foo`` (an integer). Three of the six vectors
    are constructed with a ``None`` component.
    """
    rows = [
        (Vectors.dense(1., 3., 2.), 0),
        (Vectors.dense(None, 4., 1.), 1),
        (Vectors.dense(3., None, 0.), 2),
        (Vectors.dense(4., 12., None), 3),
        (Vectors.dense(5., 0., 1.), 5),
        (Vectors.dense(6., -1., 0.), 6),
    ]
    schema = ["features", "foo"]
    session = SparkSession.builder.getOrCreate()
    return session.createDataFrame(rows, schema)


def correlation(df):
    """Print the correlation matrix of the ``features`` vector column of *df*.

    Each row's vector is re-wrapped with ``MlVectors.dense`` before the
    resulting RDD is handed to ``Statistics.corr``.
    """
    feature_rows = df.select("features").rdd
    mllib_vectors = feature_rows.map(lambda record: MlVectors.dense(record[0]))
    corr_matrix = Statistics.corr(mllib_vectors)
    print(corr_matrix)


if __name__ == '__main__':
    # Build the demo frame, then print its correlation matrix.
    demo_df = get_data()
    correlation(demo_df)
# OUTPUT (every off-diagonal entry is nan):
# [[ 1. nan nan]
# [ nan 1. nan]
# [ nan nan 1.]]
+0

我只对输出矩阵的最后一列(行)感兴趣,但这与问题无关。 –

回答

0

我看到没有人想深入这一点。因此,这里是一个尽可能慢的解决方案:

from pyspark.sql import SparkSession 
from pyspark.ml.linalg import Vectors 
from pyspark.mllib.linalg import Vectors as MlVectors # (
from pyspark.mllib.stat import Statistics 
import numpy as np 

def get_data():
    """Build the demo DataFrame of dense feature vectors with missing entries.

    Returns a DataFrame with two columns: ``features`` (a dense vector of
    three components) and ``foo`` (an integer). Three of the six vectors
    are constructed with a ``None`` component.
    """
    rows = [
        (Vectors.dense(1., 3., 2.), 0),
        (Vectors.dense(None, 4., 1.), 1),
        (Vectors.dense(3., None, 0.), 2),
        (Vectors.dense(4., 12., None), 3),
        (Vectors.dense(5., 0., 1.), 5),
        (Vectors.dense(6., -1., 0.), 6),
    ]
    schema = ["features", "foo"]
    session = SparkSession.builder.getOrCreate()
    return session.createDataFrame(rows, schema)


def correlation(df):
    """Print and return the correlation matrix of the ``features`` column.

    Each row's vector is re-wrapped with ``MlVectors.dense`` before the
    resulting RDD is handed to ``Statistics.corr``.

    Args:
        df: DataFrame with a vector column named ``features``.

    Returns:
        The matrix produced by ``Statistics.corr``. It is returned (as well
        as printed) so that callers which use the result do not get ``None``.
    """
    digestible_data = df.select("features").rdd.map(
        lambda row: MlVectors.dense(row[0])
    )
    corr_matrix = Statistics.corr(digestible_data)
    print(corr_matrix)
    return corr_matrix


def nullproofed_correlation(df, column='features'):
    """Compute pairwise Pearson correlations, skipping rows with NaN per pair.

    For every pair of vector components (i, j), the rows in which either
    component is NaN are filtered out and ``Statistics.corr`` is run on the
    remaining two-component vectors. One Spark job is launched per pair.

    Args:
        df: DataFrame holding a vector column.
        column: Name of the vector column to correlate.

    Returns:
        A square ``float32`` numpy array with ones on the diagonal and the
        pairwise correlation mirrored into ``[i, j]`` and ``[j, i]``. The
        array is also printed.
    """
    num_colls = len(df.head()[column])
    res = np.ones((num_colls, num_colls), dtype=np.float32)

    # Select once, outside the loops, and honour the `column` argument
    # instead of a hard-coded "features".
    vectors = df.select(column).rdd

    for i in range(1, num_colls):
        for j in range(i):
            # i and j are bound as lambda defaults so the lazily evaluated
            # map does not depend on the loop variables' later values.
            feature_pair_rdd = vectors.map(
                lambda row, i=i, j=j: MlVectors.dense([row[0][i], row[0][j]])
            ).filter(
                lambda pair: not np.isnan(pair[0]) and not np.isnan(pair[1])
            )
            corr_matrix = Statistics.corr(feature_pair_rdd, method="pearson")
            corr = corr_matrix[0, 1]
            # The correlation matrix is symmetric: fill both triangles.
            res[i, j], res[j, i] = corr, corr
    print(res)
    return res


if __name__ == '__main__':
    # Build the demo DataFrame once and reuse it for both computations.
    data = get_data()
    # Both helpers print their own result, so the calls are not wrapped in
    # another print().
    correlation(data)
    nullproofed_correlation(data)

一般而言,只能对实际存在的数据计算相关性。因此,合理的做法是新建一列来表示值是否存在,然后仅针对存在的数据计算相关性,并在其他地方把这个“presence”(是否存在)信息用作附加特征。遗憾的是,在处理稀疏数据时,Spark 的相关性计算帮不上忙。