2016-05-25 35 views
0

我需要通过共同的行或列标签合并2个大型csv文件(每个大约4000万个数据元素),这可以由用户指定。例如,如果dataset1.csv包含:通过普通标签有效合并java中的2个大型csv文件

patient_id x1  x2 x3 
pi1   1  2  3 
pi3   4  5  6 

dataset2.csv包含:

patient_id y1 y2 y3 
pi0   0  0  0 
pi1   11 12 13 
pi2   99 98 97 
pi3   14 15 16 

用户可以通过指定的行标签,以这两个文件合并(该患者ID ),并将所得output.csv将是:

patient_id x1 x2 x3 y1 y2 y3 
pi1   1 2 3 11 12 13 
pi3   4 5 6 14 15 16 

由于我们只将常见(相交)的患者ID的信息组合到两个输入文件中。我对这个问题的策略是创建一个HashMap,其中要合并的行或列标签(在这种情况下是行标签,它们是患者ID)是键,并且患者ID的数据作为ArrayList存储为值。我为每个输入数据文件创建一个HashMap,然后根据相似的键合并这些值。我将数据表示为ArrayList>类型的二维ArrayList,因此合并的数据也具有此类型。然后,我只需遍历合并的ArrayList>对象,我将其称为数据类型对象,然后将其打印到文件中。代码如下:

以下是依赖于下面的Data类文件的DataMerge类。

import java.util.HashMap; 
import java.util.ArrayList; 

public class DataMerge { 


/**Merges two Data objects by a similar label. For example, if two data sets represent 
* different data for the same set of patients, which are represented by their unique patient 
* ID, mergeData will return a data set containing only those patient IDs that are common to both 
* data sets along with the data represented in both data sets. labelInRow1 and labelInRow2 separately 
* indicate whether the common labels are in separate rows(true) of d1 and d2, respectively, or separate columns otherwise.*/ 


public static Data mergeData(Data d1, Data d2, boolean labelInRow1, 
     boolean labelInRow2){ 
    ArrayList<ArrayList<String>> mergedData = new ArrayList<ArrayList<String>>(); 
    HashMap<String,ArrayList<String>> d1Map = d1.mapFeatureToData(labelInRow1); 
    HashMap<String,ArrayList<String>> d2Map = d2.mapFeatureToData(labelInRow2); 
    ArrayList<String> d1Features; 
    ArrayList<String> d2Features; 

    if (labelInRow1){ 
     d1Features = d1.getColumnLabels(); 
    } else { 
     d1Features = d1.getRowLabels(); 
    } 
    if (labelInRow2){ 
     d2Features = d2.getColumnLabels(); 
    } else { 
     d2Features = d2.getRowLabels(); 
    } 
    d1Features.trimToSize(); 
    d2Features.trimToSize(); 

    ArrayList<String> mergedFeatures = new ArrayList<String>(); 
    if ((d1.getLabelLabel() != "") && (d1.getLabelLabel() == "")) { 
     mergedFeatures.add(d1.getLabelLabel()); 
    } 
    else if ((d1.getLabelLabel() == "") && (d1.getLabelLabel() != "")) { 
     mergedFeatures.add(d2.getLabelLabel()); 
    } else { 
     mergedFeatures.add(d1.getLabelLabel()); 
    } 

    mergedFeatures.addAll(d1Features); 
    mergedFeatures.addAll(d2Features); 
    mergedFeatures.trimToSize(); 
    mergedData.add(mergedFeatures); 

    for (String key : d1Map.keySet()){ 
     ArrayList<String> curRow = new ArrayList<String>(); 
     if (d2Map.containsKey(key)){ 
      curRow.add(key); 
      curRow.addAll(d1Map.get(key)); 
      curRow.addAll(d2Map.get(key)); 
      curRow.trimToSize(); 
      mergedData.add(curRow); 
     } 
    } 
    mergedData.trimToSize(); 
    Data result = new Data(mergedData, true); 
    return result; 
} 

} 

下面是Data type对象及其关联的HashMap生成函数,其中包含一些行和列标签提取方法。

import java.util.*; 
import java.io.*; 

/**Represents an unlabeled or labeled data set as a series of nested  ArrayLists, where each nested 
* ArrayList represents a line of the input data.*/ 

public class Data { 
private ArrayList<String> colLabels = new ArrayList<String>(); //row labels 

private ArrayList<String> rowLabels = new ArrayList<String>(); //column labels 

private String labelLabel; 

private ArrayList<ArrayList<String>> unlabeledData; //data without row and column labels 



/**Returns an ArrayList of ArrayLists, where each nested ArrayList represents a line 
*of the input file.*/ 
@SuppressWarnings("resource") 
private static ArrayList<ArrayList<String>> readFile(String filePath, String fileSep){ 
    ArrayList<ArrayList<String>> result = new ArrayList<ArrayList<String>>(); 
    try{ 
     BufferedReader input = new BufferedReader(new FileReader (filePath)); 
     String line = input.readLine(); 
     while (line != null){ 
      String[] splitLine = line.split(fileSep); 
      result.add(new ArrayList<String>(Arrays.asList(splitLine))); 
      line = input.readLine(); 
     } 
    } 
    catch (Exception e){ 
     System.err.println(e); 
    } 
    result.trimToSize();; 
    return result; 
} 


/**Returns an ArrayList of ArrayLists, where each nested ArrayList represents a line of the input 
* data but WITHOUT any row or column labels*/ 


private ArrayList<ArrayList<String>> extractLabelsAndData(String filePath, String fileSep){ 
    ArrayList<ArrayList<String>> tempData = new ArrayList<ArrayList<String>>(); 
    tempData.addAll(readFile(filePath, fileSep)); 
    tempData.trimToSize(); 
    this.colLabels.addAll(tempData.remove(0)); 
    this.labelLabel = this.colLabels.remove(0); 
    this.colLabels.trimToSize(); 
    for (ArrayList<String> line : tempData){ 
     this.rowLabels.add(line.remove(0)); 
    } 
    this.rowLabels.trimToSize(); 
    return tempData; 
} 




/**Returns an ArrayList of ArrayLists, where each nested ArrayList represents a line of the input 
* data but WITHOUT any row or column labels. Does mutate the original data*/ 
private ArrayList<ArrayList<String>> extractLabelsAndData (ArrayList<ArrayList<String>> data){ 
    ArrayList<ArrayList<String>> result = new ArrayList<ArrayList<String>>(); 
    for (ArrayList<String> line : data){ 
     ArrayList<String> temp = new ArrayList<String>(); 
     for (String element : line){ 
      temp.add(element); 
     } 
     temp.trimToSize(); 
     result.add(temp); 
    } 
    this.colLabels.addAll(result.remove(0)); 
    this.labelLabel = this.colLabels.remove(0); 
    this.colLabels.trimToSize(); 
    for (ArrayList<String> line : result){ 
     this.rowLabels.add(line.remove(0)); 
    } 
    this.rowLabels.trimToSize(); 
    result.trimToSize(); 
    return result; 
} 


/**Returns the labelLabel for the data*/ 
public String getLabelLabel(){ 
    return this.labelLabel; 
} 


/**Returns an ArrayList of the labels while maintaining the order 
* in which they appear in the data. Row indicates that the desired 
* features are all in the same row. Assumed that the labels are in the 
* first row of the data. */ 
public ArrayList<String> getColumnLabels(){ 
    return this.colLabels; 
} 


/**Returns an ArrayList of the labels while maintaining the order 
* in which they appear in the data. Column indicates that the desired 
* features are all in the same column. Assumed that the labels are in the 
* first column of the data.*/ 
public ArrayList<String> getRowLabels(){ 
    return this.rowLabels; 
} 


/**Creates a HashMap where a list of feature labels are mapped to the entire data. For example, 
* if a data set contains patient IDs and test results, this function can be used to create 
* a HashMap where the keys are the patient IDs and the values are an ArrayList of the test 
* results. The boolean input isRow, which, when true, designates that the 
* desired keys are listed in the rows or false if they are in the columns.*/ 
public HashMap<String, ArrayList<String>> mapFeatureToData(boolean isRow){ 
    HashMap<String, ArrayList<String>> featureMap = new HashMap<String,ArrayList<String>>(); 
    if (!isRow){ 
     for (ArrayList<String> line : this.unlabeledData){ 
      for (int i = 0; i < this.colLabels.size(); i++){ 
       if (featureMap.containsKey(this.colLabels.get(i))){ 
        featureMap.get(this.colLabels.get(i)).add(line.get(i)); 
       } else{ 
        ArrayList<String> firstValue = new ArrayList<String>(); 
        firstValue.add(line.get(i)); 
        featureMap.put(this.colLabels.get(i), firstValue); 
       } 
      } 
     } 
    } else { 
     for (int i = 0; i < this.rowLabels.size(); i++){ 
      if (!featureMap.containsKey(this.rowLabels.get(i))){ 
       featureMap.put(this.rowLabels.get(i), this.unlabeledData.get(i)); 
      } else { 
       featureMap.get(this.rowLabels.get(i)).addAll(this.unlabeledData.get(i)); 
      } 
     } 
    } 
    return featureMap; 
} 


/**Writes the data to a file in the specified outputPath. sep indicates the data delimiter. 
* labeledOutput indicates whether or not the user wants the data written to a file to be 
* labeled or unlabeled. If the data was unlabeled to begin with, then labeledOutput 
* should not be set to true. */ 
public void writeDataToFile(String outputPath, String sep){ 
    try { 
     PrintStream writer = new PrintStream(new BufferedOutputStream (new FileOutputStream (outputPath, true))); 
     String sol = this.labelLabel + sep; 
     for (int n = 0; n < this.colLabels.size(); n++){ 
      if (n == this.colLabels.size()-1){ 
       sol += this.colLabels.get(n) + "\n"; 
      } else { 
       sol += this.colLabels.get(n) + sep; 
      } 
     } 
     for (int i = 0; i < this.unlabeledData.size(); i++){ 
      ArrayList<String> line = this.unlabeledData.get(i); 
      sol += this.rowLabels.get(i) + sep; 
      for (int j = 0; j < line.size(); j++){ 
       if (j == line.size()-1){ 
        sol += line.get(j); 
       } else { 
        sol += line.get(j) + sep; 
       } 
      } 
      sol += "\n"; 
     } 
     sol = sol.trim(); 
     writer.print(sol); 
     writer.close(); 

    } catch (Exception e){ 
     System.err.println(e); 
    } 
} 


/**Constructor for Data object. filePath specifies the input file directory, 
* fileSep indicates the file separator used in the input file, and hasLabels 
* designates whether the input data has row and column labels. Note that if 
* hasLabels is set to true, it is assumed that there are BOTH row and column labels*/ 
public Data(String filePath, String fileSep, boolean hasLabels){ 
    if (hasLabels){ 
     this.unlabeledData = extractLabelsAndData(filePath, fileSep); 
     this.unlabeledData.trimToSize(); 
    } else { 
     this.unlabeledData = readFile(filePath, fileSep); 
     this.unlabeledData.trimToSize(); 
    } 

} 


/**Constructor for Data object that accepts nested ArrayLists as inputs*/ 
public Data (ArrayList<ArrayList<String>> data, boolean hasLabels){ 
    if (hasLabels){ 
     this.unlabeledData = extractLabelsAndData(data); 
     this.unlabeledData.trimToSize(); 
    } else { 
     this.unlabeledData = data; 
     this.unlabeledData.trimToSize(); 
    } 
} 
} 

该程序适用于小数据集,但已超过5天,合并仍未完成。我正在寻找更有效的时间和内存解决方案。有人建议使用字节数组而不是字符串,这可能会使其运行速度更快。任何人有任何建议?

编辑:我在我的代码中进行了一些挖掘,发现读取输入文件并合并它们几乎没有时间(如20秒)。写入文件是需要5天以上的部分

+0

不是你问的,但如果我合并csv文件,Java将是我最后的解决方案。:)我会留下更多有见识的人来回答有关Java效率的问题。 –

+0

我完全同意。我可以在R中轻松完成此任务,但我需要使用Java –

+0

您可以使用从JRI调用的R库吗?或者任何通过Hibernate和朋友调用的SQL风格? –

回答

1

您将您所有数百万行数据的所有数据字段连接成一个巨大的字符串,然后编写该单个字符串。当你分配和重新分配极大的字符串时,内存抖动会导致内存缓慢死亡,一遍又一遍地将它们复制到的每个字段,并将分隔符添加到字符串中。第3天或第4天左右,每个字符串是......数百万字符长? ...和你可怜的垃圾收集器出汗,把它拿出来。

不要这样做。

分别构建输出文件的每一行并写入它。然后建立下一行。

此外,使用StringBuilder类来构建线条,尽管前面的步骤可能会得到如此改进,您甚至可能无法理解这一点。虽然这是做到这一点的方法,你应该学习如何。

+0

感谢您的评论。我明白你的意思,写出每个文件绝对比写入所有文件更好。我会做出改变并让你知道!感谢您的帮助 –

+0

不完全是。正如您发现的那样,您当然可以从两个文件中读取所有数据,甚至可以进行合并。但是_一次构建输出文件的一行(一个'patient_id'的数据),并写入它。然后扔掉那条线(当然是隐含的)然后再做一次。然后再次。对于每个'patient_id'。然后,你建立的这些字符串每个不超过100-120个字符,而且你不会把它们放在比写它们更长的时间。没有记忆压力。 – davidbak

+0

对不起!我的意思是“写每一行”而不是“写每个文件”。我绝对明白你的意思。但是,为了理解,为什么试图编写连接的输出字符串需要比编写每行更长的时间?似乎基本上正在写入相同数量的数据。这是否仅仅是因为发送了大量的文本扼杀了内存,使得写入过程的内存减少了? –