通过普通标签有效合并java中的2个大型csv文件

我需要通过共同的行或列标签合并2个大型csv文件（每个大约4000万个数据元素），这可以由用户指定。例如，如果dataset1.csv包含：通过普通标签有效合并java中的2个大型csv文件

patient_id x1  x2 x3 
pi1   1  2  3 
pi3   4  5  6

和dataset2.csv包含：

patient_id y1 y2 y3 
pi0   0  0  0 
pi1   11 12 13 
pi2   99 98 97 
pi3   14 15 16

用户可以通过指定的行标签，以这两个文件合并（该患者ID ），并将所得output.csv将是：

patient_id x1 x2 x3 y1 y2 y3 
pi1   1 2 3 11 12 13 
pi3   4 5 6 14 15 16

由于我们只将常见（相交）的患者ID的信息组合到两个输入文件中。我对这个问题的策略是创建一个HashMap，其中要合并的行或列标签（在这种情况下是行标签，它们是患者ID）是键，并且患者ID的数据作为ArrayList存储为值。我为每个输入数据文件创建一个HashMap，然后根据相似的键合并这些值。我将数据表示为ArrayList>类型的二维ArrayList，因此合并的数据也具有此类型。然后，我只需遍历合并的ArrayList>对象，我将其称为数据类型对象，然后将其打印到文件中。代码如下：

以下是依赖于下面的Data类文件的DataMerge类。

import java.util.HashMap; 
import java.util.ArrayList; 

public class DataMerge { 


/**Merges two Data objects by a similar label. For example, if two data sets represent 
* different data for the same set of patients, which are represented by their unique patient 
* ID, mergeData will return a data set containing only those patient IDs that are common to both 
* data sets along with the data represented in both data sets. labelInRow1 and labelInRow2 separately 
* indicate whether the common labels are in separate rows(true) of d1 and d2, respectively, or separate columns otherwise.*/ 


public static Data mergeData(Data d1, Data d2, boolean labelInRow1, 
     boolean labelInRow2){ 
    ArrayList<ArrayList<String>> mergedData = new ArrayList<ArrayList<String>>(); 
    HashMap<String,ArrayList<String>> d1Map = d1.mapFeatureToData(labelInRow1); 
    HashMap<String,ArrayList<String>> d2Map = d2.mapFeatureToData(labelInRow2); 
    ArrayList<String> d1Features; 
    ArrayList<String> d2Features; 

    if (labelInRow1){ 
     d1Features = d1.getColumnLabels(); 
    } else { 
     d1Features = d1.getRowLabels(); 
    } 
    if (labelInRow2){ 
     d2Features = d2.getColumnLabels(); 
    } else { 
     d2Features = d2.getRowLabels(); 
    } 
    d1Features.trimToSize(); 
    d2Features.trimToSize(); 

    ArrayList<String> mergedFeatures = new ArrayList<String>(); 
    if ((d1.getLabelLabel() != "") && (d1.getLabelLabel() == "")) { 
     mergedFeatures.add(d1.getLabelLabel()); 
    } 
    else if ((d1.getLabelLabel() == "") && (d1.getLabelLabel() != "")) { 
     mergedFeatures.add(d2.getLabelLabel()); 
    } else { 
     mergedFeatures.add(d1.getLabelLabel()); 
    } 

    mergedFeatures.addAll(d1Features); 
    mergedFeatures.addAll(d2Features); 
    mergedFeatures.trimToSize(); 
    mergedData.add(mergedFeatures); 

    for (String key : d1Map.keySet()){ 
     ArrayList<String> curRow = new ArrayList<String>(); 
     if (d2Map.containsKey(key)){ 
      curRow.add(key); 
      curRow.addAll(d1Map.get(key)); 
      curRow.addAll(d2Map.get(key)); 
      curRow.trimToSize(); 
      mergedData.add(curRow); 
     } 
    } 
    mergedData.trimToSize(); 
    Data result = new Data(mergedData, true); 
    return result; 
} 

}

下面是Data type对象及其关联的HashMap生成函数，其中包含一些行和列标签提取方法。

import java.util.*; 
import java.io.*; 

/**Represents an unlabeled or labeled data set as a series of nested  ArrayLists, where each nested 
* ArrayList represents a line of the input data.*/ 

public class Data { 
private ArrayList<String> colLabels = new ArrayList<String>(); //row labels 

private ArrayList<String> rowLabels = new ArrayList<String>(); //column labels 

private String labelLabel; 

private ArrayList<ArrayList<String>> unlabeledData; //data without row and column labels 



/**Returns an ArrayList of ArrayLists, where each nested ArrayList represents a line 
*of the input file.*/ 
@SuppressWarnings("resource") 
private static ArrayList<ArrayList<String>> readFile(String filePath, String fileSep){ 
    ArrayList<ArrayList<String>> result = new ArrayList<ArrayList<String>>(); 
    try{ 
     BufferedReader input = new BufferedReader(new FileReader (filePath)); 
     String line = input.readLine(); 
     while (line != null){ 
      String[] splitLine = line.split(fileSep); 
      result.add(new ArrayList<String>(Arrays.asList(splitLine))); 
      line = input.readLine(); 
     } 
    } 
    catch (Exception e){ 
     System.err.println(e); 
    } 
    result.trimToSize();; 
    return result; 
} 


/**Returns an ArrayList of ArrayLists, where each nested ArrayList represents a line of the input 
* data but WITHOUT any row or column labels*/ 


private ArrayList<ArrayList<String>> extractLabelsAndData(String filePath, String fileSep){ 
    ArrayList<ArrayList<String>> tempData = new ArrayList<ArrayList<String>>(); 
    tempData.addAll(readFile(filePath, fileSep)); 
    tempData.trimToSize(); 
    this.colLabels.addAll(tempData.remove(0)); 
    this.labelLabel = this.colLabels.remove(0); 
    this.colLabels.trimToSize(); 
    for (ArrayList<String> line : tempData){ 
     this.rowLabels.add(line.remove(0)); 
    } 
    this.rowLabels.trimToSize(); 
    return tempData; 
} 




/**Returns an ArrayList of ArrayLists, where each nested ArrayList represents a line of the input 
* data but WITHOUT any row or column labels. Does mutate the original data*/ 
private ArrayList<ArrayList<String>> extractLabelsAndData (ArrayList<ArrayList<String>> data){ 
    ArrayList<ArrayList<String>> result = new ArrayList<ArrayList<String>>(); 
    for (ArrayList<String> line : data){ 
     ArrayList<String> temp = new ArrayList<String>(); 
     for (String element : line){ 
      temp.add(element); 
     } 
     temp.trimToSize(); 
     result.add(temp); 
    } 
    this.colLabels.addAll(result.remove(0)); 
    this.labelLabel = this.colLabels.remove(0); 
    this.colLabels.trimToSize(); 
    for (ArrayList<String> line : result){ 
     this.rowLabels.add(line.remove(0)); 
    } 
    this.rowLabels.trimToSize(); 
    result.trimToSize(); 
    return result; 
} 


/**Returns the labelLabel for the data*/ 
public String getLabelLabel(){ 
    return this.labelLabel; 
} 


/**Returns an ArrayList of the labels while maintaining the order 
* in which they appear in the data. Row indicates that the desired 
* features are all in the same row. Assumed that the labels are in the 
* first row of the data. */ 
public ArrayList<String> getColumnLabels(){ 
    return this.colLabels; 
} 


/**Returns an ArrayList of the labels while maintaining the order 
* in which they appear in the data. Column indicates that the desired 
* features are all in the same column. Assumed that the labels are in the 
* first column of the data.*/ 
public ArrayList<String> getRowLabels(){ 
    return this.rowLabels; 
} 


/**Creates a HashMap where a list of feature labels are mapped to the entire data. For example, 
* if a data set contains patient IDs and test results, this function can be used to create 
* a HashMap where the keys are the patient IDs and the values are an ArrayList of the test 
* results. The boolean input isRow, which, when true, designates that the 
* desired keys are listed in the rows or false if they are in the columns.*/ 
public HashMap<String, ArrayList<String>> mapFeatureToData(boolean isRow){ 
    HashMap<String, ArrayList<String>> featureMap = new HashMap<String,ArrayList<String>>(); 
    if (!isRow){ 
     for (ArrayList<String> line : this.unlabeledData){ 
      for (int i = 0; i < this.colLabels.size(); i++){ 
       if (featureMap.containsKey(this.colLabels.get(i))){ 
        featureMap.get(this.colLabels.get(i)).add(line.get(i)); 
       } else{ 
        ArrayList<String> firstValue = new ArrayList<String>(); 
        firstValue.add(line.get(i)); 
        featureMap.put(this.colLabels.get(i), firstValue); 
       } 
      } 
     } 
    } else { 
     for (int i = 0; i < this.rowLabels.size(); i++){ 
      if (!featureMap.containsKey(this.rowLabels.get(i))){ 
       featureMap.put(this.rowLabels.get(i), this.unlabeledData.get(i)); 
      } else { 
       featureMap.get(this.rowLabels.get(i)).addAll(this.unlabeledData.get(i)); 
      } 
     } 
    } 
    return featureMap; 
} 


/**Writes the data to a file in the specified outputPath. sep indicates the data delimiter. 
* labeledOutput indicates whether or not the user wants the data written to a file to be 
* labeled or unlabeled. If the data was unlabeled to begin with, then labeledOutput 
* should not be set to true. */ 
public void writeDataToFile(String outputPath, String sep){ 
    try { 
     PrintStream writer = new PrintStream(new BufferedOutputStream (new FileOutputStream (outputPath, true))); 
     String sol = this.labelLabel + sep; 
     for (int n = 0; n < this.colLabels.size(); n++){ 
      if (n == this.colLabels.size()-1){ 
       sol += this.colLabels.get(n) + "\n"; 
      } else { 
       sol += this.colLabels.get(n) + sep; 
      } 
     } 
     for (int i = 0; i < this.unlabeledData.size(); i++){ 
      ArrayList<String> line = this.unlabeledData.get(i); 
      sol += this.rowLabels.get(i) + sep; 
      for (int j = 0; j < line.size(); j++){ 
       if (j == line.size()-1){ 
        sol += line.get(j); 
       } else { 
        sol += line.get(j) + sep; 
       } 
      } 
      sol += "\n"; 
     } 
     sol = sol.trim(); 
     writer.print(sol); 
     writer.close(); 

    } catch (Exception e){ 
     System.err.println(e); 
    } 
} 


/**Constructor for Data object. filePath specifies the input file directory, 
* fileSep indicates the file separator used in the input file, and hasLabels 
* designates whether the input data has row and column labels. Note that if 
* hasLabels is set to true, it is assumed that there are BOTH row and column labels*/ 
public Data(String filePath, String fileSep, boolean hasLabels){ 
    if (hasLabels){ 
     this.unlabeledData = extractLabelsAndData(filePath, fileSep); 
     this.unlabeledData.trimToSize(); 
    } else { 
     this.unlabeledData = readFile(filePath, fileSep); 
     this.unlabeledData.trimToSize(); 
    } 

} 


/**Constructor for Data object that accepts nested ArrayLists as inputs*/ 
public Data (ArrayList<ArrayList<String>> data, boolean hasLabels){ 
    if (hasLabels){ 
     this.unlabeledData = extractLabelsAndData(data); 
     this.unlabeledData.trimToSize(); 
    } else { 
     this.unlabeledData = data; 
     this.unlabeledData.trimToSize(); 
    } 
} 
}

该程序适用于小数据集，但已超过5天，合并仍未完成。我正在寻找更有效的时间和内存解决方案。有人建议使用字节数组而不是字符串，这可能会使其运行速度更快。任何人有任何建议？

编辑：我在我的代码中进行了一些挖掘，发现读取输入文件并合并它们几乎没有时间（如20秒）。写入文件是需要5天以上的部分

来源

2016-05-25 Chandra_Rathnam

不是你问的，但如果我合并csv文件，Java将是我最后的解决方案。:)我会留下更多有见识的人来回答有关Java效率的问题。 –

我完全同意。我可以在R中轻松完成此任务，但我需要使用Java –

您可以使用从JRI调用的R库吗？或者任何通过Hibernate和朋友调用的SQL风格？ –

您将您所有数百万行数据的所有数据字段连接成一个巨大的字符串，然后编写该单个字符串。当你分配和重新分配极大的字符串时，内存抖动会导致内存缓慢死亡，一遍又一遍地将它们复制到的每个字段，并将分隔符添加到字符串中。第3天或第4天左右，每个字符串是......数百万字符长？ ...和你可怜的垃圾收集器出汗，把它拿出来。

不要这样做。

分别构建输出文件的每一行并写入它。然后建立下一行。

此外，使用StringBuilder类来构建线条，尽管前面的步骤可能会得到如此改进，您甚至可能无法理解这一点。虽然这是做到这一点的方法，你应该学习如何。

来源

2016-05-25 00:43:54 davidbak

感谢您的评论。我明白你的意思，写出每个文件绝对比写入所有文件更好。我会做出改变并让你知道！感谢您的帮助 –

不完全是。正如您发现的那样，您当然可以从两个文件中读取所有数据，甚至可以进行合并。但是_一次构建输出文件的一行（一个'patient_id'的数据），并写入它。然后扔掉那条线（当然是隐含的）然后再做一次。然后再次。对于每个'patient_id'。然后，你建立的这些字符串每个不超过100-120个字符，而且你不会把它们放在比写它们更长的时间。没有记忆压力。 – davidbak

对不起！我的意思是“写每一行”而不是“写每个文件”。我绝对明白你的意思。但是，为了理解，为什么试图编写连接的输出字符串需要比编写每行更长的时间？似乎基本上正在写入相同数量的数据。这是否仅仅是因为发送了大量的文本扼杀了内存，使得写入过程的内存减少了？ –

通过普通标签有效合并java中的2个大型csv文件

回答

相关问题