2014-03-13 52 views
2

我有一个txt文件的语料库,我想要计算它们的Tfidf值。 我想我必须将文件标记为文字作为第一步,然后计算重量。 我需要的这个程序的输出是一个矩阵,其中行将是文件,列将是单词,并且重新生成矩阵单元格将是Tfidf值。在Java中的Tfidf计算和矩阵存储

我卡在矩阵部分。 这里是我的尝试

import java.io.BufferedReader; 
import java.io.File; 
import java.io.FileNotFoundException; 
import java.io.FileReader; 
import java.io.IOException; 
import java.util.ArrayList; 
import java.util.List; 


public class DocumentParser { 

//This variable will hold all terms of each document in an array. 
private List<String[]> termsDocsArray = new ArrayList<String[]>(); 
private List<String> allTerms = new ArrayList<String>(); //to hold all terms 
private List<double[]> tfidfDocsVector = new ArrayList<double[]>(); 
private List fileNameList = new ArrayList(); 
private File[] allfiles; 
private StringBuilder sb; 
private BufferedReader in = null; 

/** 
* Method to read files and store in array. 
* @param filePath : source file path 
* @throws FileNotFoundException 
* @throws IOException 
*/ 

public void parseFiles(String filePath) throws FileNotFoundException, IOException { 
    allfiles = new File(filePath).listFiles(); 
    for (File f : allfiles) { 
     if (f.getName().endsWith(".txt")) { 
      fileNameList.add(f.getName()); 
      in = new BufferedReader(new FileReader(f)); 
      sb = new StringBuilder(); 
      String s = null; 
      while ((s = in.readLine()) != null) { 
       sb.append(s); 
      } 
      String[] tokenizedTerms = sb.toString().replaceAll("[\\W&&[^\\s]]", "").split("\\W+"); //to get individual terms 
      for (String term : tokenizedTerms) { 
       if (!allTerms.contains(term)) { //avoid duplicate entry 
        allTerms.add(term); 
       } 
      } 
      termsDocsArray.add(tokenizedTerms); 
     } 
    } 

} 

/** 
* Method to create termVector according to its tfidf score. 
* @return 
*/ 
public double tfIdfCalculator(String file, String word) { 
    double tf; //term frequency 
    double idf; //inverse document frequency 
    double tfidf = 0; //term requency inverse document frequency   
    for (String[] docTermsArray : termsDocsArray) { 
     double[] tfidfvectors = new double[allTerms.size()]; 
     int count = 0; 
     for (String terms : allTerms) { 
      tf = new TfIdf().tfCalculator(docTermsArray, terms); 
      idf = new TfIdf().idfCalculator(termsDocsArray, terms); 
      tfidf = tf * idf; 
      System.out.println(terms+"\t" + tfidf); 
      tfidfvectors[count] = tfidf; 
      count++; 


     } 
     tfidfDocsVector.add(tfidfvectors); //storing document vectors;    
    } 

    return tfidf; 
} 


public void TfIdfMatrix() throws IOException { 

    int r=allTerms.size(); 
    int c=tfidfDocsVector.size(); 

    String mat[][]= new String [r][c]; 

    int rNumber=0; 

    for (int i = 0; i < fileNameList.size(); i++) { 

     rNumber++; 

     mat[rNumber][0]=(String) fileNameList.get(i); 

     } 

    String s; 
    while ((s = in.readLine()) != null) { 

     rNumber++; 

     mat[0][rNumber]=s; 

    } 


     //System.out.print(mat); 

    for (int row = 1; row <= rNumber; row++){ 
     for (int col = 1; col <= rNumber; col++){ 
      double ifidfValue=tfIdfCalculator(mat[0][col], mat[row][0]); 
      mat[row][col]=Double.toString(ifidfValue); 

       System.out.print(mat[row][col]); 
     } 
     }  
} 

}

请帮助!

+0

可能会更清楚地说明确切的问题。什么是你需要帮助的矩阵。 –

回答

0

这是我正在使用的代码示例。

public void tfIdfCalculator() { 
      double tf; //term frequency 
      double idf; //inverse document frequency 
      double tfidf; //term requency inverse document frequency 

      for (String[] docTermsArray : termsDocsArray) { 
       double[] tfidfvectors = new double[allTerms.size()]; 
       int count = 0; 
       for (String terms : allTerms) { 
        tf = new TfIdf().tfCalculator(docTermsArray, terms); 
        idf = new TfIdf().idfCalculator(termsDocsArray, terms); 
        tfidf = tf * idf; 
        tfidfvectors[count] = tfidf; 
        count++; 
       } 
       tfidfDocsVector.add(tfidfvectors); //storing document vectors;    
      } 
     } 
+0

请在提供代码之前添加您自己的描述。 – erhun