2010-11-20 51 views
0

我确信这个问题可以相对容易地解决,但我正在努力寻找问题。 我的代码只是读取文件中的所有单词,然后将每个单词,单词位置,句子的开始和结束存储在一个数组中。数组输出到另一个文本文件。从文件读取时遇到问题。似乎过早达到EOF

我可以阅读所有信息,直到最后一句,然后我有一个错误。有什么想法吗?

/** 
* Programmer: fryeguy 
* Course: 
* Program: TxtCrawl for MicroSearch 
* 
* Algorithm: 
* TxtCrawl is the component of MicroSearch that reads text 
* documents for search terms and stores them for 
* indexing 
* 
* 1. Count words in doc, then initialize 
*  wordsFromDoc array to wordCount 
* 2. Initiate output file for writing. 
* 3. Open input file for reading words. 
* 4. Until reaching EOF: 
*  4.a. Set value for start "get pointer" in startSentence (.tellg()). 
*  4.b. Store value for end "get pointer" in endSentence (.tellg()). 
*  4.c. Reset "get pointer" to startSentence location. 
*  4.d. Until reaching endSentence, Read into the 
*   array theWord, wordPos, startSent, and endSent 
* 5. Write wordsFromDoc array to file 
* 6. When EOF is reached close the files. 
*/ 

#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

using namespace std; 

struct wordProps  // one indexed word plus the bounds of the sentence it came from
{
    std::string theWord;   // the word as extracted from the document
    int         wordPos;   // stream offset recorded for the word
    int         startSent; // stream offset where the enclosing sentence starts
    int         endSent;   // stream offset where the enclosing sentence ends

    // Start every record in a known state so a partially filled entry never
    // exposes indeterminate integers (written C++03-style on purpose).
    wordProps() : theWord(), wordPos(0), startSent(0), endSent(0) {}
};

// Counts the characters and the whitespace-separated words of the named file.
// Arguments: file name, character counter, word counter. The counters are
// incremented in place, so the caller is expected to zero them first.
void countWords(string, int&, int&); 

int main() 
{ 

    ifstream iFile; // file stream for reading in data 
    ofstream oFile; // file stream for writing data 

    string iFileName = "TextFile2.txt"; // name of test file to read from 
    string oFileName = "OutputFile.txt"; // name of test file to write to 
    string aLine = "";      // stores a line preceeding a newline character (\n) 
    string aWord = "";      // stores words from doc for indexing 
    int  charCount = 0;     // count of characters in doc 
    int  wordCount = 0;     // count of words in doc 
    int  aLineWordCount = 0;    // count of words in a single line being processed 
    int  wordBegin = 0;     // stores location of word in doc 
    int  startSentence = 0;    // stores pointer value for start of sentence 
    int  endSentence = 0;    // stores pointer value for end of sentence 

    /** 
    * 1. Count words in doc, then initialize 
    * wordsFromDoc array to wordCount 
    */ 
    countWords(iFileName, charCount, wordCount); 
    cout << "charCount: " << charCount << endl; // DEBUG CODE 
    cout << "wordCount: " << wordCount << endl; // DEBUG CODE 
    wordProps wordsFromDoc[wordCount]; 
    cout<< "length of array: " << (sizeof(wordsFromDoc)/sizeof(*wordsFromDoc)) << endl; // DEBUG CODE 

    /** 
    * 2. Initiate output file for writing 
    */ 
    oFile.open (oFileName.c_str()); // setup output file and write header 
    oFile << setw(20) << left << "File Name: " << iFileName << endl; 
    oFile << setw(20) << "---------------------------------------" << endl << endl; 

    /** 
    * 3. Open input file for reading words 
    */ 
    iFile.open (iFileName.c_str()); 
    if (!iFile.is_open()) 
     cout << "No such file exists!" << endl; 
    else 
    { 
     /** 
     * 4. Until reaching EOF: 
     */ 
     // I have been attempting different counting methods assuming the eof was being reached prematurely 
     // The results really have not varied with this code 
     // while (iFile.tellg() != charCount) 
     while (!iFile.eof()) 
     { 
      //cout << "count: " << count << endl; 
      /** 
      * 4.a. Set value for start "get pointer" in startSentence (.tellg()). 
      */ 
      startSentence = iFile.tellg(); 
      cout << "startSentence: " << startSentence << endl; // DEBUG CODE 

      /** 
      * 4.b. Store value for end "get pointer" in endSentence (.tellg()). 
      */ 
      getline(iFile, aLine, '.'); 
      cout << aLine << endl; // DEBUG CODE 
      endSentence = iFile.tellg(); 
      aLine.clear(); 
      cout << "endSentence: " << endSentence << endl; // DEBUG CODE 

      if (!iFile.is_open()) 
      { 
       cout << "The if, iFile.tellg(): " << iFile.tellg() << endl; // DEBUG CODE 
       iFile.close(); 
       iFile.open (iFileName.c_str()); 
      } 

      /** 
      * 4.c. Reset "get pointer" to startSentence location. 
      */ 
      iFile.seekg(startSentence); 
      cout << "iFile.tellg(): " << iFile.tellg() << endl; // DEBUG CODE 

      /** 
      * 4.d. Until reaching endSentence, Read into the 
      *  array theWord, wordPos, startSent, and endSent 
      */ 

      // As the last line is about to be read there is an error of some sort. 
      // My guess is that somehow I exceed the end of the file but my startSentence 
      // and endSentence variables are pointing where I think they should. 

      for (; iFile.tellg() < endSentence; aLineWordCount++) 
      { 
       wordsFromDoc[aLineWordCount].wordPos = iFile.tellg(); 
       cout << "wordPos: " << wordsFromDoc[aLineWordCount].wordPos << endl; // DEBUG CODE 
       iFile >> wordsFromDoc[aLineWordCount].theWord; 
       cout << "theWord: " << wordsFromDoc[aLineWordCount].theWord << endl; // DEBUG CODE 
       wordsFromDoc[aLineWordCount].startSent = startSentence; 
       cout << "startSent: " << wordsFromDoc[aLineWordCount].startSent << endl; // DEBUG CODE 
       wordsFromDoc[aLineWordCount].endSent = endSentence; 
       cout << "endSent: " << wordsFromDoc[aLineWordCount].endSent << endl << endl; // DEBUG CODE 
       cout << "aLineWordCount: " << aLineWordCount << endl; 
      } // end for 

     } // end while !=iFile.eof 

      // THIS section of code is never reached because of the hang up above. 
      /** 
      * 5. Write wordsFromDoc array to file 
      */ 
      for (int count = 0; count < aLineWordCount; count++) 
      { 
       oFile << setw(20) << left 
       << wordsFromDoc[count].theWord << " " 
       << wordsFromDoc[count].wordPos << " " 
       << wordsFromDoc[count].startSent << " " 
       << wordsFromDoc[count].endSent << endl; 
      } 

    } // end else 

    /** 
    * 6. When EOF is reached close the files. 
    */ 
    iFile.close(); 
    oFile.close(); 

// DEBUG CDODE for verifying results 
// for (int count = 0; count < wordCount; count++) { 
//  cout << "theWord: " << wordsFromDoc[count].theWord << endl; 
//  cout << "wordPos: " << wordsFromDoc[count].wordPos << endl; 
//  cout << "startSent: " << wordsFromDoc[count].startSent << endl; 
//  cout << "endSent: " << wordsFromDoc[count].endSent << endl << endl; 
// } 

} 

/** 
* Implement countWords function 
*/ 
/**
 * Counts the characters and the whitespace-separated words in a file.
 *
 * @param theFileName  path of the file to scan
 * @param charCount    incremented once per character read; not reset here
 * @param wordCount    incremented once per word successfully extracted;
 *                     not reset here
 *
 * If the file cannot be opened, "No such file exists!" is printed to
 * standard output and both counters are left untouched.
 */
void countWords(std::string theFileName, int &charCount, int &wordCount)
{
    // Read-only stream: a read+write fstream would refuse read-only files.
    std::ifstream inFile(theFileName.c_str());
    if (!inFile.is_open())
    {
        std::cout << "No such file exists!" << std::endl;
        return;
    }

    // Pass 1: count the chars. The read itself is the loop condition, so the
    // failed read at EOF is never counted.
    char theChar = ' ';
    while (inFile.get(theChar))
        ++charCount;

    // Rewind for the second pass; the EOF state must be cleared before seeking.
    inFile.clear();
    inFile.seekg(0, std::ios::beg);

    // Pass 2: count the words. Testing the extraction result avoids counting
    // a phantom word when the file is empty or ends in whitespace.
    std::string theWord;
    while (inFile >> theWord)
        ++wordCount;
}
+0

需要看到导致它出错的输入;该程序对任意输入似乎都运行正常。 – frayser 2010-11-21 03:49:29

+0

感谢您的回复。我已经测试了一些不同的文件,并且这段文字会触发预期中(但不希望出现)的错误:“这是一个要读入搜索引擎爬行器的文本样本,我将输入几个句子,包括句点以提供一些休息时间 这条线在两个换行符之后出现 最后一段文字应该做的!”当代码到达最后一个句子时,我读入startSent和endSent值以及wordPos(iFile.tellg()),然后iFile似乎就被释放了。 – fryeguy 2010-11-21 03:58:47

+0

您是指由于句子以感叹号(!)而非句点结尾而导致的失败?该代码严格书写只处理句点结尾的句子。 – frayser 2010-11-21 04:10:20

回答

1

istream

我查过了。istream 的 get 和 getline 都没有能够一次处理多个分隔符的版本[1]。其他人也遇到过同样的问题[2]。逐字符 I/O 是最实用的解决方案。其他解决方案则需要自己编写现有 istream 方法的增强版本。

构想

  1. 阅读完整的文件到内存中一次。
  2. 删除换行符(任何CR或LF)。
  3. 在每个特殊的句末分隔符处把文档拆分成行,并在每行的分隔符之后放置一个统一的标记(LF 或 ETX “\003”),同时把文档写回磁盘。
  4. 现在文档可以照常处理;只是要用这个已知的标记代替句点作为分隔符。
  5. 删除包含重新定界文档的临时文件。

立即读取整个文件不是问题,因为最终反正都在 内存中;将文字全部放在一起的字符串 等于整个文档。一旦重新定界的文件被写入磁盘,内存就可以被释放。

注意

[1] istream::get
[2] 在 getline 中使用多个分隔符(代码大师 CodeGuru 上的讨论)

+0

感谢您之前的评论。我认为,我最终发现的主要问题在于数据的性质:这些文本是从不同网站复制粘贴来的,编码五花八门。我很确定我的代码是在遇到意料之外的字符时出了错。 – fryeguy 2011-01-21 22:35:49

相关问题