倒排索引：在一组文档

我执行的倒排索引结构，特别是一个允许布尔查询和字级粒度查找短语。倒排索引：在一组文档

我有一个大型的文本数据库，并且我保留一个索引，告诉我每个单词，它是哪个文件（IDdoc），以及它在哪个文件中（position）。（一个字可以在许多文件，并在一个文件中的许多地方。）

因此，我保持一个向量每个字：

vector<pair<IDdoc,position>> occurences_of_word;

（矢量由IDdoc，然后按位置排序，升序）。

我有一个string物体由字。这是短语我在找。

对于在短语每个字我想知道哪些文件包含这个词组，因此返回的IDdoc秒的载体。

这是我在一个解决方案的尝试：

typedef std::string  Word_t; 
typedef unsigned int WordPosition_t; 
typedef unsigned int IDdocument_t; 

vector<pair<IDdocument_t,WordPosition_t> > IndiceInvertidoBooleanoConPosicion::_interseccion_dos_listas 
    (const vector<pair<IDdocument_t,WordPosition_t>> & v1, 
    const vector<pair<IDdocument_t,WordPosition_t>> & v2) 
{ 
vector<pair<IDdocument_t,WordPosition_t> > intersection; 

IDdocument_t ID_doc_one, ID_doc_two; 

int i = 0; 
int j = 0; 
const int MAX_INDEX_V1 = v1.size() -1; 
const int MAX_INDEX_V2 = v2.size() -1; 

while(i <= MAX_INDEX_V1 && j <= MAX_INDEX_V2) 
{ 
    ID_doc_one = v1[i].first; 
    ID_doc_two = v2[j].first; 
    if (ID_doc_one < ID_doc_two) 
     i++; 
    else if (ID_doc_one > ID_doc_two) 
     j++; 
    else // The words were found in the same document! 
    { 
     WordPosition_t pos_word_one = v1[i].second; 
     WordPosition_t pos_word_two = v2[j].second; 

     // The words make a phrase! Return pos_two for the next intersection finding step 
     if (pos_word_one + 1 == pos_word_two) 
     { 
      intersection.push_back(make_pair(ID_doc_one,pos_word_two)); 
      i++; 
      j++; 
     } 

     // Phrase not found 
     else 
     { 
      if (pos_word_one < pos_word_two) 
       i++; 
      else 
       j++; 
     } 

    } 
} 

return intersection; 
} 

int find_phrase(const string phrase, vector<IDdocument_t> & id_docs) 
{ 
Word_t word; 
id_docs.clear(); 
Text parsed_phrase; 
// Extract the relevant words from the phrase 
parsed_phrase.parse(phrase); 

vector<pair<IDdocument_t,WordPosition_t> > intersection; 
vector<pair<IDdocument_t,WordPosition_t> > second_vector; 

while (parsed_phrase.get_next_word(word) != RES_END) 
{ 
    _find_vector_words(word,intersection); 

    while (parsed_phrase.get_next_word(word) != RES_END) 
    { 
     _find_vector_words(word,second_vector); 

     intersection = _intersect_two_words(intersection,second_vector); 

    } 
} 

for (unsigned int i = 0; i < intersection.size(); i ++) 
{ 
    IDdocument_t id_doc = intersection[i].first; 
    if(std::find(id_docs.begin(), id_docs.end(), id_doc) == id_docs.end()) 
     id_docs.push_back(id_doc); 
} 

return RES_OK; 
}

来源

2013-06-27 Maria Ines Parnisari

不知道你在问究竟 - 你问如何确定哪些文件包含“A头号飞利浦螺丝刀“，或者哪些文件包含单词”A“，”编号“，”一个“，”philips“或”螺丝刀“。如果前者，他们是否必须是连续的或将“一把螺丝刀的手柄数量是一个飞利浦和pozidrive”是一个匹配？ –

@MatsPetersson，他们需要是连续的。 –

相关：http://stackoverflow.com/questions/2659120/how-to-search-phrase-queries-in-inverted-index-structure – jogojapan

对于从字符串表示查找一个特定的单词，你可能想看看像map。为了创建一个简单的结果联合，你可能需要set。这个实现更像是一个演示而不是一个非常理想的最终实现（c.f.）。草率的短语解析）。

#include <vector> 
#include <map> 
#include <set> 
#include <iostream> 
#include <string> 

typedef std::string IDdoc; 
typedef int position; 

typedef std::pair<IDdoc,position> Occurrence; 
typedef std::vector<Occurrence> OccurrencesOfWord; 
typedef std::map<std::string /*word*/, OccurrencesOfWord> Dictionary; 
typedef std::set<IDdoc> Matches; 

bool findMatchesForPhrase(const std::string& phrase, const Dictionary& dictionary, Matches& matches) 
{ 
    size_t pos = 0; 
    size_t len = 0; 
    while (pos < phrase.length()) { 
     size_t end = phrase.find(' ', pos); 
     size_t len = ((end == phrase.npos) ? phrase.length() : end) - pos; 
     std::string word(phrase, pos, len); 
     pos += len + 1; // to skip the space. 

     // ignore words not in the dictionary. 
     auto dictIt = dictionary.find(word); 
     if (dictIt == dictionary.end()) 
      continue; 

     auto& occurrences = dictIt->second; // shortcut/alias,. 
     for (auto& occurIt : occurrences) { 
      // Add all the IDdoc's of this occurence to the set. 
      matches.insert(occurIt.first); 
     } 
    } 

    return !matches.empty(); 
} 

void addToDictionary(Dictionary& dict, const char* word, const char* doc, int position) 
{ 
    dict[word].push_back(std::make_pair(std::string(doc), position)); 
} 

int main(int argc, const char** argv) 
{ 
    std::string phrase("pizza is life"); 
    Dictionary dict; 

    addToDictionary(dict, "pizza", "book1", 10); 
    addToDictionary(dict, "pizza", "book2", 30); 
    addToDictionary(dict, "life", "book1", 1); 
    addToDictionary(dict, "life", "book3", 1); 
    addToDictionary(dict, "goat", "book4", 99); 

    Matches matches; 
    bool result = findMatchesForPhrase(phrase, dict, matches); 

    std::cout << "result = " << result << std::endl; 
    for (auto& ent : matches) { 
     std::cout << ent << std::endl; 
    } 

    return 0; 
}

在这个在线演示：http://ideone.com/Zlhfua

跟进，以解决您的更改：

while(i < SIZE_VECTOR_ONE && j < SIZE_VECTOR_TWO) 
{ 
    if (ID_doc_one < ID_doc_two) 
    { 
     ID_doc_one = v1[++i].first;

比方说 “SIZE_VECTOR 1” 是1，这意味着，有一个元素在向量中，元素[0]。如果ID_doc_one是0并且ID_doc_two是1，则

if (0 < 1) { 
    ID_doc_one = v1[1].first;

这是无效的。你可能会关闭使用迭代器或指针更好：

while (oneIt != v1.end() && twoIt != v2.end()) { 
    if (oneIt->first < twoIt->first) { 
     ++oneIt; 
     continue; 
    } else if (*twoIt < *oneIt) { 
     ++twoIt; 
     continue; 
    } 
    // same documentId in both lists, snag positions. 
    ... 
}

下，这看起来有点破：

else { 
    } // To avoid "out of range" errors <-- but also ends the "else" 
     if (i < SIZE_VECTOR_ONE - 1) 
      ID_doc_one = v1[++i].first; 
     if (j < SIZE_VECTOR_TWO - 1) 
      ID_doc_two = v2[++j].first; 
    }

我不知道，如果你有相同的文档，但在多个位置会发生什么？

接下来的这位是挑剔的，但我花了很长的时间来解析

WordPosition_t pos_one = v1[i].second; 
    WordPosition_t pos_two = v2[j].second; 

    // The words make a phrase! Return pos_two for the next intersection finding step 
    if (pos_one + 1 == pos_two)

似乎大大清晰的写本，你可能会说“（如果第二个字是在后的位置第一个字）：

WordPosition_t posFirstWord = v1[i].second; 
    WordPosition_t posSecondWord = v2[j].second; 

    // The words make a phrase! Return pos_two for the next intersection finding step 
    if (posSecondWord == posFirstWord + 1)

接下来的这个部分是一种令人困惑的，因为这两个条款似乎是为了增加i和j和更新ID_doc_one和二，它会是有意义的那部分吊到一个共同的在if块之后的部分，但是再次使用else {}很难说你实际上在做什么。

if (pos_one + 1 == pos_two) 
    { 
     intersection.push_back(make_pair(ID_doc_one,pos_two)); 
     ID_doc_one = v1[++i].first; 
     ID_doc_two = v2[++j].first; 
    } 

    else { 
    } // To avoid "out of range" errors 
     if (i < SIZE_VECTOR_ONE - 1) 
      ID_doc_one = v1[++i].first; 
     if (j < SIZE_VECTOR_TWO - 1) 
      ID_doc_two = v2[++j].first; 
    }

当你匹配两个数组，你总是希望增加双方i和j，这不是调理，我也不知道为什么你正在使用pos_two，因为这句话在pos_one居然发现？

这是我怎么会写它：

#include<iostream> 
#include<map> 
#include<vector> 
#include<string> 

typedef std::string   Word_t; 
typedef unsigned int  WordPosition_t; 
typedef unsigned int  IDdocument_t; 

typedef std::pair<IDdocument_t, WordPosition_t> DocumentPosition_t; 
typedef std::vector<DocumentPosition_t> WordReferences_t; 

WordReferences_t _intersect_two_words(const WordReferences_t& v1, const WordReferences_t& v2) 
{ 
    // all the locations where the words occur one after the other. 
    WordReferences_t intersection; 

    auto firstIt = v1.begin(); 
    auto secondIt = v2.begin(); 
    while (firstIt != v1.end() && secondIt != v2.end()) 
    { 
     if (firstIt->first < secondIt->first) 
     { 
      ++firstIt; 
      continue; 
     } 
     // find the second word in the same document and AFTER the first word. 
     if (secondIt->first < firstIt->first || secondIt->second < firstIt->second + 1) 
     { 
      ++secondIt; 
      continue; 
     } 
     // first word wasn't just before the second, it's not a phrase. 
     if (secondIt->second > firstIt->second + 1) 
     { 
      ++firstIt; 
      continue; 
     } 
     // We found a phrase. 
     intersection.emplace_back(*firstIt); 
     ++firstIt; 
     ++secondIt; 
    } 

    return intersection; 
} 

int main() 
{ 
    WordReferences_t v1, v2; 
    v1.push_back(std::make_pair(10, 5)); 
    v1.push_back(std::make_pair(10, 25)); 
    v1.push_back(std::make_pair(11, 10)); 
    v1.push_back(std::make_pair(12, 1)); 
    v1.push_back(std::make_pair(12, 11)); 
    v1.push_back(std::make_pair(12, 21)); 
    v1.push_back(std::make_pair(12, 31)); 
    v1.push_back(std::make_pair(15, 11)); 
    v1.push_back(std::make_pair(100, 1)); 
    v1.push_back(std::make_pair(100, 11)); 
    v1.push_back(std::make_pair(100, 21)); 
    v1.push_back(std::make_pair(101, 11)); 
    v1.push_back(std::make_pair(102, 11)); 
    v1.push_back(std::make_pair(102, 13)); 
    v1.push_back(std::make_pair(102, 14)); 
    v1.push_back(std::make_pair(103, 11)); 
    v1.push_back(std::make_pair(103, 13)); 

    v2.push_back(std::make_pair(10, 11)); 
    v2.push_back(std::make_pair(12, 10)); 
    v2.push_back(std::make_pair(12, 40)); 
    v2.push_back(std::make_pair(16, 11)); 
    v2.push_back(std::make_pair(100, 12)); // match 
    v2.push_back(std::make_pair(101, 12)); // match 
    v2.push_back(std::make_pair(101, 13)); 
    v2.push_back(std::make_pair(101, 14)); 
    v2.push_back(std::make_pair(102, 12)); //match 
    v2.push_back(std::make_pair(103, 1)); 
    v2.push_back(std::make_pair(103, 10)); 
    v2.push_back(std::make_pair(103, 12)); // match 
    v2.push_back(std::make_pair(103, 15)); 

    auto intersection = _intersect_two_words(v1, v2); 
    for (auto entry : intersection) 
    { 
     std::cout << entry.first << ", " << entry.second << "+" << (entry.second + 1) << std::endl; 
    } 

    return 0; 
}

活生生的例子：http://ideone.com/XRfhAI

来源

2013-06-28 02:54:00 kfsone

嘿，你介意看看我原来的帖子吗？我发布了我的解决方案。谢谢！ –

看到我的修改回复。 – kfsone

谢谢@kfsone！我用我的新版代码更新了我的帖子。 –