2014-09-19 44 views
1

使用NLTK查找二元组(bigram)的频率和PMI分数(Python 3)。现在,我有这样一段代码:

import nltk

# Read the whole corpus and split it into word tokens.
with open('BigramCounter.txt', encoding='utf-8') as wordfile:
    text = wordfile.read()
words = nltk.word_tokenize(text)
bigram_measures = nltk.collocations.BigramAssocMeasures()
# Materialise the bigrams so they can be iterated more than once.
bgs = list(nltk.bigrams(words))
fdist = nltk.FreqDist(bgs)
# Association scores come from a collocation finder, not from the raw
# bigram sequence: the finder holds the unigram and bigram counts that the
# PMI measure needs.
finder = nltk.collocations.BigramCollocationFinder.from_words(words)
pmi = finder.score_ngrams(bigram_measures.pmi)

现在我可以得到该文件中每个二元组(bigram)的频率,也可以单独得到这些二元组的PMI分数,但我不知道如何把两者结合起来,让NLTK在生成二元组的同时给出它的频率和PMI分数。有其他人遇到过这个问题吗?谢谢!

回答

0

试试这个:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

""" 
This is a Multi-Word Expression (MWE) extractor from the "Terminator" project, 
see https://github.com/alvations/Terminator. 

Here's some legalese: 

############################################################################## 
Terminator is copyrighted under MIT License by alvations. 

Copyright (c) 2013-2014 Liling Tan (@alvations) 

Permission is hereby granted, free of charge, to any person obtaining a 
copy of this software and associated documentation files (the "Software"), 
to deal in the Software without restriction, including without limitation the 
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell copies of the Software, and to permit persons to whom the Software is 
furnished to do so, subject to the following conditions: 

Please cite the following when using part-of or the full code/software: 

    Liling Tan. 2013. Terminator - Terminology Extraction to Improve 
    Machine Translation [Software]. Available from 
    https://github.com/alvations/Terminator. 

The above copyright notice and this permission notice shall be included in all 
copies or substantial portions of the Software. 

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
THE SOFTWARE. 

""" 

# Authorship Info. 
__author__ = "Liling Tan (aka @alvations)" 
__copyright__ = "(c) Copyright 2013" 
__license__ = "MIT" 
__date__ = "20 Dec 2013" 
__version__ = "0.1" 
__maintainer__ = "Liling Tan" 
__email__ = "[email protected]" 
__status__ = "pre-development" 

import codecs, math, os 
from collections import Counter 
import cPickle as pickle 

def ngram(text, n=2):
    """Return the word n-grams of a whitespace-separated string.

    text -- the string to split on whitespace.
    n    -- the n-gram order (default 2).

    For n == 1 the result is the list of tokens; otherwise it is a list of
    n-tuples of consecutive tokens. A list is returned in both cases so the
    result can be measured with len() and iterated more than once on
    Python 2 and Python 3 alike. Texts with fewer than n tokens give [].
    """
    # Split once and reuse the tokens for every shifted view.
    tokens = text.split()
    if n == 1:
        return tokens
    return list(zip(*[tokens[i:] for i in range(n)]))

def pmi(word1, word2, unigram_freq, bigram_freq):
    """Return the pointwise mutual information (log base 2) of a bigram.

    word1, word2 -- the two words of the bigram, in order.
    unigram_freq -- mapping from word to its count.
    bigram_freq  -- mapping from "word1 word2" (space-joined) to its count.

    Returns 0 when the score is undefined: either table is empty, or one of
    the words or the bigram itself has no recorded count (out-of-vocabulary).
    """
    total_unigrams = float(sum(unigram_freq.values()))
    total_bigrams = float(sum(bigram_freq.values()))
    if not total_unigrams or not total_bigrams:
        return 0

    # .get() keeps plain dicts from raising KeyError and keeps defaultdicts
    # from growing a new entry for every unseen word.
    prob_word1 = unigram_freq.get(word1, 0) / total_unigrams
    prob_word2 = unigram_freq.get(word2, 0) / total_unigrams
    prob_word1_word2 = bigram_freq.get(" ".join([word1, word2]), 0) / total_bigrams

    independent = prob_word1 * prob_word2
    if prob_word1_word2 <= 0 or independent <= 0:
        # Out-of-vocabulary word or unseen bigram: the log is undefined.
        return 0
    return math.log(prob_word1_word2 / independent, 2)

def phi2(word1, word2, unigram_freq, bigram_freq):
    """Return log2 of the phi-squared association score of a bigram.

    word1, word2 -- the two words of the bigram, in order.
    unigram_freq -- unused; kept so existing call sites keep working.
    bigram_freq  -- mapping from "word1 word2" (space-joined) to its count.

    The score is built from the 2x2 contingency table of the bigram:
        n11 -- count of (word1, word2)
        n12 -- count of (word1, anything but word2)
        n21 -- count of (anything but word1, word2)
        n22 -- count of every other bigram
    so that the four cells add up to the total number of bigrams.

    Returns float("-inf") when phi-squared is zero or undefined (a marginal
    of the table is zero), which is the case for unseen words.
    """
    n11 = n12 = n21 = 0
    total = 0
    for bigram, freq in bigram_freq.items():
        total += freq
        parts = bigram.split()
        if len(parts) != 2:
            # Not a well-formed bigram key; it only contributes to n22.
            continue
        first_matches = parts[0] == word1
        second_matches = parts[1] == word2
        if first_matches and second_matches:
            n11 += freq
        elif first_matches:
            n12 += freq
        elif second_matches:
            n21 += freq
    n22 = total - n11 - n12 - n21

    # Row and column marginals of the contingency table.
    n1p = n11 + n12
    n2p = n21 + n22
    np1 = n11 + n21
    np2 = n12 + n22

    numerator = (n11 * n22 - n21 * n12) ** 2
    denominator = float(n1p * np1 * np2 * n2p)
    if not numerator or not denominator:
        return float("-inf")
    return math.log(numerator / denominator, 2)

def llr(word1, word2, unigram_freq):
    """Return log2 of the product of the two words' unigram counts.

    word1, word2 -- the words to look up.
    unigram_freq -- mapping from word to its count.

    NOTE(review): despite the name, no likelihood ratio is computed here;
    only the two unigram counts are used. Confirm the intended formula.
    """
    count_word1 = unigram_freq[word1]
    count_word2 = unigram_freq[word2]
    return math.log(count_word1 * count_word2, 2)

def load_ngramfreq_pickle(filename):
    """Load n-gram counts from a pickle cache, building the cache if absent.

    filename -- cache path of the form "<corpus>-<n>gram.pk". The part
                before the last hyphen is the corpus file to count from and
                the first character after it is the n-gram order.

    Returns the unpickled object when the cache exists. Otherwise returns a
    Counter of lower-cased n-grams (space-joined strings for n > 1, single
    tokens for n == 1) and writes it to `filename`.
    """
    if os.path.exists(filename):
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that this script produced itself.
        with open(filename, 'rb') as cached:
            return pickle.load(cached)

    # Split on the LAST hyphen so corpus paths that contain hyphens
    # themselves are still parsed correctly.
    infile, suffix = filename.rsplit("-", 1)
    n = int(suffix[0])
    ngram_freq = Counter()
    with codecs.open(infile, 'r', 'utf8') as fin:
        for line in fin:
            line = line.lower()
            if n > 1:
                ngram_freq.update(" ".join(j) for j in ngram(line, n))
            else:
                ngram_freq.update(ngram(line, n))
    with open(filename, 'wb') as cache_out:
        pickle.dump(ngram_freq, cache_out)
    return ngram_freq

def load_ngramfreq(srcfile, trgfile):
    """Load unigram and bigram counts for the source and target corpora.

    srcfile, trgfile -- paths of the source and target corpus files.

    Returns a 4-tuple in the order (source unigrams, source bigrams,
    target unigrams, target bigrams), each obtained from
    load_ngramfreq_pickle() using the cache name "<corpus>-<n>gram.pk".
    """
    return tuple(
        load_ngramfreq_pickle(corpus + suffix)
        for corpus in (srcfile, trgfile)
        for suffix in ("-1gram.pk", "-2gram.pk")
    )

def load_precalculated_pmi(srcfile, trgfile):
    """Load the cached PMI scores for a corpus pair.

    srcfile, trgfile -- corpus paths; the cache file is named
                        "<srcfile>_<trgfile>_pmi.pk".

    Returns the unpickled object, or an empty dict when no cache file
    exists.
    """
    filename = srcfile + "_" + trgfile + "_pmi.pk"
    if not os.path.exists(filename):
        return {}
    # NOTE: unpickling can execute arbitrary code; only load cache files
    # that come from a trusted source.
    with open(filename, 'rb') as cached:
        return pickle.load(cached)

def extract_mwe(sentence, unigramfreq, bigramfreq, precal_pmi, threshold=10):
    """Return the bigrams of `sentence` whose PMI exceeds `threshold`.

    sentence    -- whitespace-separated text.
    unigramfreq -- mapping from word to count, passed on to pmi().
    bigramfreq  -- mapping from "w1 w2" to count, passed on to pmi().
    precal_pmi  -- dict used as a score cache keyed by "w1 w2"; scores
                   computed here are stored in it (the dict is mutated).
    threshold   -- a bigram is kept only when its score is strictly greater.

    Returns the selected lower-cased bigrams joined into one
    space-separated string (empty when nothing qualifies).
    """
    mwes = []
    for first, second in ngram(sentence, 2):
        first, second = first.lower(), second.lower()
        bigram = first + " " + second
        if bigram in precal_pmi:
            score = precal_pmi[bigram]
        else:
            # Score the two WORDS; indexing the joined string would pass
            # its first two characters instead.
            score = pmi(first, second, unigramfreq, bigramfreq)
            precal_pmi[bigram] = score
        if score > threshold:
            mwes.append(bigram)
    return " ".join(mwes)

def main(srcfile, trgfile):
    """Extract multi-word expressions from a line-aligned corpus pair.

    srcfile, trgfile -- paths of the UTF-8 source and target corpora; they
                        are read in parallel, line by line.

    For every line pair that passes the filter below, one line
    "<source MWEs>\t<target MWEs>" is written to 'mwe_pmi.de-en'.
    """
    src_unigram, src_bigram, trg_unigram, trg_bigram = \
        load_ngramfreq(srcfile, trgfile)

    # NOTE(review): this cache is shared by both languages and is never
    # written back to disk here - confirm whether that is intended.
    precal_pmi = load_precalculated_pmi(srcfile, trgfile)

    # The with-statement closes all three files, so the output is flushed
    # even if processing stops early.
    with codecs.open('mwe_pmi.de-en', 'w', 'utf8') as fout, \
            codecs.open(srcfile, 'r', 'utf8') as srcfin, \
            codecs.open(trgfile, 'r', 'utf8') as trgfin:
        for src, trg in zip(srcfin, trgfin):
            src_mwe = extract_mwe(src.strip().lower(), src_unigram,
                                  src_bigram, precal_pmi)
            trg_mwe = extract_mwe(trg.strip().lower(), trg_unigram,
                                  trg_bigram, precal_pmi)
            # NOTE(review): both values are strings, so this compares
            # character lengths, not the number of extracted expressions -
            # confirm the intended filter.
            if src_mwe and len(src_mwe) == len(trg_mwe):
                # extract_mwe() already returns space-joined strings;
                # joining them again would put a space between every
                # character.
                fout.write(src_mwe + "\t" + trg_mwe + "\n")

if __name__ == '__main__':
    import sys
    # Two corpus paths are required; argv[0] is the script name itself, so
    # a valid invocation has at least three entries.
    if len(sys.argv) < 3:
        sys.stderr.write('Usage: python %s srcfile trgfile \n' % sys.argv[0])
        sys.exit(1)
    main(sys.argv[1], sys.argv[2])
1

您可以使用以下代码提取二元组(bigram)及其频率,或者获取某个特定二元组的PMI分数:

#!/usr/bin/env python 
# -*- coding: utf-8 -*- 

import math 
import nltk 
from collections import defaultdict 

def generateUnigramsInMovie(Tokens,freqThreshold):
    """Count the tokens and keep those occurring more than freqThreshold times.

    Tokens        -- iterable of word tokens.
    freqThreshold -- a token is kept only when its count is strictly
                     greater than this value.

    Returns a defaultdict(int) mapping each kept token to its count, filled
    in ascending (count, token) order.
    """
    unigrams_in_movie = defaultdict(int)
    fdistUnigrams = nltk.FreqDist(Tokens)
    # items() plus an index-based key works on Python 2 and 3; iteritems()
    # and tuple-unpacking lambda parameters are Python 2 only.
    for unigram, freq in sorted(fdistUnigrams.items(),
                                key=lambda item: (item[1], item[0])):
        if freq > freqThreshold:
            unigrams_in_movie[unigram] = freq
    return unigrams_in_movie

def generateBigramsInMovie(Tokens,freqThreshold):
    """Map each bigram of Tokens to its frequency.

    Tokens        -- sequence of word tokens.
    freqThreshold -- value handed to the collocation finder's
                     apply_freq_filter() before the counts are read.

    Returns a defaultdict(int) keyed by the space-joined bigram
    ("word1 word2") with the bigram's count as value.
    """
    finder = nltk.collocations.BigramCollocationFinder.from_words(Tokens)
    finder.apply_freq_filter(freqThreshold)
    return defaultdict(
        int,
        ((" ".join([pair[0], pair[1]]), count)
         for pair, count in finder.ngram_fd.items())
    )


#This method is copied from the code given by "alvas" 
#Taken from this project: Multi-Word Expression (MWE) extractor from the "Terminator" project 
#Liling Tan. 2013. Terminator - Terminology Extraction to Improve 
#Machine Translation [Software]. Available from 
#https://github.com/alvations/Terminator. 

def pmi(word1, word2, unigram_freq, bigram_freq):
    """Return the pointwise mutual information (log base 2) of a bigram.

    word1, word2 -- the two words of the bigram, in order.
    unigram_freq -- mapping from word to its count.
    bigram_freq  -- mapping from "word1 word2" (space-joined) to its count.

    Returns 0 when the score is undefined: either table is empty, or one of
    the words or the bigram itself has no recorded count (out-of-vocabulary).
    """
    total_unigrams = float(sum(unigram_freq.values()))
    total_bigrams = float(sum(bigram_freq.values()))
    if not total_unigrams or not total_bigrams:
        return 0

    # .get() avoids inserting a new zero entry into a defaultdict for every
    # unseen word, and avoids KeyError on plain dicts.
    prob_word1 = unigram_freq.get(word1, 0) / total_unigrams
    prob_word2 = unigram_freq.get(word2, 0) / total_unigrams
    prob_word1_word2 = bigram_freq.get(" ".join([word1, word2]), 0) / total_bigrams

    independent = prob_word1 * prob_word2
    if prob_word1_word2 <= 0 or independent <= 0:
        # Out-of-vocabulary word or unseen bigram: the log is undefined.
        return 0
    return math.log(prob_word1_word2 / independent, 2)



# Read the corpus and split it into word tokens.
with open('Text.txt') as wordfile:
    text = wordfile.read()
Tokens = nltk.word_tokenize(text)

unigrams_in_movie = generateUnigramsInMovie(Tokens, 1)
bigrams_in_movie = generateBigramsInMovie(Tokens, 1)

# Rank the bigrams by PMI and keep the 50 best.
b = nltk.collocations.BigramCollocationFinder.from_words(Tokens)
b.apply_freq_filter(1)
bigram_measures = nltk.collocations.BigramAssocMeasures()
bestBigrams = b.nbest(bigram_measures.pmi, 50)

# Print each of the best bigrams together with its frequency.
# print(...) with a single argument behaves the same on Python 2 and 3.
for best in bestBigrams:
    bigram = " ".join([best[0], best[1]])
    bigramFreq = bigrams_in_movie[bigram]
    print(str(bigram) + " " + str(bigramFreq))

# PMI score of one particular bigram, computed with pmi() (the function
# copied from the code given by "alvas"). The top-ranked bigram serves as
# the example; replace word1 and word2 with any pair of interest.
if bestBigrams:
    word1, word2 = bestBigrams[0]
    print(pmi(word1, word2, unigrams_in_movie, bigrams_in_movie))

希望这对你有帮助。祝好!