-1
我写的垃圾邮件分类代码运行得很好,但每次我尝试对单词做词干提取(stem)/词形还原(lemmatize)时,都会得到这个错误:Python NLTK 报 "'ascii' codec can't decode byte 0xc2"。
File "/Users/Ramit/Desktop/Bayes1/src/filter.py", line 16, in trim_word
    word = ps.stem(word)
File "/Library/Python/2.7/site-packages/nltk/stem/porter.py", line 664, in stem
    stem = self._step1a(stem)
File "/Library/Python/2.7/site-packages/nltk/stem/porter.py", line 289, in _step1a
if word.endswith('ies') and len(word) == 4:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 0: ordinal not in range(128)
这里是我的代码:
from word import Word
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Module-level Porter stemmer, shared by Filter.trim_word().
ps = PorterStemmer()
class Filter():
    """Naive-Bayes-style SMS spam filter.

    Trains per-word ham/spam statistics from a tab-separated training
    file and classifies messages by combining the probabilities of the
    most "interesting" words.
    """

    def __init__(self):
        # Maps word text -> Word object holding ham/spam counters.
        self.words = dict()

    def trim_word(self, word):
        """Normalize a token: strip punctuation, lowercase, and stem it.

        I deliberately do not remove all non-alphabetic characters.

        BUG FIX: PorterStemmer.stem() implicitly coerces its argument to
        unicode with the ascii codec, so a raw byte string containing a
        non-ASCII byte (e.g. 0xc2 from UTF-8 input) raised
        UnicodeDecodeError. Decode byte strings to unicode up front;
        undecodable bytes are dropped rather than crashing.
        """
        if isinstance(word, bytes):
            word = word.decode('utf-8', 'ignore')
        word = word.strip(' .:,-!()"?+<>*')
        word = word.lower()
        word = ps.stem(word)
        return word

    def train(self, train_file):
        """Build per-word ham/spam counts and probabilities.

        Expects every odd line of ``train_file`` to be
        ``category<TAB>message`` where category is "ham" or "spam".
        """
        lineNumber = 1
        ham_words = 0
        spam_words = 0
        # Build the stopword set once, outside the line loop.
        stop = set(stopwords.words('english'))
        # Loop through all the lines
        for line in train_file:
            if lineNumber % 2 != 0:
                line = line.split('\t')
                category = line[0]
                input_words = line[1].strip().split(' ')
                # Loop through all the words in the line, remove some characters
                for input_word in input_words:
                    input_word = self.trim_word(input_word)
                    if (input_word != "") and (input_word not in stop):
                        # Check if word is in dictionary, else add
                        if input_word in self.words:
                            word = self.words[input_word]
                        else:
                            word = Word(input_word)
                            self.words[input_word] = word
                        # Check whether the word is in a ham or spam
                        # sentence, increment counters
                        if category == "ham":
                            word.increment_ham()
                            ham_words += 1
                        elif category == "spam":
                            word.increment_spam()
                            spam_words += 1
                        # Probably bad training file input...
                        else:
                            print("Not valid training file format")
            lineNumber += 1
        # Compute the probability for each word in the training set
        for word in self.words:
            self.words[word].compute_probability(ham_words, spam_words)

    def get_interesting_words(self, sms):
        """Return up to 15 trained Word objects for ``sms``, most
        interesting first. Unseen words get probability 0.40."""
        interesting_words = []
        stop = set(stopwords.words('english'))
        # Go through all words in the SMS and append to list.
        # If we have not seen the word in training, assign probability of 0.4
        for input_word in sms.split(' '):
            input_word = self.trim_word(input_word)
            if (input_word != "") and (input_word not in stop):
                if input_word in self.words:
                    word = self.words[input_word]
                else:
                    word = Word(input_word)
                    word.set_probability(0.40)
                interesting_words.append(word)
        # Sort the list of interesting words, return top 15 elements
        # if list is longer than 15
        interesting_words.sort(key=lambda word: word.interesting(), reverse=True)
        return interesting_words[0:15]

    def filter(self, input_file, result_file):
        """Classify every odd line of ``input_file`` as SPAM/HAM and
        write the verdicts to ``result_file``."""
        # Loop through all SMSes and compute total spam probability
        # of the sms-message
        lineNumber = 0
        for sms in input_file:
            lineNumber += 1
            spam_product = 1.0
            ham_product = 1.0
            if lineNumber % 2 != 0:
                try:
                    for word in self.get_interesting_words(sms):
                        # Hoist the probability lookup; it is used twice.
                        p = word.get_probability()
                        spam_product *= p
                        ham_product *= (1.0 - p)
                    sms_spam_probability = spam_product / (spam_product + ham_product)
                except Exception:
                    # BUG FIX: the original bare `except:` fell through
                    # and then read sms_spam_probability, which is
                    # unbound (or stale from a previous message) when
                    # the try block failed. Record the error and skip
                    # this message instead.
                    result_file.write("error")
                    continue
                if sms_spam_probability > 0.8:
                    result_file.write("SPAM: " + sms)
                else:
                    result_file.write("HAM: " + sms)
                result_file.write("\n")
我只是寻找一个解决办法,让我lemmatize /干的话。我试过在网上寻找,我发现了类似的问题,但他们一直没有为我工作。
建议:(1)在发布之前将标签转换为空格。 (2)创建一个[最小示例](http://stackoverflow.com/help/mcve)。 –
也许这将有助于https://gist.github.com/alvations/07758d02412d928414bb从https://github.com/alvations/pywsd/blob/master/pywsd/utils.py#L66 – alvas
问题可能是你没有正确地读取文件?尝试 `import io; file_in = io.open('filename.txt', 'r', encoding='utf8')`。有点不清楚具体错在哪里,但如果你能贴出你想要处理的数据,理解这个错误就会容易得多。 – alvas