from gensim import corpora, models, similarities
documents = ["This is a book about cars, dinosaurs, and fences"]
# remove common words and tokenize
stoplist = set('for a of the and to in - , is'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
for document in documents]
# Remove commas
texts[0] = [text.replace(',','') for text in texts[0]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
doc = "I like cars and birds"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
index = similarities.MatrixSimilarity(lsi[corpus])
sims = index[vec_lsi] # perform a similarity query against the corpus
print(sims)
在上面的代码中,我比较多少“这是一本关于汽车,恐龙,和围栏”文本相似性是类似“我喜欢车和鸟类”使用余弦相似技术。与gensim和余弦相似
这两句话有共同的有效1个字,这是“汽车”,但是当我运行的代码,我得到他们是100%相似。这对我没有意义。
有人可以建议如何提高我的代码,让我得到一个合理的数字?