2015-11-15 105 views
0
# search for a term in twitter 
rdmTweets <- searchTwitteR("machine learning", n=500, lang="en") 

dtm.control <- list(
tolower   = TRUE, 
removePunctuation = TRUE, 
removeNumbers  = TRUE, 
removestopWords = TRUE, 
stemming   = TRUE, # false for sentiment 
wordLengths  = c(3, "inf")) 

# create a dataframe around the results 
df <- do.call("rbind", lapply(rdmTweets, as.data.frame)) 

# Here are the columns 
names(df) 

# And some example content 
head(df,10) 

counts = table(df$screenName) 
barplot(counts) 

# Plot the data as received from Twitter 
cc <- subset(counts,counts>1) 
barplot(cc,las = 2,cex.names = 0.3) 

# the most commonly cited words in the tweets 
rdm_texts <- sapply(rdmTweets, function(x) x$getText()) 
rdm_corpus <- Corpus(VectorSource(rdm_texts)) 

dtm <- DocumentTermMatrix(rdm_corpus, control = dtm.control) # throws error 

错误抛出误差 -叽叽喳喳数据< - 在termdocumentmatrix

Error in simple_triplet_matrix(i = i, j = j, v = as.numeric(v), nrow = length(allTerms), : 
'i, j, v' different lengths 
In addition: Warning messages: 
1: In mclapply(unname(content(x)), termFreq, control) : 
all scheduled cores encountered errors in user code 
2: In simple_triplet_matrix(i = i, j = j, v = as.numeric(v), nrow =  length(allTerms), : 
NAs introduced by coercion 

试图搜索关键字鸣叫声,然后创建一个wordcloud。 删除所有标点,停止词语,删除号码,但似乎仍然不起作用。

任何帮助将不胜感激。

+0

在此处运行良好(tm_0.6-2,NLP_0.1-8,R版本3.2.2,Windows 7 x64) – lukeA

+0

感谢您的帮助。我正在使用Mac并让我更新tm包以查看它是否可用。 – user2241260

+0

使用tm_0.6-2,NLP_0.1-8,R版本3.2.0(2015-04-16) 平台:x86_64-苹果(64位) 运行于:OS X 10.11.1 @lukeA – user2241260

回答

0

utf8towcs - 破解了这个问题。

#the cainfo parameter is necessary only on Windows 
r_stats <- searchTwitter("#IpadPro", n=500, lang="en") 
#should get 500 
length(r_stats) 

#save text 
r_stats_text <- sapply(r_stats, function(x) x$getText()) 
r_stats_text_corpus <- Corpus(VectorSource(r_stats_text)) 

r_stats_text_corpus <- tm_map(r_stats_text_corpus, 
          content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')), 
          mc.cores=1) 

r_stats_text_corpus <- tm_map(r_stats_text_corpus,  content_transformer(tolower)) 
r_stats_text_corpus <- tm_map(r_stats_text_corpus, removePunctuation) 
r_stats_text_corpus <- tm_map(r_stats_text_corpus, function(x)removeWords(x,stopwords())) 


# Creating a term document matrix 
tdm <- TermDocumentMatrix(r_stats_text_corpus) 
m <- as.matrix(tdm) 
word_freqs <- sort(rowSums(m), decreasing = TRUE) 

# create the data frame with the words and their frequencies 
dm <- data.frame(word = names(word_freqs), freq = word_freqs) 
wordcloud(dm$word, dm$freq, random.order = FALSE , colors = brewer.pal(8,"Dark2"))