# Set the working directory containing the news text files.
# NOTE(review): hard-coded absolute path — adjust for your machine, or run
# the script from the data directory instead.
setwd("D:/Kile/語言分析與資料科學/week6/Bonus/news")

# Load required packages. library() stops with an error when a package is
# missing, unlike require(), which only returns FALSE and lets the script
# continue into confusing downstream failures.
library(jiebaR)     # Chinese word segmentation (also loads jiebaRD)
library(wordcloud)  # word-cloud plotting (also loads RColorBrewer)

# Pre-create the lists that will hold the per-file results.
news_list               <- list()  # raw lines of each file
news_list_cut           <- list()  # segmented words per file
news_list_cut_table     <- list()  # word-frequency tables
news_list_cut_dataframe <- list()  # frequency tables as data frames

# Read every file in the directory into news_list.
# list.files() is called once and cached so the iteration count and the
# file order cannot drift between calls.
news_files <- list.files()
for (i in seq_along(news_files)) {
  news_list[[i]] <- readLines(news_files[i], encoding = "UTF-8")
}
# Build the jiebaR segmenter with the stop-word dictionary.
# NOTE(review): the original assigned cutter$stop_word AFTER worker() was
# created; in jiebaR the stop-word list is loaded at construction time, so a
# later assignment may have no effect — passing stop_word to worker() is the
# documented way. Confirm against the installed jiebaR version.
# The dictionary path is installation-specific; adjust as needed.
cutter <- worker(
  stop_word = "C:/Program Files/R/R-3.2.2/library/jiebaRD/dict/stop_words.utf8"
)

# Segment every article: cutter[x] applies the segmenter to the character
# vector x and returns a single vector of words.
for (i in seq_along(news_list)) {
  news_list_cut[[i]] <- cutter[news_list[[i]]]
}
# For each article: tabulate word frequencies, convert to a data frame
# sorted by descending count, and drop one-character words (which usually
# carry little meaning in Chinese).
# The original spread this over five separate loops re-scanning
# list.files() each time; one pass produces identical results.
news_files <- list.files()  # cached once; also supplies the element names
for (i in seq_along(news_list)) {
  # word -> count table (kept as a global: reused by the barplot section)
  news_list_cut_table[[i]] <- table(news_list_cut[[i]])

  # table -> data frame with columns word / count
  df <- as.data.frame.table(news_list_cut_table[[i]],
                            stringsAsFactors = FALSE)
  colnames(df) <- c("word", "count")

  # most frequent first; renumber rows before filtering, matching the
  # original order of operations (row names are gappy after the filter)
  df <- df[order(df$count, decreasing = TRUE), ]
  rownames(df) <- seq_len(nrow(df))

  # remove single-character words
  df <- df[nchar(df$word) != 1, ]

  news_list_cut_dataframe[[i]] <- df
}
# Name each data frame after its source file with the 4-character
# extension (e.g. ".txt") stripped.
names(news_list_cut_dataframe) <- substr(news_files, 1,
                                         nchar(news_files) - 4)
# Draw one word cloud per outlet (words appearing at least 10 times).
# Fixes over the original three copy-pasted calls:
#   * wordcloud() has no 'main' argument — the title was silently swallowed
#     by '...'. title() sets it explicitly after each plot.
#   * rot.per expects a proportion of rotated words; 0 (not FALSE) is the
#     correct way to say "no rotation".
for (i in seq_along(news_list_cut_dataframe)) {
  wordcloud(news_list_cut_dataframe[[i]]$word,
            news_list_cut_dataframe[[i]]$count,
            scale = c(3, 0.5), min.freq = 10, max.words = Inf,
            random.order = FALSE, rot.per = 0, use.r.layout = FALSE,
            colors = brewer.pal(8, "Dark2"))
  title(main = names(news_list_cut_dataframe)[i])
}
結論:
1. 從整體來看,在出現次數至少 10 次的詞彙中,中國時報顯得較不豐富。
2. 雖然「國民黨」、「洪秀柱」、「朱立倫」以及「總統」四個詞明顯地都在三家媒體中以高頻率出現,但彼此在用詞數量上仍略有差異;由文字大小可以看出,在蘋果日報裡「洪秀柱」和「朱立倫」兩詞出現的次數相當,而在其他兩家媒體裡則有較大的差距。
# Rebuild the frequency tables for plotting: drop one-character words and
# sort each table by descending frequency.
for (i in seq_along(news_list)) {
  tbl <- news_list_cut_table[[i]]
  tbl <- tbl[nchar(names(tbl)) != 1]
  news_list_cut_table[[i]] <- sort(tbl, decreasing = TRUE)
}

# Horizontal bar charts of each outlet's top-20 words, side by side.
# Loop invariants (outlet names, the shared-word set, the frequency range)
# are hoisted out of the loop instead of being recomputed every iteration.
op <- par(mfrow = c(1, 3))

# NOTE(review): outlet_names assumes list.files() sorts the inputs as
# Apple Daily, China Times, Liberty Times — confirm against the file names.
outlet_names <- c("蘋果日報", "中國時報", "自由時報")

# Words appearing in ALL outlets' top-20 data-frame lists get red bars;
# the rest are green.
shared_top20 <- Reduce(intersect,
                       lapply(news_list_cut_dataframe,
                              function(d) d[1:20, 1]))
freq_range <- range(news_list_cut_table)

for (i in seq_along(news_list)) {
  top20 <- sort(news_list_cut_table[[i]][1:20])
  barplot(top20,
          xlim = c(0, freq_range[2] + 5),
          horiz = TRUE, border = NA, las = 1,
          col = ifelse(names(top20) %in% shared_top20,
                       "firebrick2", "springgreen3"),
          xlab = "Frequency", main = outlet_names[i], cex.main = 1.5)
}
par(op)  # restore the previous plotting layout
結論:
1. 在重疊用詞的使用排名上,三家媒體各有其趨勢;例如「立委」一詞在蘋果日報排在第 20 名,在中國時報位居前段,在自由時報則落於中段。
2. 雖然新聞以「換柱」為主題,但蘋果日報的高頻詞中並未出現「換柱」一詞。
3. 除了新聞牽涉到的主要角色「洪秀柱」與「朱立倫」之外,蘋果日報的高頻詞裡還出現「李四川」,而中國時報裡則是「蔡英文」,這或許可歸因於報導重點上的差異。