# set the working directory as .../week5/usP
setwd("C:/Users/Kile/Desktop/week5/usP")
# load the jiebaR package
library(jiebaR)
## Loading required package: jiebaRD
# create the lists for placing the word-frequency results
USP_list = list()
USP_list_cut = list()
USP_list_cut_table=list()
USP_list_cut_dataframe=list()
# read lines to list
for(i in 1:length(list.files())){
USP_list[[i]]=readLines(list.files()[i],encoding="UTF-8")
}
# convert the text to lowercase
for(i in 1:length(USP_list)){
USP_list[[i]]=tolower(USP_list[[i]])
}
# construct the word-segmentation function (a jiebaR worker)
cutter=worker()
cutter$symbol=TRUE
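With symbol=TRUE the worker keeps punctuation marks as separate tokens, which the punctuation counts later on rely on. A quick illustration on a made-up sentence (not from the corpus; the exact token boundaries depend on the jiebaR dictionary):
cutter["a great trust, my fellow citizens."]
## roughly: "a" "great" "trust" "," "my" "fellow" "citizens" "."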
# segment the words and put the results into the new list
for(i in 1:length(USP_list)){
USP_list_cut[[i]]=cutter[USP_list[[i]]]
}
# replace the segmented punctuation marks with spaces
for(i in 1:length(USP_list)){
USP_list_cut[[i]]=gsub("[[:punct:]]", " ", USP_list_cut[[i]])
}
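To see what this substitution does, here is a minimal illustration on hypothetical tokens (not drawn from the corpus):
gsub("[[:punct:]]", " ", c("fellow-citizens", "senate:", ","))
## [1] "fellow citizens" "senate "         " "
Punctuation-only tokens such as "," become single spaces, which is why the rows whose word is " " are dropped further down.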
# put the segmented words into frequency tables
for(i in 1:length(USP_list)){
USP_list_cut_table[[i]]=table(USP_list_cut[[i]])
}
# transform the word-count tables into data frames
for(i in 1:length(USP_list)){
USP_list_cut_dataframe[[i]]=as.data.frame.table(USP_list_cut_table[[i]],stringsAsFactors = FALSE)
}
# order the data frames by word frequency in decreasing order
for(i in 1:length(USP_list)){
USP_list_cut_dataframe[[i]]=USP_list_cut_dataframe[[i]][order(USP_list_cut_dataframe[[i]][2],decreasing=T),]
}
# remove the rows whose word is a single space
for(i in 1:length(USP_list)){
USP_list_cut_dataframe[[i]]=subset(USP_list_cut_dataframe[[i]],USP_list_cut_dataframe[[i]][1]!=" ")
}
# rename the columns and rows
for(i in 1:length(USP_list)){
colnames(USP_list_cut_dataframe[[i]])=c("word","count")
rownames(USP_list_cut_dataframe[[i]])=1:nrow(USP_list_cut_dataframe[[i]])
}
# name the elements of the data-frame list after the source files, minus the ".txt" extension
for(i in 1:length(list.files())){
names(USP_list_cut_dataframe)[i]=substr(list.files()[i],1,nchar(list.files()[i])-4)
}
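An equivalent, slightly more robust way to strip the ".txt" extension (a base-R alternative, not what the script above uses) is tools::file_path_sans_ext():
names(USP_list_cut_dataframe)=tools::file_path_sans_ext(list.files())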
# display the first five and the last rows of the first data frame
USP_list_cut_dataframe[[1]][c(1:5,nrow(USP_list_cut_dataframe[[1]])),]
## word count
## 1 the 116
## 2 of 71
## 3 and 48
## 4 to 47
## 5 which 36
## 599 years 1
# create a vector for placing the token totals
tokens_vec=c()
# sum the tokens for each speech and put the totals in the vector
for(i in 1:length(USP_list)){
tokens_vec[i]=sum(USP_list_cut_dataframe[[i]][2])
}
# find the president with the most tokens and return the name
names(USP_list_cut_dataframe)[which(tokens_vec==max(tokens_vec))]
## [1] "1841-Harrison"
We can see that the answer is William Henry Harrison, whose inaugural address was delivered in 1841.
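As a cross-check, the same result can be obtained without an explicit loop; this sketch assumes, as above, that each data frame has a numeric count column:
tokens_vec=sapply(USP_list_cut_dataframe, function(df) sum(df$count))
names(which.max(tokens_vec))
## [1] "1841-Harrison"
Note that which.max() returns only the first maximum, whereas the which(...==max(...)) form above returns all ties.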
# create a vector for placing the unique-token totals
tokens_unique_vec=c()
# count the unique tokens in each speech and put the totals in the vector
for(i in 1:length(USP_list)){
tokens_unique_vec[i]=nrow(USP_list_cut_dataframe[[i]])
}
# find the president with the most unique tokens and return the name
names(USP_list_cut_dataframe)[which(tokens_unique_vec==max(tokens_unique_vec))]
## [1] "1841-Harrison"
Once again the answer is William Henry Harrison's 1841 address.
# create histogram displaying the tokens distribution
hist(tokens_vec,main="Distribution of Number of Tokens in U.S. Presidents' Speeches",xlab="Number of Tokens",col="firebrick1",border=NA)
# create boxplot displaying the tokens distribution
boxplot(tokens_vec,main="Boxplot of Number of Tokens in U.S. Presidents' Speeches",ylab="Number of Tokens")
From the histogram and the boxplot we can observe that the vast majority of the presidential speeches contain between 1,500 and 3,000 tokens, with little dispersion, and that there is one extreme value above 8,000.
# create histogram displaying the unique tokens distribution
hist(tokens_unique_vec,main="Distribution of Number of Unique Tokens in U.S. Presidents' Speeches",xlab="Number of Unique Tokens",col="firebrick1",border=NA)
# create boxplot displaying the unique-tokens distribution
boxplot(tokens_unique_vec,main="Boxplot of Number of Unique Tokens in U.S. Presidents' Speeches",ylab="Number of Unique Tokens")
From the histogram and the boxplot we can observe that the vocabulary richness of most presidential speeches falls between 400 and 1,000 unique tokens, with little dispersion, and that there is one extreme value between 1,800 and 2,000.
# create vectors for placing comma and period sums
comma_vec=c()
period_vec=c()
# re-segment the speeches so the list again contains the punctuation marks
for(i in 1:length(USP_list)){
USP_list_cut[[i]]=cutter[USP_list[[i]]]
}
# calculate the comma and period totals for each speech and place them in the vectors
for(i in 1:length(USP_list)){
comma_vec[i]=sum(USP_list_cut[[i]]==",")
period_vec[i]=sum(USP_list_cut[[i]]==".")
}
# create the vector for placing the ratios of commas to periods
ratio_vec=c()
# calculate the ratios of commas to periods in each speech and place them in a vector
ratio_vec=comma_vec/period_vec
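Should any speech contain no periods at all, this division would produce Inf or NaN; a defensive variant (purely precautionary here, since every file evidently contains periods) would be:
ratio_vec=ifelse(period_vec>0, comma_vec/period_vec, NA)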
# list the presidents with the highest five ratios of commas to periods and return the names
names(USP_list_cut_dataframe)[order(ratio_vec,decreasing = TRUE)][1:5]
## [1] "1797-Adams" "1805-Jefferson" "1801-Jefferson" "1789-Washington"
## [5] "1849-Taylor"
We can see that the five U.S. presidents with the most complex speech are:
1. John Adams (1797)
2. Thomas Jefferson (1805)
3. Thomas Jefferson (1801)
4. George Washington (1789)
5. Zachary Taylor (1849)
# list the presidents with the lowest five ratios of commas to periods (the tail of the decreasing order) and return the names
names(USP_list_cut_dataframe)[order(ratio_vec,decreasing = TRUE)][(length(USP_list_cut_dataframe)-4):length(USP_list_cut_dataframe)]
## [1] "1893-Cleveland" "1917-Wilson" "1933-Roosevelt" "1929-Hoover"
## [5] "1937-Roosevelt"
We can see that the five U.S. presidents with the least complex speech are:
1. Franklin D. Roosevelt (1937)
2. Herbert Hoover (1929)
3. Franklin D. Roosevelt (1933)
4. Woodrow Wilson (1917)
5. Grover Cleveland (1893)
# create histogram displaying the complexity distribution
hist(ratio_vec,main="Distribution of Complexity in U.S. Presidents' Speeches",xlab="Level of Complexity",col="firebrick1",border=NA)
# create boxplot displaying the complexity distribution
boxplot(ratio_vec,main="Boxplot of Complexity in U.S. Presidents' Speeches",ylab="Level of Complexity")
From the histogram and the boxplot we can observe that most presidential speeches have a complexity of one to two commas per period, tightly concentrated, with one extreme value between 6 and 7.
# speeches of Taiwan presidents
# set the working directory as .../week5/twp
setwd("C:/Users/Kile/Desktop/week5/twp")
# load the jiebaR and ggplot2 packages
library(jiebaR)
require(ggplot2)
## Loading required package: ggplot2
# create the lists for placing the word-frequency results
TWP_list = list()
TWP_list_cut = list()
TWP_list_cut_table=list()
TWP_list_cut_dataframe=list()
# read lines to list
n=1
for(j in 1:length(list.files())){
setwd(paste("C:/Users/Kile/Desktop/week5/twp",list.files()[j],sep="/"))
for(i in 1:length(list.files())){
TWP_list[[n]]=readLines(list.files()[i],encoding="UTF-8")
n=n+1
}
setwd("C:/Users/Kile/Desktop/week5/twp")
}
# construct the word-segmentation function (a jiebaR worker) with a stop-word list
cutter=worker()
cutter$symbol=TRUE
cutter$stop_word="C:/Program Files/R/R-3.2.2/library/jiebaRD/dict/stop_words.utf8"
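The stop-word dictionary path above is tied to this machine's R 3.2.2 installation; a more portable form (assuming the standard jiebaRD package layout) locates the same file with system.file():
cutter$stop_word=system.file("dict/stop_words.utf8", package="jiebaRD")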
# segment the words and put the results into the new list
for(i in 1:length(TWP_list)){
TWP_list_cut[[i]]=cutter[TWP_list[[i]]]
}
# replace the segmented punctuation marks with spaces
for(i in 1:length(TWP_list)){
TWP_list_cut[[i]]=gsub("[[:punct:]]", " ", TWP_list_cut[[i]])
}
# put the segmented words into frequency tables
for(i in 1:length(TWP_list)){
TWP_list_cut_table[[i]]=table(TWP_list_cut[[i]])
}
# transform the word-count tables into data frames
for(i in 1:length(TWP_list)){
TWP_list_cut_dataframe[[i]]=as.data.frame.table(TWP_list_cut_table[[i]],stringsAsFactors = FALSE)
}
# order the data frames by word frequency in decreasing order
for(i in 1:length(TWP_list)){
TWP_list_cut_dataframe[[i]]=TWP_list_cut_dataframe[[i]][order(TWP_list_cut_dataframe[[i]][2],decreasing=T),]
}
# remove the rows whose word is a half-width space " " or a full-width space "　"
for(i in 1:length(TWP_list)){
TWP_list_cut_dataframe[[i]]=subset(TWP_list_cut_dataframe[[i]],TWP_list_cut_dataframe[[i]][1]!=" ")
TWP_list_cut_dataframe[[i]]=subset(TWP_list_cut_dataframe[[i]],TWP_list_cut_dataframe[[i]][1]!="　")
}
# rename the columns and rows
for(i in 1:length(TWP_list)){
colnames(TWP_list_cut_dataframe[[i]])=c("word","count")
rownames(TWP_list_cut_dataframe[[i]])=1:nrow(TWP_list_cut_dataframe[[i]])
}
# name the elements of the data-frame list after the source files
n=1
for(j in 1:length(list.files())){
setwd(paste("C:/Users/Kile/Desktop/week5/twp",list.files()[j],sep="/"))
for(i in 1:length(list.files())){
names(TWP_list_cut_dataframe)[n]=substr(list.files()[i],1,nchar(list.files()[i])-4)
n=n+1
}
setwd("C:/Users/Kile/Desktop/week5/twp")
}
# display the first five and the last rows of the first data frame
TWP_list_cut_dataframe[[1]][c(1:5,nrow(TWP_list_cut_dataframe[[1]])),]
## word count
## 1 的 68
## 2 我們 22
## 3 與 16
## 4 在 15
## 5 和 10
## 484 讚譽 1
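Since the Taiwan pipeline repeats the U.S. pipeline step for step, the whole table-building sequence could be factored into a single helper. This is only a sketch; build_freq_df is a hypothetical name, not part of the original script:
build_freq_df=function(lines, cutter){
# segment, then blank out punctuation
words=gsub("[[:punct:]]", " ", cutter[lines])
# tabulate, convert to a data frame, and sort by frequency
df=as.data.frame.table(table(words), stringsAsFactors=FALSE)
df=df[order(df[[2]], decreasing=TRUE),]
# drop the half-width and full-width space rows left by the substitution
df=subset(df, df[[1]]!=" " & df[[1]]!="　")
colnames(df)=c("word","count")
rownames(df)=1:nrow(df)
df
}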
# construct the get_date_str function: it converts a raw ROC-era date string into "yy/mm/dd" form
get_date_str <- function(raw_date){
# dates from ROC year 100 onward have seven digits and start with "1"
is_10x_yr <- (substr(raw_date, 1, 1) == 1) & nchar(raw_date) == 7
yy <- substr(raw_date,1, if(is_10x_yr) 3L else 2L)
mm <- substr(raw_date,if(is_10x_yr) 4L else 3L, nchar(raw_date)-2)
dd <- substr(raw_date, nchar(raw_date)-1, nchar(raw_date))
ymd <- paste(yy,mm,dd,sep="/")
return(ymd)
}
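For example, with illustrative ROC-era date strings (not actual file names from the corpus):
get_date_str("891010")   # "89/10/10", ROC year 89, i.e. 2000
get_date_str("1041010")  # "104/10/10", ROC year 104, i.e. 2015
Note that as.Date() further down parses these ROC years as if they were Gregorian years, which shifts the absolute dates but preserves their ordering, so the line charts below remain correctly ordered.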
# construct the get_keyword_result function
get_keyword_result <- function(keyword){
headregex <- c('CSB*','CKS*', 'CCK*', 'LTH*', 'MYJ*') # file-name prefix patterns, one per president
names(headregex) <- c('陳水扁', '蔣介石', '蔣經國', '李登輝', '馬英九')
all_I_freqs <- list()
for(x in headregex){
idxs <- grep(x, names(TWP_list_cut_dataframe))
I_freqs<-c()
I_freqs_p<-c()
n<-1
date <- c()
for(idx in idxs){
pattern <- paste("^[\u4E00-\u9FA5]*",keyword,"[\u4E00-\u9FA5]*",sep='') # match words containing the keyword
I_row <- grep(pattern,TWP_list_cut_dataframe[[idx]]$word)
I_freq <- TWP_list_cut_dataframe[[idx]][I_row,]
slice_start <- if(x=="MYJ*") 4L else 5L
date_raw <- substr(names(TWP_list_cut_dataframe[idx]),
slice_start,nchar(names(TWP_list_cut_dataframe[idx]))+1)
date[n] <- get_date_str(date_raw)
I_freqs[n] <- sum(I_freq$count)
I_freqs_p[n] <- sum(I_freq$count)/sum(TWP_list_cut_dataframe[[idx]]$count)
if(is.na(I_freqs[n])){
I_freqs[n] <- 0L
}
n <- n + 1
}
freq_to_date <- data.frame(I_word=I_freqs, I_word_p=I_freqs_p, date=as.Date(date))
#print(freq_to_date)
all_I_freqs[[names(which(headregex==x))]] <- freq_to_date
}
factors <- c()
for(i in 1:length(all_I_freqs)){
factors <- c(factors,rep(names(all_I_freqs[i]),times=nrow(all_I_freqs[[i]])))
}
df <- do.call("rbind", all_I_freqs)
df["president"] <- factors
# line charts of the raw counts and of the percentages
print(ggplot(data = df, aes(x=date, y=I_word))
+ geom_line(aes(colour=president))
+ scale_x_date("年份")
+ scale_y_continuous("字頻")
+ ggtitle(keyword))
print(ggplot(data = df, aes(x=date, y=I_word_p))
+ geom_line(aes(colour=president))
+ scale_x_date("年份")
+ scale_y_continuous("字頻百分比")
+ ggtitle(keyword))
# average counts and percentages per president
print(paste("出現",keyword,"的平均次數",sep=""))
print(avg <- tapply(df$I_word, df$president, mean))
print(paste("說最多",keyword, "的總統", sep=""))
print(paste(names(which(avg == max(avg))),",平均次數: ", max(avg)))
print(paste("出現",keyword,"的平均百分比",sep=""))
print(avg <- tapply(df$I_word_p, df$president, mean))
print(paste("最常說",keyword, "的總統", sep=""))
print(paste(names(which(avg == max(avg))),",平均百分比: ", max(avg)))
}
get_keyword_result("我")
## [1] "出現我的平均次數"
## 李登輝 馬英九 陳水扁 蔣介石 蔣經國
## 21.29167 48.08333 21.93333 30.86441 15.20000
## [1] "說最多我的總統"
## [1] "馬英九 ,平均次數: 48.0833333333333"
## [1] "出現我的平均百分比"
## 李登輝 馬英九 陳水扁 蔣介石 蔣經國
## 0.02585617 0.02563176 0.01431429 0.02771402 0.03246463
## [1] "最常說我的總統"
## [1] "蔣經國 ,平均百分比: 0.0324646319057476"
From the results above we can see that:
1. Among all the presidents, Ma Ying-jeou said "我" (I/we) the most times.
2. However, measured as a proportion of the segmented tokens, the two Chiangs outdo Ma Ying-jeou.
3. Although everyone remembers Chen Shui-bian's signature "偶men~" delivery, he surprisingly did not say "我" very often.
get_keyword_result("[台臺]灣")
## [1] "出現[台臺]灣的平均次數"
## 李登輝 馬英九 陳水扁 蔣介石 蔣經國
## 3.0416667 32.2500000 27.1333333 0.6440678 0.2000000
## [1] "說最多[台臺]灣的總統"
## [1] "馬英九 ,平均次數: 32.25"
## [1] "出現[台臺]灣的平均百分比"
## 李登輝 馬英九 陳水扁 蔣介石 蔣經國
## 0.0037548284 0.0172917867 0.0135675226 0.0005312704 0.0005733736
## [1] "最常說[台臺]灣的總統"
## [1] "馬英九 ,平均百分比: 0.0172917867157584"
From the results above we can see that:
1. During the two Chiangs' era, Taiwan was almost never mentioned.
2. Clearly, the more recent the speech, the higher the proportion of mentions of Taiwan.
get_keyword_result("經濟")
## [1] "出現經濟的平均次數"
## 李登輝 馬英九 陳水扁 蔣介石 蔣經國
## 6.291667 19.250000 8.800000 1.000000 0.650000
## [1] "說最多經濟的總統"
## [1] "馬英九 ,平均次數: 19.25"
## [1] "出現經濟的平均百分比"
## 李登輝 馬英九 陳水扁 蔣介石 蔣經國
## 0.007126715 0.009419863 0.005319176 0.001004128 0.001391303
## [1] "最常說經濟的總統"
## [1] "馬英九 ,平均百分比: 0.00941986318266844"
From the results above we can see that:
1. Mentions of "經濟" (the economy) likewise increase over time, but Ma Ying-jeou was particularly fond of the word compared with the other presidents.
get_keyword_result("中國")
## [1] "出現中國的平均次數"
## 李登輝 馬英九 陳水扁 蔣介石 蔣經國
## 6.916667 1.416667 5.066667 5.016949 6.050000
## [1] "說最多中國的總統"
## [1] "李登輝 ,平均次數: 6.91666666666667"
## [1] "出現中國的平均百分比"
## 李登輝 馬英九 陳水扁 蔣介石 蔣經國
## 0.0093888696 0.0007074108 0.0018181105 0.0064723941 0.0132287057
## [1] "最常說中國的總統"
## [1] "蔣經國 ,平均百分比: 0.0132287057490537"
From the results above we can see that:
1. Judging by the token-frequency percentage, "中國" (China) was mentioned most often in the Chiang Ching-kuo and Lee Teng-hui eras, and comparatively rarely in recent times.
2. Chen Shui-bian mentioned China the most toward the end of his term, presumably because of the Anti-Secession Law.
get_keyword_result("中華民國")
## [1] "出現中華民國的平均次數"
## 李登輝 馬英九 陳水扁 蔣介石 蔣經國
## 5.500000 10.750000 3.066667 5.016949 4.550000
## [1] "說最多中華民國的總統"
## [1] "馬英九 ,平均次數: 10.75"
## [1] "出現中華民國的平均百分比"
## 李登輝 馬英九 陳水扁 蔣介石 蔣經國
## 0.006588495 0.006340454 0.002460809 0.005715282 0.009868586
## [1] "最常說中華民國的總統"
## [1] "蔣經國 ,平均百分比: 0.00986858564825112"
From the results above we can see that:
1. The Republic of China's centennial National Day was a peak for occurrences of "中華民國".