Using the usP and twP folders from https://www.dropbox.com/sh/pbbsla84bq6o678/AACtO1WjaMIxVh97eyWO81yNa?dl=0, investigate: 1. Which US president talked the most (tokens) and used the richest vocabulary (types)? 2. Which Taiwanese president talked the most (tokens) and used the richest vocabulary (types)?
1.
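Before the analysis, a quick illustration of the two measures on a made-up English sentence: tokens count every word occurrence, while types count distinct words only.
toy <- "the cat sat on the mat"
tokens <- strsplit(toy, " ")[[1]]
length(tokens)        # 6 tokens: every occurrence counts, including both "the"
length(table(tokens)) # 5 types: "the" appears twice but counts only once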
library(tm)
## Loading required package: NLP
library(SnowballC)
fileName <- list.files("/Users/haoyi/Documents/104-1/lads/week4/usP/", "*.*")
maxArticleLength <- 0
maxWordKind <- 0
talkTooMuchPresident <- character()
TooManyKindWordPresident <- character()
for (x in seq_along(fileName)) {
  filePath <- paste('/Users/haoyi/Documents/104-1/lads/week4/usP/', fileName[x], sep = '')
  fileContent <- readLines(filePath, encoding = 'UTF-8')
  # split on spaces and flatten across lines, so every line of the file is counted
  wordVector <- unlist(strsplit(fileContent, " "))
  wordTable <- table(wordVector)  # one entry per distinct word (type)
  if (length(wordVector) > maxArticleLength) {  # new maximum token count
    maxArticleLength <- length(wordVector)
    talkTooMuchPresident <- fileName[x]
  }
  if (length(wordTable) > maxWordKind) {  # new maximum type count
    maxWordKind <- length(wordTable)
    TooManyKindWordPresident <- fileName[x]
  }
}
The US president file with the richest vocabulary (types):
## [1] "1841-Harrison.txt"
The US president file with the most words (tokens):
## [1] "1841-Harrison.txt"
2.
library(tm)
library(SnowballC)
library(tmcn)
## # tmcn Version: 0.1-4
library(jiebaR)
## Loading required package: jiebaRD
seg <- worker()  # build the jiebaR segmenter once, outside the loop
fileName <- list.files("/Users/haoyi/Documents/104-1/lads/week4/twP/", "*.*")
maxArticleLength <- 0
maxWordKind <- 0
talkTooMuchPresident <- character()
TooManyKindWordPresident <- character()
for (x in seq_along(fileName)) {
  filePath <- paste('/Users/haoyi/Documents/104-1/lads/week4/twP/', fileName[x], sep = '')
  txt <- scan(filePath, what = 'char', encoding = 'UTF-8')
  # segment the Chinese text; jiebaR's <= operator returns a vector of words
  wordVector <- seg <= txt
  wordTable <- table(wordVector)  # one entry per distinct word (type)
  if (length(wordVector) > maxArticleLength) {  # new maximum token count
    maxArticleLength <- length(wordVector)
    talkTooMuchPresident <- fileName[x]
  }
  if (length(wordTable) > maxWordKind) {  # new maximum type count
    maxWordKind <- length(wordTable)
    TooManyKindWordPresident <- fileName[x]
  }
}
The Taiwanese president file with the richest vocabulary (types):
## [1] "CKS_550101.txt"
The Taiwanese president file with the most words (tokens):
## [1] "CKS_550101.txt"