利用 https://www.dropbox.com/sh/pbbsla84bq6o678/AACtO1WjaMIxVh97eyWO81yNa?dl=0 中的 usP 和 twP 探討: 1. 哪個美國總統最多話 (tokens)、用字量最豐富 (types)? 2. 哪個台灣總統最多話 (tokens)、用字量最豐富 (types)? 1.

library(tm)
## Loading required package: NLP
library(SnowballC)
# Find which US presidential address file has the most tokens and which has
# the most types. A token is a non-empty, space-separated string anywhere in
# the file; a type is a distinct token.
#
# Results:
#   talkTooMuchPresident     - file name with the highest token count
#   TooManyKindWordPresident - file name with the highest type count
#   maxArticleLength / maxWordKind - the corresponding counts
usDir <- "/Users/haoyi/Documents/104-1/lads/week4/usP/"
# NOTE(review): `pattern` is a regular expression, not a shell glob; "*.*" is
# kept as written and is expected to match every file name -- confirm.
fileName <- list.files(usDir, "*.*")

maxArticleLength <- 0
maxWordKind <- 0
talkTooMuchPresident <- character()
TooManyKindWordPresident <- character()
# seq_along() gives an empty sequence when no files are found, whereas
# 1:length(fileName) would iterate over c(1, 0) and try to read a bogus path.
for (x in seq_along(fileName)) {
  filePath <- paste0(usDir, fileName[x])
  fileContent <- readLines(filePath, encoding = "UTF-8")
  # readLines() returns one string per line and strsplit() one list element
  # per string; flatten them so the whole file is counted, not just line 1.
  tokens <- unlist(strsplit(fileContent, " "))
  # Repeated spaces and blank lines yield empty strings, which are not words.
  tokens <- tokens[nzchar(tokens)]
  wordTable <- table(tokens)
  # Strict ">" keeps the earliest file when several share the maximum.
  if (length(tokens) > maxArticleLength) {
    maxArticleLength <- length(tokens)
    talkTooMuchPresident <- fileName[x]
  }
  if (length(wordTable) > maxWordKind) {
    maxWordKind <- length(wordTable)
    TooManyKindWordPresident <- fileName[x]
  }
}

用字量最豐富 (types)的美國總統檔案:

## [1] "1841-Harrison.txt"

最多話 (tokens)的美國總統檔案:

## [1] "1841-Harrison.txt"
  2. (*需要將所有的總統文告檔案存在同一個資料夾)
library(tm)
library(SnowballC)
library(tmcn)
## # tmcn Version: 0.1-4
library(jiebaR)
## Loading required package: jiebaRD
# Find which Taiwanese presidential address file has the most tokens and which
# has the most types. Each file is read with scan(), segmented with a jiebaR
# worker, and the resulting words are counted.
#
# Results:
#   talkTooMuchPresident     - file name with the highest token count
#   TooManyKindWordPresident - file name with the highest type count
#   maxArticleLength / maxWordKind - the corresponding counts
twDir <- "/Users/haoyi/Documents/104-1/lads/week4/twP/"
# NOTE(review): `pattern` is a regular expression, not a shell glob; "*.*" is
# kept as written and is expected to match every file name -- confirm.
fileName <- list.files(twDir, "*.*")

maxArticleLength <- 0
maxWordKind <- 0
talkTooMuchPresident <- character()
TooManyKindWordPresident <- character()
# Build the segmenter once and reuse it for every file; it does not depend on
# the loop variable. Skipped entirely when there is nothing to segment.
segmenter <- if (length(fileName) > 0) worker() else NULL
# seq_along() gives an empty sequence when no files are found, whereas
# 1:length(fileName) would iterate over c(1, 0) and try to read a bogus path.
for (x in seq_along(fileName)) {
  filePath <- paste0(twDir, fileName[x])
  txt <- scan(filePath, what = "char", encoding = "UTF-8")
  # jiebaR overloads `<=` to run the segmenter on the text.
  words_vector <- segmenter <= txt
  # Joining with a space and splitting on it again flattens the segmenter
  # output into a single character vector of tokens.
  words_char <- paste(words_vector, collapse = " ")
  wordVector <- strsplit(words_char, " ")
  tokens <- wordVector[[1]]
  wordTable <- table(tokens)
  # Strict ">" keeps the earliest file when several share the maximum.
  if (length(tokens) > maxArticleLength) {
    maxArticleLength <- length(tokens)
    talkTooMuchPresident <- fileName[x]
  }
  if (length(wordTable) > maxWordKind) {
    maxWordKind <- length(wordTable)
    TooManyKindWordPresident <- fileName[x]
  }
}

用字量最豐富 (types) 的台灣總統檔案:

## [1] "CKS_550101.txt"

最多話 (tokens)的台灣總統檔案:

## [1] "CKS_550101.txt"