# ------------------------------------------------------------
## Uni-Variate Data Analysis 
## Instructor: Cheng-shan Liu 劉正山
##
## Date: 2011.11.8
## -------------------------------------------------------------

# Basic descriptive statistics, demonstrated on the integers 1 to 10
sum(seq_len(10))          # total
max(seq_len(10))          # largest value
min(seq_len(10))          # smallest value
range(seq_len(10))        # smallest and largest value together
mean(seq_len(10))         # arithmetic mean
median(seq_len(9))        # median
diff(range(seq_len(10)))  # width of the range (maximum minus minimum)
var(seq_len(10))          # sample variance
sd(seq_len(10))           # sample standard deviation

##=======================================
## 1. 單一變數分析（Univariate Data Analysis)
##========================================

## Start with the Winter College data and the data sets from Verzani's book
## to learn the basic commands.
## Commented-out alternative: read the CSV version of the same data instead.
## library(foreign)
## wgc <- read.csv("/mnt/xpdoc/MyDocuments/teaching/statistics/data/wgcoll.csv")

# NOTE(review): absolute path from the author's machine -- adjust it to where
# wgcoll.rda lives on your system. The code below expects the file to
# provide a data frame called `wgc`.
load("/mnt/xpdoc/MyDocuments/teaching/statistics/data/wgcoll.rda")
summary(wgc)

# summary() gives all the necessary information about a variable, for example:
summary(wgc$aa)

##------------------------------------------------
## 1.1 Categorical / nominal variables
##------------------------------------------------
## Frequency tables (table)
## The CrossTable command is introduced in a later section.

# central.park.cloud is not created anywhere in this script: it is one of the
# data sets shipped with the UsingR package that accompanies Verzani's book,
# so the package has to be attached before the object can be used.
# (Install it once with install.packages("UsingR") if necessary.)
library(UsingR)

table(wgc$aa)
table(wgc$g)
table(central.park.cloud) # Verzani Ex.2.1 (p.33)

##------------------------------------------------
## 1.2 Ordinal variables (factors)
##------------------------------------------------
is.factor(central.park.cloud)
# Deliberate demonstration: factors are not numeric, so mean() cannot average
# them and returns NA with a warning.
mean(central.park.cloud)


##---------------
## 1.3 Basic plots
##---------------
## Pie chart
## Not appropriate for ratio-scale variables, for example:
pie(wgc$g)
pie(wgc$pe)

## Bar plot
## Appropriate for categorical / nominal variables
barplot(wgc$g) # wrong: the raw variable is passed instead of its frequency table
barplot(table(wgc$g))
barplot(table(wgc$g),
        main = "Gender", xlab = "Gender", ylab = "Frequency",
        ylim = c(0, 100))
barplot(table(wgc$g),
        xlab = "Gender", ylab = "Frequency",
        xpd = FALSE, ylim = c(16, 30))

# See the difference? Changing the y-axis limits of a bar plot can easily
# mislead the reader's eye.

## Dot chart (Cleveland dot plot)
dotchart(wgc$aa, ylab = "observations (from 1 to 50)")


##------------------
## 1.4 Analysing numeric data
##------------------
# DOTplot() used below is not a base R function; it is provided by the UsingR
# package (Verzani), which must be attached or the call fails with
# "could not find function". Attaching an already attached package is harmless.
library(UsingR)

## Stem-and-leaf plots
stem(wgc$aa)
hist(wgc$aa) # compare: the raw data values cannot be read off a histogram

## Strip charts (dot plots)
stripchart(wgc$aa, method = "stack", pch = 1, offset = 1)
stripchart(wgc$pe, method = "stack", pch = 1, offset = 1)
DOTplot(wgc$aa)
dotchart(wgc$aa)


## 1.4.1 Measures of central tendency
##------------------------------------------
# Sample mean: computed by hand, then with mean()
sum(wgc$aa) / length(wgc$aa)
mean(wgc$aa)

# Sample median
median(wgc$aa)

# A small experiment: the arithmetic mean is pulled much further by an
# extreme value than the median is.
aa.with.extreme <- append(wgc$aa, 1000) # add an outlier at the end
stripchart(aa.with.extreme, pch = 1)
stem(aa.with.extreme)
mean(aa.with.extreme)
median(aa.with.extreme)

# Trimmed mean: look at the sample mean with the extreme values removed
mean(aa.with.extreme, trim = 0.1) # drops the smallest and largest 10% of the values
mean(wgc$aa)

# Mode and midrange of a data set
# Note: the midrange is not a good summary of a data set
which(table(wgc$aa) == max(table(wgc$aa))) # or
which.max(table(wgc$aa))
mean(range(wgc$aa))


## 1.4.2 Measures of dispersion
##----------------------------------
# Spread of the sample
range(wgc$aa)
diff(range(wgc$aa)) # width of the range, never negative

# Sample variance: computed by hand, then with var()
n <- length(wgc$aa)
1 / (n - 1) * sum((wgc$aa - mean(wgc$aa))^2)
var(wgc$aa)

# Standard deviation: square root of the variance, then with sd()
sqrt(var(wgc$aa))
sd(wgc$aa)

# Quantiles and percentiles
quantile(wgc$aa)              # the default output is the quartiles
quantile(wgc$aa, probs = 0.5) # the 0.5 quantile equals the median
median(wgc$aa)

# Interquartile range
# Compare the middle 50% of the data with the standard deviation;
# if sd > IQR, this suggests that outliers skew the results.
IQR(wgc$aa)
sd(wgc$aa)

# z-scores of every value (for comparing samples measured on different scales)
(wgc$aa - mean(wgc$aa)) / sd(wgc$aa)
scale(wgc$aa)

## 1.4.3 Kurtosis and skewness
##------------------------------------------
# NOTE(review): kurtosis() and skewness() are not base R functions and this
# script never loads a package that provides them -- presumably an add-on
# package such as e1071 or moments is expected; confirm and load it first.
#
# Kurtosis describes how peaked / heavy-tailed a distribution is (it does not
# measure asymmetry). Assuming the function reports excess kurtosis: 0 means a
# normal (mesokurtic) shape, > 0 leptokurtic (sharper peak), < 0 platykurtic
# (flatter peak). TODO confirm -- some packages report 3 for a normal shape.
kurtosis(wgc$aa)

# Skewness measures asymmetry: the further from zero, the more asymmetric the
# distribution; > 0 is right-skewed, < 0 is left-skewed.
skewness(wgc$aa)


## 1.4.4 Histograms
##--------------------
## Note: histograms should not be used to analyse nominal variables
hist(wgc$aa)
hist(wgc$aa, breaks = 10)
hist(wgc$aa,
     main = "Academic ability of the sample of WGC students",
     col = grey(0.9), xlim = c(0, 100), breaks = 20)

# The look of a histogram depends on the bin width, so density() can be used
# to present the distribution as a smooth curve instead.
hist(wgc$aa, freq = FALSE, main = "") # density scale, so the curve below fits
lines(density(wgc$aa)) # overlay the curve on the histogram
plot(density(wgc$aa))  # or draw it as a separate plot


## 1.4.5 Box plots (box-and-whisker plots)
##---------------------------------------
boxplot(wgc$aa,
        main = "WGC Academic Ability",
        xlab = "observed values", horizontal = TRUE,
#       ylab = "observed values", horizontal = FALSE,
        col = grey(0.9), notch = FALSE)

# Now the same plot for aa.with.extreme created in section 1.4.1
boxplot(aa.with.extreme,
        main = "WGC Academic Ability",
        xlab = "observed values", horizontal = TRUE,
        col = grey(0.9), notch = FALSE)


##=================================
## 2. 新增變數、重新命名變數與變數值的處理
##=================================

## Example data file: TEDS 2006 Kaohsiung mayoral election survey
## (teds2006_kao.rda)
## Data source: http://www.tedsnet.org

## The file was exported from the original SPSS data file to a .por file and
## then saved as .rda, so its variables and values match the original file.

# NOTE(review): absolute path from the author's machine -- adjust before
# running. The code below expects the file to provide a data frame `kao06`.
load("/mnt/xpdoc/MyDocuments/data/teds06/teds06e/Kaohsiung/independance/teds2006_kao.rda")
names(kao06)

## Create a new variable called partyID (party identification) in kao06,
## recoded from L2B. Code 90 is kept as "others"; codes above 90 are invalid
## and are set to missing (NA).
# NOTE(review): L2B == 5 is never assigned a value below (and L2B == 6 is
# renumbered to 5) -- confirm that skipping code 5 is intended.

kao06$partyID[kao06$L2B==1]<-1   #KMT
kao06$partyID[kao06$L2B==2]<-2   #DPP
kao06$partyID[kao06$L2B==3]<-3   #NP
kao06$partyID[kao06$L2B==4]<-4   #PFP
kao06$partyID[kao06$L2B==6]<-5   #TSU
kao06$partyID[kao06$L2B==90]<-90 #others
kao06$partyID[kao06$L2B>90]<-NA 

## Create a variable called turnout (voted or not), recoded from H01:
## 1 = voted, 0 = did not vote, NA = any other (invalid) answer code.
# kao06$turnout <- kao06$H01
kao06$turnout[kao06$H01 == 1] <- 1   # yes
kao06$turnout[kao06$H01 == 2] <- 0   # no
# Use `> 2`, not `>= 2`: the condition `>= 2` also matches the "no" answers
# (H01 == 2) and would overwrite the 0 assigned on the previous line with NA,
# leaving turnout without any 0 values.
kao06$turnout[kao06$H01 > 2] <- NA

## Note: these new variables disappear once you quit R.
## To get them back next time, re-run the lines above (C-c r).
## To add them to the kao06 data permanently, save a new data file
## (for example kao06.rda):
## save(kao06, file = "your absolute path/kao06.rda", compress = TRUE)
## and load your own kao06.rda the next time you work on it.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
save(kao06, file = "/home/ashan/website/teaching/dataAnalysis/data/kao06.rda",
     compress = TRUE)

## Use table() to check that the new variables agree with the old ones;
## exclude = NULL makes the NA count visible as well.
table(kao06$L2B)
table(kao06$partyID, exclude = NULL)

table(kao06$H01)
table(kao06$turnout, exclude = NULL)

# Remove only the data set, so that section 3 shows the saved file really
# contains the new variables. rm(list = ls()) would silently wipe every other
# object in the user's workspace too.
rm(kao06)


## 練習：請自行建立兩個變數，一個叫作gender的，並讓男性為1，女性為0
## (在原始檔案中的變數名稱為SEX，男生為1，女生為2)
## 另一個是age, 原始的變數名稱為AGE.
## 記得，完成之後一定要檢查是否coding無誤。
## H0:gender is not related to turnout
## H1:gender is related to turnout


##===========================
## 3. Cross-tabulation with CrossTable
##===========================

# The gmodels package is important: it provides CrossTable() and many other
# handy tools. If it is not installed, run install.packages("gmodels") in the
# R console first.
library(gmodels)

## 3.1 Describe the data file and its variables
load("/home/ashan/website/teaching/dataAnalysis/data/kao06.rda")
# with() evaluates each call inside kao06, so columns can be referred to by
# their bare names (which also keeps the CrossTable row/column labels short)
# without attach()/detach(), which change the search path and can leave the
# data attached if the script stops half-way.
with(kao06, CrossTable(partyID))
with(kao06, CrossTable(turnout))

## 3.2 Is there an association between two variables?
## Research question: which party's supporters turn out to vote more actively?
with(kao06, CrossTable(partyID, turnout))
with(kao06, CrossTable(partyID, turnout, prop.r = TRUE, prop.t = FALSE,
                       prop.c = TRUE, prop.chisq = FALSE, chisq = TRUE))

## null hypothesis: party identification and turnout are unrelated
## alternative hypothesis: party identification and turnout are related

## Clearly age is related to turnout!
with(kao06, CrossTable(AGE, turnout, prop.r = TRUE, prop.t = FALSE,
                       prop.c = TRUE, prop.chisq = FALSE, chisq = TRUE))

## Exercise: is gender related to turnout? Is age (AGE) related to turnout?
## If so, what kind of relationship is it?
# gender only exists once the exercise in section 2 has been completed (and,
# if stored in kao06, saved to kao06.rda), so check before using it instead of
# stopping the script with an "object not found" error.
if ("gender" %in% names(kao06) || exists("gender")) {
  with(kao06, CrossTable(gender, turnout, prop.r = TRUE, prop.t = FALSE,
                         prop.c = TRUE, prop.chisq = FALSE, chisq = TRUE))
} else {
  message("gender has not been created yet; complete the exercise in section 2 first.")
}


##=====================
## Export the results to an HTML file
##=====================
library(R2HTML)
# outdir is a placeholder: replace it with the absolute path of your own
# output folder before running.
HTMLStart(outdir="d:/你的資料存放絕對路徑", filename = "defection00", echo = FALSE)

# NOTE(review): teds08l is not created or loaded anywhere in this script --
# presumably a placeholder example; substitute your own data and commands.
table(teds08l$eth, exclude=NULL) 
## Put your own code between the HTMLStart and HTMLStop lines.

HTMLStop()