## -----------------------------------------------------------
## Topic: Sampling Distribution
## Instructor: Cheng-shan Liu 劉正山
## Reference: Verzani Chapter 5
## Date: 2011.11.1
## -----------------------------------------------------------

###############
## 1 Probability and sampling
###############

## Roll a die once: what number do you get?
sample(1:6, size=1)

## Roll the same die twenty times: what are the twenty sampled values?
sample(1:6, size=20, replace=TRUE)
hist(sample(1:6, size=20, replace=TRUE))

## Toss a coin ten times (1 = heads, 0 = tails): what distribution do you get?
coin <- sample(0:1, 10, replace=TRUE)
hist(coin)

## Combinations
## A population has five people; if we randomly draw three of them as one sample,
## how many possible sample groups are there (i.e., how large is the sample space)?
choose(5,3)

## Bernoulli trials (Bernoulli sequence)
n=10000; p=1/2
hist(sample(0:1, size=n, replace=TRUE, prob=c(1-p,p)))

## From the numbers 1 to 1000 (i.e., population IDs 1 to 1000),
## draw a random sample of 300 without replacement
sample01 <- sample(1:1000, 300, replace=FALSE)
# The sample mean and standard deviation are:
mean(sample01)
sd(sample01)

## Exercise:
## Repeat the sampling five times, saving each result as an object.
## Use mean() and sd() to see how much the means and standard deviations
## of these five samples differ.

#################
## 2 Some probability distributions
#################

##=====================================================
## 2.1 Binomial random variables and their distribution
##=====================================================
## Common in surveys with binary items such as "support" vs. "oppose".

## Tossing a coin ten times (n=10) gives a Binomial(10, 1/2) distribution.
## What is the probability of getting exactly five heads (k=5)?
n=10; k=5; p=1/2
mu=n*p                  # population mean
mu
sigma=sqrt(n*p*(1-p))   # population standard deviation
sigma
# The probability is (either method works):
choose(n,k) * (p)^k * (1-p)^(n-k)
dbinom(k, size=n, prob=p)
# What is the probability of getting seven or more heads? (either method works)
sum(dbinom(7:10, size=n, prob=p))
1-pbinom(6, size=n, prob=p)

##----------------------------------------------------------------
## Using known population probabilities to infer sample probabilities
##----------------------------------------------------------------
## A survey of 100 respondents includes a binary item on whether they support
## or oppose Taiwan's (re)entry into the UN.
## Assume this "support or not" item is a Binomial(100, p) random variable,
## and that 62% of the population supports it.
## What is the probability that 60% or fewer of the 100 respondents support it?
pbinom(60, size=100, prob=.62)
# What is the probability that more than 70% support it?
1-pbinom(70, size=100, prob=.62)

##==================================
## 2.2 Normal distribution
##==================================
pnorm(1.5, mean=0, sd=1)  # area below 1.5 standard deviations above the mean

# The 68-95-99.7 rule
pnorm(1)-pnorm(-1)        # 1 sd from the mean
1 - 2*pnorm(-2)           # 2 sd from the mean
diff(pnorm(c(-3,3)))      # 3 sd from the mean

mu=100; sigma=10
res = rnorm(1000, mean=mu, sd=sigma)
k=1; sum(res > mu - k*sigma & res < mu + k*sigma)  # within 1 sd of the mean (out of 1000)
k=2; sum(res > mu - k*sigma & res < mu + k*sigma)  # within 2 sd of the mean
k=3; sum(res > mu - k*sigma & res < mu + k*sigma)  # within 3 sd of the mean

## Kurtosis and skewness
## According to Kline (1998), when the absolute value of kurtosis is below 10
## and the absolute value of skewness is below 3, the shape of the variable's
## distribution has a negligible effect on maximum likelihood (ML) estimation.
## If normality is violated severely, another estimator must be chosen instead,
## such as asymptotically distribution-free (ADF) estimation.
hist(sample01)
hist(sample(1:1000,70,replace=FALSE))

## Kurtosis measures how peaked and heavy-tailed the curve is relative to the normal:
## an excess kurtosis of 0 indicates a normal (mesokurtic) peak, above 0 a tall,
## narrow (leptokurtic) peak, and below 0 a low, flat (platykurtic) peak.
## Note that moments::kurtosis() reports raw kurtosis, which is about 3 (not 0)
## for a normal distribution; subtract 3 to get excess kurtosis.
library(moments)
kurtosis(sample(1:1000,70,replace=FALSE))
kurtosis(sample01)

## The larger the absolute skewness, the more asymmetric the distribution:
## a coefficient above 0 indicates a right-skewed distribution, below 0 a left-skewed one.
skewness(sample(1:1000,70,replace=FALSE))
skewness(sample01)

## Test for normality
library(nortest)
lillie.test(rnorm(100, mean = 5, sd = 3))
lillie.test(sample(1:1000,5,replace=FALSE))
lillie.test(sample(1:1000,20,replace=FALSE))
lillie.test(sample(1:1000,70,replace=FALSE))
lillie.test(sample01)

## Advanced: compare lillie.test() with ks.test()

## Tests of normality for continuous distributions (Verzani, 9.3)
## Visual methods such as q-q plots and boxplots are not enough! (p. 270)
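
## A minimal sketch for the "Advanced" comparison above (my addition, not from
## Verzani; the stud.recs example from Verzani 9.3 continues below). Run
## lillie.test() and ks.test() on the same simulated normal data. ks.test()
## assumes the reference distribution's parameters are known; plugging in the
## estimates mean(x) and sd(x) makes the plain KS test too conservative, which
## is the problem the Lilliefors correction addresses. The sample size of 100
## is an arbitrary choice for illustration.
x <- rnorm(100, mean = 5, sd = 3)
lillie.test(x)                                   # Lilliefors (KS) test for normality
ks.test(x, "pnorm", mean = mean(x), sd = sd(x))  # plain KS test with estimated parameters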
data(stud.recs, package="UsingR")
attach(stud.recs)
par(mfrow=c(1,3))
qqplot(sat.m, sat.v)                      # q-q plots show similarity
boxplot(list(math=sat.m, verbal=sat.v))   # boxplots show some difference
plot(ecdf(sat.m))
lines(ecdf(sat.v), lty=2)                 # a plot of the e.c.d.f.s shows more difference
detach(stud.recs)

## Kolmogorov-Smirnov test
## A big shortcoming: it should only be used when the parameters of the
## hypothesized distribution are known, not estimated from the data;
## see Verzani 9.3.3 for ways to find parameter values.
## Are the two samples from the same population of scores?
ks.test(stud.recs$sat.m, stud.recs$sat.v)

## Shapiro-Wilk test for normality
shapiro.test(stud.recs$sat.m)  # H0: the parent distribution is normal
shapiro.test(stud.recs$sat.v)  # no evidence that the data are not normally distributed
## The t-test is still appropriate regardless of the result of the Shapiro test; see p. 273.

##=========================================
## 2.3 Uniform distribution
##=========================================
## When every value of a variable is equally likely, as when every value in the
## interval from 0 to 10 is equally likely, we say the variable has a uniform
## distribution, here Uniform(0, 10).
# Draw 50 values at random from the uniform distribution on [0, 10] and store them as res
res= runif(50, min=0, max=10)
# Build the plots
par(fig=c(0,1, 0, .35))            # put the boxplot in the lower part of the figure region
boxplot(res, horizontal=TRUE, bty="n", xlab="uniform sample")
par(fig=c(0,1,.25,1), new=TRUE)    # overlay the histogram on the upper part
hist(res, prob=TRUE, main="", col=gray(.9))
lines(density(res), lty=2)
curve(dunif(x, min=0, max=10), lwd=2, add=TRUE)
rug(res)

##=======================================
## 2.4 Exponential distribution
##=======================================
## For example, light-bulb lifetimes might follow an Exponential(1/5) distribution
res <- rexp(50, rate=1/5)
par(fig=c(0,1, 0, .35))
boxplot(res, horizontal=TRUE, bty="n", xlab="exponential sample")
par(fig=c(0,1,.25,1), new=TRUE)
# find the largest y value in order to set ylim=
tmp.hist=hist(res, plot=FALSE)
tmp.edens=density(res)
tmp.dens=dexp(0, rate=1/5)         # the exponential density is maximal at 0
y.max=max(tmp.hist$density, tmp.edens$y, tmp.dens)
hist(res, ylim=c(0, y.max), prob=TRUE, main="", col=gray(.9))
lines(density(res), lty=2)
curve(dexp(x, rate=1/5), lwd=2, add=TRUE)
rug(res)

##========================================
## 2.5 Lognormal distribution
##========================================
## For example, voters' incomes might follow a Lognormal(0, 1) distribution
# sample size = 50
res= rlnorm(50, meanlog=0, sdlog=1)
par(fig=c(0,1, 0, .35))
boxplot(res, horizontal=TRUE, bty="n", xlab="log-normal sample")
par(fig=c(0,1,.25,1), new=TRUE)
tmp.hist=hist(res, plot=FALSE)
tmp.edens=density(res)
tmp.dens=dlnorm(exp(-1), meanlog=0, sdlog=1)   # the Lognormal(0,1) density is maximal at its mode, exp(-1), about 0.37
y.max=max(tmp.hist$density, tmp.edens$y, tmp.dens)
tmp.dens
hist(res, ylim=c(0,y.max), prob=TRUE, main="", col=gray(.9))
lines(density(res), lty=2)
curve(dlnorm(x, meanlog=0, sdlog=1), lwd=2, add=TRUE)
rug(res)

####################################
## 3 Sampling distributions
####################################
## Use this website to get a hands-on feel for the concept of a sampling distribution:
## http://onlinestatbook.com/stat_sim/sampling_dist/index.html

## For the range containing 95% of the area:

## t-distribution
example(qt)
qt(c(.025,.975), df=10)
curve(dt(x,1), -4, 4, ylim=c(0,.4), lwd=2, col=2)
curve(dt(x,2), lwd=2, col=3, add=TRUE)
curve(dt(x,10), lwd=2, col=4, add=TRUE)

## Chi-squared distribution
example(qchisq)
qchisq(c(.025,.975), df=10)
curve(dchisq(x,1), .01, 15, ylim=c(0,.5), lwd=2, col=2)
curve(dchisq(x,2), lwd=2, col=3, add=TRUE)
curve(dchisq(x,4), lwd=2, col=4, add=TRUE)

## F-distribution
qf(c(.025,.975), df1=10, df2=5)
curve(df(x,1,10), .01, 5, ylim=c(0,1), lwd=2, col=2, ylab="density of F")
curve(df(x,5,10), lwd=2, col=3, add=TRUE)
curve(df(x,10,10), lwd=2, col=4, add=TRUE)
curve(df(x,30,10), lwd=2, col=5, add=TRUE)
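
## A simulation sketch (my addition, not in the original script): for samples of
## size n from a normal population, the statistic t = (xbar - mu)/(s/sqrt(n))
## has a t distribution with n-1 degrees of freedom. The sample size and number
## of replications below are arbitrary choices for illustration.
n=10; mu=0; sigma=1; m=1000
tstat = replicate(m, {
  x = rnorm(n, mean=mu, sd=sigma)
  (mean(x) - mu) / (sd(x)/sqrt(n))       # one simulated t statistic
})
hist(tstat, prob=TRUE, breaks=30, main="simulated t statistics, n=10")
curve(dt(x, df=n-1), lwd=2, add=TRUE)    # theoretical t density with 9 degrees of freedom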
#################################################
## 4 The central limit theorem (CLT)
#################################################
## When all of the possible sample means are computed, the following properties hold:
## * The mean of the sample means equals the mean of the population.
## * The variance of the sample means equals the variance of the population
##   divided by the sample size.
## * The standard deviation of the sample means (known as the standard error of
##   the mean) is smaller than the population standard deviation and equals the
##   standard deviation of the population divided by the square root of the
##   sample size.
## * If the population has a normal distribution, then the sample means have a
##   normal distribution.
## * If the population is not normally distributed but the sample size is
##   sufficiently large, then the sample means have an approximately normal
##   distribution. Some books define sufficiently large as at least 30 and
##   others as at least 31.

## Normal parent population
n=25; curve(dnorm(x, mean=0, sd=1/sqrt(n)), -3, 3, xlab="x", ylab="densities of the sample mean", bty="l")
n=5;  curve(dnorm(x, mean=0, sd=1/sqrt(n)), add=TRUE, lty=2)
n=1;  curve(dnorm(x, mean=0, sd=1/sqrt(n)), add=TRUE, lty=3)

### Non-normal parent population
## Binomial distribution (p.166)
m=200; p=1/2; n=5
res= rbinom(m,n,p)
hist(res, prob=TRUE, main="n=5")
curve(dnorm(x, n*p, sqrt(n*p*(1-p))), add=TRUE)
n=10
res= rbinom(m,n,p)
hist(res, prob=TRUE, main="n=10")
curve(dnorm(x, n*p, sqrt(n*p*(1-p))), add=TRUE)   # the histogram already looks roughly normal
n=25
res= rbinom(m,n,p)
hist(res, prob=TRUE, main="n=25")
curve(dnorm(x, n*p, sqrt(n*p*(1-p))), add=TRUE)

## Uniform(0,1) (p.168)
plot(0, 0, type="n", xlim=c(0,1), ylim=c(0, 13.5),   # type="n": set up the axes without plotting
     xlab="sample mean", ylab="density estimate")
m=500; a=0; b=1
res = numeric(m)                                     # holds the m sample means
n=2
for(i in 1:m) res[i] = mean(runif(n,a,b))            # store the sample mean
lines(density(res), lwd=2)
n=10
for(i in 1:m) res[i] = mean(runif(n,a,b))
lines(density(res), lwd=2)
n=100
for(i in 1:m) res[i] = mean(runif(n,a,b))
lines(density(res), lwd=2)
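
## A quick check of the CLT claim above (my addition, not from Verzani): overlay
## the normal density that the CLT predicts for the mean of n Uniform(0,1) draws,
## namely Normal(1/2, 1/(12*n)), on the last density estimate.
curve(dnorm(x, mean=1/2, sd=1/sqrt(12*n)), lty=3, lwd=2, add=TRUE)   # n is still 100 here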