第3章のRコード

第3章 確率論の基礎

サンプルデータ

wage.csv:男性労働者の賃金データ.

乱数の設定

set.seed(2022)

3.1.2 試行と事象

coin <- c("Head", "Tail")
sample(coin, 100, replace = TRUE)
  [1] "Tail" "Head" "Tail" "Head" "Head" "Tail" "Tail" "Head" "Tail" "Tail"
 [11] "Head" "Head" "Tail" "Tail" "Head" "Head" "Tail" "Head" "Tail" "Tail"
 [21] "Head" "Head" "Head" "Tail" "Head" "Tail" "Tail" "Head" "Head" "Tail"
 [31] "Head" "Tail" "Tail" "Tail" "Head" "Head" "Tail" "Head" "Head" "Tail"
 [41] "Head" "Tail" "Tail" "Tail" "Head" "Tail" "Head" "Tail" "Tail" "Tail"
 [51] "Head" "Tail" "Head" "Head" "Tail" "Head" "Head" "Head" "Head" "Tail"
 [61] "Head" "Tail" "Head" "Head" "Tail" "Tail" "Head" "Head" "Tail" "Head"
 [71] "Tail" "Head" "Tail" "Tail" "Tail" "Head" "Tail" "Head" "Tail" "Tail"
 [81] "Tail" "Tail" "Head" "Head" "Head" "Tail" "Head" "Tail" "Tail" "Head"
 [91] "Tail" "Tail" "Tail" "Head" "Tail" "Head" "Tail" "Tail" "Head" "Head"

3.1.3 コイン投げのシミュレーション

factorial(100) / (factorial(50) ^ 2) / (2 ^ 100)
[1] 0.07958924
coin <- c(1, 0)
z <- sample(coin, 100, replace = TRUE)

sum(z)
[1] 55
S <- 100000
rec <- numeric(S)
coin <- c(1, 0)

for(i in 1:S){
  
  z <- sample(coin, 100, replace = 100)
  rec[i] <- sum(z)
  
}
hist(rec)

summary(rec)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  27.00   47.00   50.00   49.99   53.00   72.00 

3.1.4 論理演算によるカウントの方法

2 > 1
[1] TRUE
2 > 1000
[1] FALSE
200 == 100 * 2
[1] TRUE
よくある間違い
200 = 100 * 2
Error in 200 = 100 * 2 : invalid (do_set) left-hand side to assignment
TRUE + TRUE
[1] 2
TRUE * FALSE
[1] 0
count <- (rec == 50)
head(count)
[1] FALSE FALSE FALSE FALSE  TRUE FALSE
sum(count)
[1] 7966
mean(count)
[1] 0.07966

3.2.3 独立ではない例

# 独立性の確認

S <- 100000 # シミュレーション回数
X <- rnorm(S, 50, 10) # Xを抽出
Y <- rnorm(S, 50, 10) # Yを抽出
Z <- X + Y # Zを構成
# Pr(X > 70) * Pr(Z > 100)
mean(X > 70) * mean(Z > 100)
[1] 0.01119444
# Pr(X > 70 かつ Z > 100)
mean((X > 70) * (Z > 100))
[1] 0.02207

3.2.4 独立性と相関係数

X <- rnorm(100000, 50, 10)
Y <- rnorm(100000, 50, 10)

cor(X, Y)
[1] 0.003516287
Z <- - ((X - 50) ^ 2) / 10
cor(X, Z)
[1] 0.01343314
plot(X, Z)

3.3.1 分布関数

 curve(pnorm(x, 50, 10), 0, 100)

pnorm(60, 50, 10) - pnorm(40, 50, 10)
[1] 0.6826895

3.3.2 確率密度関数

pnorm(50 + 0.1, 50, 10) - pnorm(50, 50, 10)
[1] 0.003989356
dnorm(50, 50, 10)
[1] 0.03989423
dnorm(80, 50, 10)
[1] 0.0004431848
curve(dnorm(x, 50, 10), 0, 100)

3.3.7 データによる条件付期待値の推定

malesdata <- read.csv("wage.csv")

head(malesdata)
  educ exper wage
1    7    16  548
2   12     9  481
3   12    16  721
4   11    10  250
5   12    16  729
6   12     8  500
summary(malesdata)
      educ           exper             wage       
 Min.   : 1.00   Min.   : 0.000   Min.   : 100.0  
 1st Qu.:12.00   1st Qu.: 6.000   1st Qu.: 394.2  
 Median :13.00   Median : 8.000   Median : 537.5  
 Mean   :13.26   Mean   : 8.856   Mean   : 577.3  
 3rd Qu.:16.00   3rd Qu.:11.000   3rd Qu.: 708.8  
 Max.   :18.00   Max.   :23.000   Max.   :2404.0  
plot(malesdata)

sch12 <- malesdata[malesdata$school == 12, ] # 高校卒業者の抜き出し
sch16 <- malesdata[malesdata$school == 16, ] # 学部卒業者の抜き出し
exp(mean(sch12$wage)) # 高校卒業者の平均賃金
[1] NaN
exp(mean(sch16$wage)) # 学部卒業者の平均賃金
[1] NaN
sch11 <- malesdata[malesdata$school <= 11, ] # 高卒未満の抜き出し
sch12 <- malesdata[malesdata$school >= 12, ] # 高卒以上の抜き出し
exp(mean(sch11$wage)) # 高卒未満の平均賃金
[1] NaN
exp(mean(sch12$wage)) # 高卒以上の平均賃金
[1] NaN

3.4.2 中心極限定理のシミュレーション

S <- 10000
n <- 10000
Zn <- numeric(S)

for(i in 1:S){
  
  X <- rnorm(n, 50, 10)
  Xbar <- mean(X)
  Sn <- var(X)
  Zn[i] <- sqrt(n) * (Xbar  - 50) / sqrt(Sn)
  
}
hist(Zn)

summary(Zn)
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-3.786927 -0.685862  0.004751  0.000495  0.683271  3.361441 

3.4.3 信頼区間のシミュレーション

# 信頼区間シミュレーション

S <- 10000         # シミュレーション回数
n <- 10000         # 標本の大きさ
rec <- numeric(S)  # 結果記録用のベクトル

for(i in 1:S){     # 繰り返し開始
  
  X <- rnorm(n, 50, 10)  # N(50,10^2) から標本抽出
  Xbar <- mean(X)
  Sn <- var(X)
  rec[i] <- (Xbar - 1.96 * sqrt(Sn / n) < 50) * (50 < Xbar + 1.96 * sqrt(Sn / n))
  
}

mean(rec) # 確率を計算
[1] 0.944

3.4.4 信頼区間の導出

pnorm(69.6,50,10) - pnorm(30.4,50,10)
[1] 0.9500042