import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
import seaborn as sns
第3章のPythonコード
第3章 確率論の基礎
モジュールのインポート
乱数の設定
= 2022) np.random.seed(seed
3.1.2 試行と事象
= ['Head', 'Tail']
coin = 100, replace = True) np.random.choice(coin, size
array(['Tail', 'Head', 'Tail', 'Head', 'Tail', 'Tail', 'Head', 'Tail',
'Head', 'Head', 'Head', 'Head', 'Tail', 'Tail', 'Tail', 'Tail',
'Tail', 'Tail', 'Head', 'Head', 'Head', 'Tail', 'Head', 'Tail',
'Head', 'Head', 'Head', 'Tail', 'Head', 'Head', 'Head', 'Tail',
'Head', 'Tail', 'Head', 'Head', 'Tail', 'Tail', 'Tail', 'Tail',
'Tail', 'Tail', 'Tail', 'Tail', 'Head', 'Head', 'Head', 'Head',
'Head', 'Tail', 'Head', 'Tail', 'Tail', 'Head', 'Head', 'Head',
'Head', 'Tail', 'Tail', 'Head', 'Head', 'Tail', 'Head', 'Tail',
'Tail', 'Tail', 'Tail', 'Tail', 'Tail', 'Tail', 'Head', 'Tail',
'Tail', 'Tail', 'Head', 'Head', 'Head', 'Head', 'Tail', 'Head',
'Head', 'Tail', 'Head', 'Tail', 'Tail', 'Head', 'Head', 'Tail',
'Tail', 'Tail', 'Tail', 'Head', 'Tail', 'Tail', 'Tail', 'Tail',
'Head', 'Tail', 'Tail', 'Head'], dtype='<U4')
3.1.3 コイン投げのシミュレーション
100) / (math.factorial(50) ** 2) / (2 ** 100) math.factorial(
0.07958923738717877
= [1, 0]
coin = np.random.choice(coin, size = 100, replace = True)
z
sum() z.
53
= 100000
S = np.zeros(S)
rec = [1, 0]
coin
for i in range(S):
= np.random.choice(coin, 100, replace = True)
z = z.sum() rec[i]
= 20)
plt.hist(rec, bins plt.show()
pd.DataFrame(rec).describe()
0 | |
---|---|
count | 100000.000000 |
mean | 49.986290 |
std | 4.987395 |
min | 28.000000 |
25% | 47.000000 |
50% | 50.000000 |
75% | 53.000000 |
max | 74.000000 |
3.1.4 論理演算によるカウントの方法
2 > 1
True
2 > 1000
False
200 == 100 * 2
True
True + True
2
True * False
0
= (rec == 50)
count print(count)
[False False False ... False False False]
sum() count.
7997
count.mean()
0.07997
3.2.3 独立ではない例
# 独立性の確認
= 10000 # シミュレーション回数
S = np.random.normal(loc = 50, scale = 10, size = S) # Xを抽出
X = np.random.normal(loc = 50, scale = 10, size = S) # Yを抽出
Y = X + Y # Zを構成 Z
# Pr(X > 70) * Pr(Z > 100)
> 70).mean() * (Z > 100).mean() (X
0.010649170000000001
# Pr(X > 70 かつ Z > 100)
> 70) * (Z > 100)).mean() ((X
0.0207
3.2.4 独立性と相関係数
= np.random.normal(loc = 50, scale = 10, size = 100000)
X = np.random.normal(loc = 50, scale = 10, size = 100000)
Y
np.corrcoef(X, Y)
array([[ 1. , -0.0090849],
[-0.0090849, 1. ]])
= -((X - 50) ** 2) / 10
Z np.corrcoef(X, Z)
array([[1. , 0.00952884],
[0.00952884, 1. ]])
plt.scatter(X, Z) plt.show()
3.3.1 分布関数
= np.linspace(0, 100, 1000)
x 50, 10))
plt.plot(x, stats.norm.cdf(x, plt.show()
print(stats.norm.cdf(60, 50, 10) - stats.norm.cdf(40, 50, 10))
0.6826894921370859
3.3.2 確率密度関数
print(stats.norm.cdf(50 + 0.1, 50, 10) - stats.norm.cdf(50, 50, 10))
0.003989356314631709
print(stats.norm.pdf(50, 50, 10))
0.03989422804014327
print(stats.norm.pdf(80, 50, 10))
0.00044318484119380076
= np.linspace(start = 0, stop = 100, num = 1000)
x 50, 10))
plt.plot(x, stats.norm.pdf(x, plt.show()
3.3.7 データによる条件付期待値の推定
= pd.read_csv("wage.csv")
wagedata wagedata.head()
educ | exper | wage | |
---|---|---|---|
0 | 7 | 16 | 548.000002 |
1 | 12 | 9 | 481.000000 |
2 | 12 | 16 | 721.000002 |
3 | 11 | 10 | 250.000001 |
4 | 12 | 16 | 728.999999 |
wagedata.describe()
educ | exper | wage | |
---|---|---|---|
count | 3010.000000 | 3010.000000 | 3010.000000 |
mean | 13.263455 | 8.856146 | 577.282392 |
std | 2.676913 | 4.141672 | 262.958303 |
min | 1.000000 | 0.000000 | 100.000000 |
25% | 12.000000 | 6.000000 | 394.250000 |
50% | 13.000000 | 8.000000 | 537.499999 |
75% | 16.000000 | 11.000000 | 708.750001 |
max | 18.000000 | 23.000000 | 2404.000010 |
sns.pairplot(wagedata)
<seaborn.axisgrid.PairGrid at 0x12592fa90>
= wagedata.loc[wagedata['educ'] == 12]
educ12 = wagedata.loc[wagedata['educ'] == 16] educ16
'wage'].mean() educ12[
563.5342744186569
'wage'].mean() educ16[
642.8932464464075
= wagedata.loc[wagedata['educ'] < 12]
educ11_less = wagedata.loc[wagedata['educ'] >= 12]
educ12_more 'wage'].mean() educ11_less[
438.10865197516847
'wage'].mean() educ12_more[
604.8070038470858
3.4.2 中心極限定理のシミュレーション
= 10000
S = 10000
n = np.zeros(S)
Zn
for i in range(S):
= np.random.normal(loc = 50, scale = 10, size = n)
X = X.mean()
Xbar = X.var()
Sn = (n ** 0.5) * (Xbar - 50) / (Sn ** 0.5)
Zn[i]
= 20)
plt.hist(Zn, bins plt.show()
pd.DataFrame(Zn).describe()
0 | |
---|---|
count | 10000.000000 |
mean | 0.010709 |
std | 1.000739 |
min | -3.803410 |
25% | -0.663426 |
50% | 0.015367 |
75% | 0.688068 |
max | 3.487146 |
3.4.3 信頼区間のシミュレーション
# 信頼区間シミュレーション
= 10000 # シミュレーション回数
S = 10000 # 標本の大きさ
n = np.zeros(S) # 結果記録用のベクトル
rec
for i in range(S): # 繰り返し開始
= np.random.normal(loc = 50, scale = 10, size = n) # N(50,10^2) から標本抽出
X = X.mean()
Xbar = X.var()
Sn = (Xbar - 1.96 * ((Sn / n) ** 0.5) < 50) * (50 < Xbar + 1.96 * ((Sn / n) ** 0.5))
rec[i]
# 確率を計算 rec.mean()
0.9466
3.4.4 信頼区間の導出
print(stats.norm.cdf(69.6, 50, 10) - stats.norm.cdf(30.4, 50, 10))
0.9500042097035591