Multiple regression
Question 1: Fama-french
library(tidyverse)
library(broom)
library(gt)
intercept <- .03
intercept_sig <- .01
x1_mu <- .04
x1_sig <- .01
x1_beta <- 0.4
x2_mu <- .03
x2_sig <- .01
x2_beta <- -0.6
x3_mu <- .03
x3_sig <- .01
x3_beta <- -0.3
noise_mu <- 0
noise_sig <- 0 # low value gets low p-value b/c low noise
size <- 96
set.seed(18)
results <- tibble(
x0 = rnorm(size, intercept, intercept_sig),
x1 = rnorm(size, x1_mu, x1_sig),
x2 = rnorm(size, x2_mu, x2_sig),
x3 = rnorm(size, x3_mu, x3_sig),
x1_b = rep(x1_beta, size),
x2_b = rep(x2_beta, size),
x3_b = rep(x3_beta, size),
noise = rnorm(size, 0, noise_sig)
)
results1 <- results %>% mutate(
y = x0 +x1_b * x1 + x2_b * x2 + x3_b * x3 + noise
)
model <- lm(y ~ x1 + x2 + x3, data = results1)
summary(model)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = results1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0179934 -0.0063017 -0.0002194 0.0069123 0.0252648
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.026900 0.005785 4.650 1.11e-05 ***
## x1 0.501752 0.099805 5.027 2.44e-06 ***
## x2 -0.702932 0.096522 -7.283 1.09e-10 ***
## x3 -0.276534 0.102339 -2.702 0.0082 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.00983 on 92 degrees of freedom
## Multiple R-squared: 0.4619, Adjusted R-squared: 0.4444
## F-statistic: 26.33 on 3 and 92 DF, p-value: 2.206e-12
model_tidy <- tidy(model)
model_tidy[2,1] <- "MKT"
model_tidy[3,1] <- "SMB"
model_tidy[4,1] <- "HML"
gt_table_model <- gt(model_tidy)
gt_table_model <-
gt_table_model %>%
tab_options(
table.font.size = 14
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body()
) %>%
tab_header(
title = "Portfolio excess returns regressed against MKT + SMB + HML",
subtitle ="i.e., Fama-French three-factor model"
#) %>% tab_source_note(
# source_note = md("the source is ... FRED")
) %>% cols_label(
term = "Coefficient",
estimate = "Estimate",
std.error = "Std Error",
statistic = "t-stat",
p.value = "p value"
) %>% fmt_number(
columns = vars(estimate, std.error, statistic, p.value),
decimals = 3
) %>% fmt_scientific(
columns = vars(statistic, p.value),
) %>% tab_options(
heading.title.font.size = 14,
heading.subtitle.font.size = 12
)
gt_table_model
Coefficient |
Estimate |
Std Error |
t-stat |
p value |
(Intercept) |
0.027 |
0.006 |
4.65 |
1.11 × 10−5 |
MKT |
0.502 |
0.100 |
5.03 |
2.44 × 10−6 |
SMB |
−0.703 |
0.097 |
−7.28 |
1.09 × 10−10 |
HML |
−0.277 |
0.102 |
−2.70 |
8.20 × 10−3 |
Question 2: House Prices
library(tidyverse)
library(broom)
library(gt)
intercept <- 40
intercept_sig <- .01
x1_mu <- 1200
x1_sig <- 30
x1_beta <- 0.35
x2_mu <- 4.5
x2_sig <- 2
x2_beta <- 10.0
x3_mu <- 15
x3_sig <- 4
x3_beta <- -8.0
noise_mu <- 0
noise_sig <- 20 # low value gets low p-value b/c low noise
size <- 96
set.seed(43)
results <- tibble(
x0 = rnorm(size, intercept, intercept_sig),
x1 = rnorm(size, x1_mu, x1_sig),
x2 = rnorm(size, x2_mu, x2_sig),
x3 = rnorm(size, x3_mu, x3_sig),
x1_b = rep(x1_beta, size),
x2_b = rep(x2_beta, size),
x3_b = rep(x3_beta, size),
noise = rnorm(size, 0, noise_sig)
)
results1 <- results %>% mutate(
y = x0 +x1_b * x1 + x2_b * x2 + x3_b * x3 + noise
)
model <- lm(y ~ x1 + x2 + x3, data = results1)
summary(model)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = results1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.088 -13.508 -1.795 13.819 41.934
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 110.95667 76.98277 1.441 0.153
## x1 0.29138 0.06283 4.638 1.16e-05 ***
## x2 9.53971 0.85291 11.185 < 2e-16 ***
## x3 -8.06268 0.48275 -16.702 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.28 on 92 degrees of freedom
## Multiple R-squared: 0.8378, Adjusted R-squared: 0.8325
## F-statistic: 158.4 on 3 and 92 DF, p-value: < 2.2e-16
model_tidy <- tidy(model)
model_tidy[2,1] <- "SQFEET"
model_tidy[3,1] <- "ROOMS"
model_tidy[4,1] <- "AGE"
gt_table_model <- gt(model_tidy)
gt_table_model <-
gt_table_model %>%
tab_options(
table.font.size = 14
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body()
) %>%
tab_header(
title = "House Price regressed against ft^2 (SQFEET) + ROOMS(#) + AGE(years)",
subtitle = md("House Price in Thousands **($000)** of dollars")
#) %>% tab_source_note(
# source_note = md("the source is ... FRED")
) %>% cols_label(
term = "Coefficient",
estimate = "Estimate",
std.error = "Std Error",
statistic = "t-stat",
p.value = "p value"
) %>% fmt_number(
columns = vars(estimate, std.error, statistic, p.value),
decimals = 3
) %>% fmt_scientific(
columns = vars(p.value),
) %>% tab_options(
heading.title.font.size = 14,
heading.subtitle.font.size = 12
)
gt_table_model
Coefficient |
Estimate |
Std Error |
t-stat |
p value |
(Intercept) |
110.957 |
76.983 |
1.441 |
1.53 × 10−1 |
SQFEET |
0.291 |
0.063 |
4.638 |
1.16 × 10−5 |
ROOMS |
9.540 |
0.853 |
11.185 |
7.64 × 10−19 |
AGE |
−8.063 |
0.483 |
−16.702 |
1.33 × 10−29 |
mean(results1$y) # price
## [1] 386.051
mean(results1$x0) # intercept
## [1] 40.0002
mean(results1$x1) # sqfeet
## [1] 1203.809
mean(results1$x2) # rooms
## [1] 4.548471
mean(results1$x3) # age
## [1] 14.76695
Question 3: Insurance
library(tidyverse)
library(broom)
library(gt)
intercept <- 150
intercept_sig <- 40
# age
x1_mu <- 38
x1_sig <- 7
x1_beta <- 50
# bmi
x2_mu <- 22
x2_sig <- 4
x2_beta <- 100
# smoker
x3_mu <- 15
x3_sig <- 0.5
x3_beta <- 535
# spend
x4_mu <- 500
x4_sig <- 250
x4_beta <- -0.4
noise_mu <- 0
noise_sig <- 300 # low value gets low p-value b/c low noise
size <- 43
set.seed(12)
results <- tibble(
x0 = rnorm(size, intercept, intercept_sig),
x1 = round(rnorm(size, x1_mu, x1_sig)),
x2 = rnorm(size, x2_mu, x2_sig),
x3 = round(runif(size)-.35), # smoker = 1, non = 0
x4 = rnorm(size, x4_mu, x4_sig),
x1_b = rep(x1_beta, size),
x2_b = rep(x2_beta, size),
x3_b = rep(x3_beta, size),
x4_b = rep(x4_beta, size),
noise = rnorm(size, 0, noise_sig)
)
results1 <- results %>% mutate(
y = x0 +x1_b * x1 + x2_b * x2 + x3_b * x3 + x4_b * x4 + noise
)
model <- lm(y ~ x1 + x2 + x3 + x4, data = results1)
summary(model)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = results1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -635.96 -169.65 15.59 219.73 543.51
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -324.2133 415.7736 -0.780 0.440348
## x1 56.9518 7.5517 7.542 4.60e-09 ***
## x2 111.7571 11.9374 9.362 2.06e-11 ***
## x3 454.0799 125.8293 3.609 0.000884 ***
## x4 -0.5485 0.1793 -3.058 0.004064 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 298.3 on 38 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8027
## F-statistic: 43.71 on 4 and 38 DF, p-value: 1.009e-13
model_tidy <- tidy(model)
model_tidy[2,1] <- "AGE"
model_tidy[3,1] <- "BMI"
model_tidy[4,1] <- "SMOKER"
model_tidy[5,1] <- "CHARITY"
gt_table_model <- gt(model_tidy)
gt_table_model <-
gt_table_model %>%
tab_options(
table.font.size = 14
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body()
) %>%
tab_header(
title = "Medical COST regressed against AGE + BMI + SMOKER(1/0) + CHARITY($)",
subtitle = md("Simulated dataset")
) %>% tab_source_note(
source_note = md("Residual standard error: 295.2 on 38 degrees of freedom")
) %>% tab_source_note(
source_note = md("Multiple R-squared: 0.8343, Adjusted R-squared: 0.8168")
) %>% tab_source_note(
source_note = md("F-statistic: 47.82 on 4 and 38 DF, p-value: 2.486e-14")
) %>% cols_label(
term = "Coefficient",
estimate = "Estimate",
std.error = "Std Error",
statistic = "t-stat",
p.value = "p value"
) %>% fmt_number(
columns = vars(estimate, std.error, statistic, p.value),
decimals = 2
) %>% fmt_scientific(
columns = vars(p.value),
) %>% tab_options(
heading.title.font.size = 14,
heading.subtitle.font.size = 12
)
gt_table_model
Coefficient |
Estimate |
Std Error |
t-stat |
p value |
(Intercept) |
−324.21 |
415.77 |
−0.78 |
4.40 × 10−1 |
AGE |
56.95 |
7.55 |
7.54 |
4.60 × 10−9 |
BMI |
111.76 |
11.94 |
9.36 |
2.06 × 10−11 |
SMOKER |
454.08 |
125.83 |
3.61 |
8.84 × 10−4 |
CHARITY |
−0.55 |
0.18 |
−3.06 |
4.06 × 10−3 |
Residual standard error: 295.2 on 38 degrees of freedom |
Multiple R-squared: 0.8343, Adjusted R-squared: 0.8168 |
F-statistic: 47.82 on 4 and 38 DF, p-value: 2.486e-14 |
mean(results1$y) # cost
## [1] 4150.588
mean(results1$x0) # intercept
## [1] 145.2152
mean(results1$x1) # age
## [1] 38.51163
mean(results1$x2) # bmi
## [1] 22.16166
mean(results1$x3) # smoker
## [1] 0.1627907
mean(results1$x4) # charity
## [1] 490.6912