BT Question Set P1-T2-20-18: Multivariate regressions

Multiple regression

Question 1: Fama-french

library(tidyverse)
library(broom)
library(gt)

intercept <- .03
intercept_sig <- .01

x1_mu <- .04
x1_sig <- .01
x1_beta <- 0.4

x2_mu <- .03
x2_sig <- .01
x2_beta <- -0.6

x3_mu <- .03
x3_sig <- .01
x3_beta <- -0.3

noise_mu <- 0
noise_sig <- 0 # low value gets low p-value b/c low noise

size <- 96
set.seed(18)

results <- tibble(
  x0 = rnorm(size, intercept, intercept_sig),
  x1 = rnorm(size, x1_mu, x1_sig),
  x2 = rnorm(size, x2_mu, x2_sig),
  x3 = rnorm(size, x3_mu, x3_sig),
  x1_b = rep(x1_beta, size),
  x2_b = rep(x2_beta, size),
  x3_b = rep(x3_beta, size),
  noise = rnorm(size, 0, noise_sig)
)


results1 <- results %>% mutate(
  y = x0 +x1_b * x1 + x2_b * x2 + x3_b * x3 + noise
)

model <- lm(y ~ x1 + x2 + x3, data = results1)
summary(model)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = results1)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0179934 -0.0063017 -0.0002194  0.0069123  0.0252648 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.026900   0.005785   4.650 1.11e-05 ***
## x1           0.501752   0.099805   5.027 2.44e-06 ***
## x2          -0.702932   0.096522  -7.283 1.09e-10 ***
## x3          -0.276534   0.102339  -2.702   0.0082 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.00983 on 92 degrees of freedom
## Multiple R-squared:  0.4619, Adjusted R-squared:  0.4444 
## F-statistic: 26.33 on 3 and 92 DF,  p-value: 2.206e-12
model_tidy <- tidy(model)
model_tidy[2,1] <- "MKT"
model_tidy[3,1] <- "SMB"
model_tidy[4,1] <- "HML"

gt_table_model <- gt(model_tidy)

gt_table_model <- 
  gt_table_model %>% 
  tab_options(
    table.font.size = 14
  ) %>% 
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body()
  ) %>% 
  tab_header(
    title = "Portfolio excess returns regressed against MKT + SMB + HML",
    subtitle ="i.e., Fama-French three-factor model"
  #) %>% tab_source_note(
  #   source_note = md("the source is ... FRED")
  ) %>% cols_label(
    term = "Coefficient",
    estimate = "Estimate",
    std.error = "Std Error",
    statistic = "t-stat",
    p.value = "p value"
  ) %>% fmt_number(
    columns = vars(estimate, std.error, statistic, p.value),
    decimals = 3
  ) %>% fmt_scientific(
    columns = vars(statistic, p.value),
  ) %>% tab_options(
    heading.title.font.size = 14,
    heading.subtitle.font.size = 12
  )

gt_table_model
Portfolio excess returns regressed against MKT + SMB + HML
i.e., Fama-French three-factor model
Coefficient Estimate Std Error t-stat p value
(Intercept) 0.027 0.006 4.65 1.11 × 10−5
MKT 0.502 0.100 5.03 2.44 × 10−6
SMB −0.703 0.097 −7.28 1.09 × 10−10
HML −0.277 0.102 −2.70 8.20 × 10−3

Question 2: House Prices

library(tidyverse)
library(broom)
library(gt)

intercept <- 40
intercept_sig <- .01

x1_mu <- 1200
x1_sig <- 30
x1_beta <- 0.35

x2_mu <- 4.5
x2_sig <- 2
x2_beta <- 10.0

x3_mu <- 15
x3_sig <- 4
x3_beta <- -8.0

noise_mu <- 0
noise_sig <- 20 # low value gets low p-value b/c low noise

size <- 96
set.seed(43)

results <- tibble(
  x0 = rnorm(size, intercept, intercept_sig),
  x1 = rnorm(size, x1_mu, x1_sig),
  x2 = rnorm(size, x2_mu, x2_sig),
  x3 = rnorm(size, x3_mu, x3_sig),
  x1_b = rep(x1_beta, size),
  x2_b = rep(x2_beta, size),
  x3_b = rep(x3_beta, size),
  noise = rnorm(size, 0, noise_sig)
)


results1 <- results %>% mutate(
  y = x0 +x1_b * x1 + x2_b * x2 + x3_b * x3 + noise
)

model <- lm(y ~ x1 + x2 + x3, data = results1)
summary(model)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = results1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.088 -13.508  -1.795  13.819  41.934 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 110.95667   76.98277   1.441    0.153    
## x1            0.29138    0.06283   4.638 1.16e-05 ***
## x2            9.53971    0.85291  11.185  < 2e-16 ***
## x3           -8.06268    0.48275 -16.702  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.28 on 92 degrees of freedom
## Multiple R-squared:  0.8378, Adjusted R-squared:  0.8325 
## F-statistic: 158.4 on 3 and 92 DF,  p-value: < 2.2e-16
model_tidy <- tidy(model)
model_tidy[2,1] <- "SQFEET"
model_tidy[3,1] <- "ROOMS"
model_tidy[4,1] <- "AGE"

gt_table_model <- gt(model_tidy)

gt_table_model <- 
  gt_table_model %>% 
  tab_options(
    table.font.size = 14
  ) %>% 
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body()
  ) %>% 
  tab_header(
    title = "House Price regressed against ft^2 (SQFEET) + ROOMS(#) + AGE(years)",
    subtitle = md("House Price in Thousands **($000)** of dollars")
  #) %>% tab_source_note(
  #   source_note = md("the source is ... FRED")
  ) %>% cols_label(
    term = "Coefficient",
    estimate = "Estimate",
    std.error = "Std Error",
    statistic = "t-stat",
    p.value = "p value"
  ) %>% fmt_number(
    columns = vars(estimate, std.error, statistic, p.value),
    decimals = 3
  ) %>% fmt_scientific(
    columns = vars(p.value),
  ) %>% tab_options(
    heading.title.font.size = 14,
    heading.subtitle.font.size = 12
  )

gt_table_model
House Price regressed against ft^2 (SQFEET) + ROOMS(#) + AGE(years)
House Price in Thousands ($000) of dollars
Coefficient Estimate Std Error t-stat p value
(Intercept) 110.957 76.983 1.441 1.53 × 10−1
SQFEET 0.291 0.063 4.638 1.16 × 10−5
ROOMS 9.540 0.853 11.185 7.64 × 10−19
AGE −8.063 0.483 −16.702 1.33 × 10−29
mean(results1$y) # price
## [1] 386.051
mean(results1$x0) # intercept
## [1] 40.0002
mean(results1$x1) # sqfeet
## [1] 1203.809
mean(results1$x2) # rooms
## [1] 4.548471
mean(results1$x3) # age
## [1] 14.76695

Question 3: Insurance

library(tidyverse)
library(broom)
library(gt)

intercept <- 150
intercept_sig <- 40

# age
x1_mu <- 38
x1_sig <- 7
x1_beta <- 50

# bmi
x2_mu <- 22
x2_sig <- 4
x2_beta <- 100

# smoker
x3_mu <- 15
x3_sig <- 0.5
x3_beta <- 535

# spend
x4_mu <- 500
x4_sig <- 250
x4_beta <- -0.4


noise_mu <- 0
noise_sig <- 300 # low value gets low p-value b/c low noise

size <- 43
set.seed(12)

results <- tibble(
  x0 = rnorm(size, intercept, intercept_sig),
  x1 = round(rnorm(size, x1_mu, x1_sig)),
  x2 = rnorm(size, x2_mu, x2_sig),
  x3 = round(runif(size)-.35), # smoker = 1, non = 0
  x4 = rnorm(size, x4_mu, x4_sig),
  x1_b = rep(x1_beta, size),
  x2_b = rep(x2_beta, size),
  x3_b = rep(x3_beta, size),
  x4_b = rep(x4_beta, size),
  noise = rnorm(size, 0, noise_sig)
)


results1 <- results %>% mutate(
  y = x0 +x1_b * x1 + x2_b * x2 + x3_b * x3 + x4_b * x4 + noise
)

model <- lm(y ~ x1 + x2 + x3 + x4, data = results1)
summary(model)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = results1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -635.96 -169.65   15.59  219.73  543.51 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -324.2133   415.7736  -0.780 0.440348    
## x1            56.9518     7.5517   7.542 4.60e-09 ***
## x2           111.7571    11.9374   9.362 2.06e-11 ***
## x3           454.0799   125.8293   3.609 0.000884 ***
## x4            -0.5485     0.1793  -3.058 0.004064 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 298.3 on 38 degrees of freedom
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8027 
## F-statistic: 43.71 on 4 and 38 DF,  p-value: 1.009e-13
model_tidy <- tidy(model)
model_tidy[2,1] <- "AGE"
model_tidy[3,1] <- "BMI"
model_tidy[4,1] <- "SMOKER"
model_tidy[5,1] <- "CHARITY"

gt_table_model <- gt(model_tidy)

gt_table_model <- 
  gt_table_model %>% 
  tab_options(
    table.font.size = 14
  ) %>% 
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body()
  ) %>% 
  tab_header(
    title = "Medical COST regressed against AGE + BMI + SMOKER(1/0) + CHARITY($)",
    subtitle = md("Simulated dataset")
  ) %>% tab_source_note(
    source_note = md("Residual standard error: 295.2 on 38 degrees of freedom")
  ) %>% tab_source_note(
    source_note = md("Multiple R-squared:  0.8343,  Adjusted R-squared:  0.8168")
  ) %>% tab_source_note(
    source_note = md("F-statistic: 47.82 on 4 and 38 DF,  p-value: 2.486e-14")
  ) %>% cols_label(
    term = "Coefficient",
    estimate = "Estimate",
    std.error = "Std Error",
    statistic = "t-stat",
    p.value = "p value"
  ) %>% fmt_number(
    columns = vars(estimate, std.error, statistic, p.value),
    decimals = 2
  ) %>% fmt_scientific(
    columns = vars(p.value),
  ) %>% tab_options(
    heading.title.font.size = 14,
    heading.subtitle.font.size = 12
  )


gt_table_model
Medical COST regressed against AGE + BMI + SMOKER(1/0) + CHARITY($)
Simulated dataset
Coefficient Estimate Std Error t-stat p value
(Intercept) −324.21 415.77 −0.78 4.40 × 10−1
AGE 56.95 7.55 7.54 4.60 × 10−9
BMI 111.76 11.94 9.36 2.06 × 10−11
SMOKER 454.08 125.83 3.61 8.84 × 10−4
CHARITY −0.55 0.18 −3.06 4.06 × 10−3
Residual standard error: 295.2 on 38 degrees of freedom
Multiple R-squared: 0.8343, Adjusted R-squared: 0.8168
F-statistic: 47.82 on 4 and 38 DF, p-value: 2.486e-14
mean(results1$y) # cost
## [1] 4150.588
mean(results1$x0) # intercept
## [1] 145.2152
mean(results1$x1) # age
## [1] 38.51163
mean(results1$x2) # bmi
## [1] 22.16166
mean(results1$x3) # smoker
## [1] 0.1627907
mean(results1$x4) # charity
## [1] 490.6912
David Harper
David Harper
Founder & CEO of Bionic Turtle

I teach financial risk and enjoy learning data science

Related