BT Question Set P1-T2-20-18: Multivariate regressions

Last updated on Sep 17, 2020 16 min read R

Multiple regression

Question 1: Fama-french

library(tidyverse)
library(broom)
library(gt)

intercept <- .03
intercept_sig <- .01

x1_mu <- .04
x1_sig <- .01
x1_beta <- 0.4

x2_mu <- .03
x2_sig <- .01
x2_beta <- -0.6

x3_mu <- .03
x3_sig <- .01
x3_beta <- -0.3

noise_mu <- 0
noise_sig <- 0 # low value gets low p-value b/c low noise

size <- 96
set.seed(18)

results <- tibble(
  x0 = rnorm(size, intercept, intercept_sig),
  x1 = rnorm(size, x1_mu, x1_sig),
  x2 = rnorm(size, x2_mu, x2_sig),
  x3 = rnorm(size, x3_mu, x3_sig),
  x1_b = rep(x1_beta, size),
  x2_b = rep(x2_beta, size),
  x3_b = rep(x3_beta, size),
  noise = rnorm(size, 0, noise_sig)
)


results1 <- results %>% mutate(
  y = x0 +x1_b * x1 + x2_b * x2 + x3_b * x3 + noise
)

model <- lm(y ~ x1 + x2 + x3, data = results1)
summary(model)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = results1)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0179934 -0.0063017 -0.0002194  0.0069123  0.0252648 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.026900   0.005785   4.650 1.11e-05 ***
## x1           0.501752   0.099805   5.027 2.44e-06 ***
## x2          -0.702932   0.096522  -7.283 1.09e-10 ***
## x3          -0.276534   0.102339  -2.702   0.0082 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.00983 on 92 degrees of freedom
## Multiple R-squared:  0.4619, Adjusted R-squared:  0.4444 
## F-statistic: 26.33 on 3 and 92 DF,  p-value: 2.206e-12

model_tidy <- tidy(model)
model_tidy[2,1] <- "MKT"
model_tidy[3,1] <- "SMB"
model_tidy[4,1] <- "HML"

gt_table_model <- gt(model_tidy)

gt_table_model <- 
  gt_table_model %>% 
  tab_options(
    table.font.size = 14
  ) %>% 
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body()
  ) %>% 
  tab_header(
    title = "Portfolio excess returns regressed against MKT + SMB + HML",
    subtitle ="i.e., Fama-French three-factor model"
  #) %>% tab_source_note(
  #   source_note = md("the source is ... FRED")
  ) %>% cols_label(
    term = "Coefficient",
    estimate = "Estimate",
    std.error = "Std Error",
    statistic = "t-stat",
    p.value = "p value"
  ) %>% fmt_number(
    columns = vars(estimate, std.error, statistic, p.value),
    decimals = 3
  ) %>% fmt_scientific(
    columns = vars(statistic, p.value),
  ) %>% tab_options(
    heading.title.font.size = 14,
    heading.subtitle.font.size = 12
  )

gt_table_model

Portfolio excess returns regressed against MKT + SMB + HML
i.e., Fama-French three-factor model
Coefficient	Estimate	Std Error	t-stat	p value
(Intercept)	0.027	0.006	4.65	1.11 × 10⁻⁵
MKT	0.502	0.100	5.03	2.44 × 10⁻⁶
SMB	−0.703	0.097	−7.28	1.09 × 10⁻¹⁰
HML	−0.277	0.102	−2.70	8.20 × 10⁻³

Question 2: House Prices

library(tidyverse)
library(broom)
library(gt)

intercept <- 40
intercept_sig <- .01

x1_mu <- 1200
x1_sig <- 30
x1_beta <- 0.35

x2_mu <- 4.5
x2_sig <- 2
x2_beta <- 10.0

x3_mu <- 15
x3_sig <- 4
x3_beta <- -8.0

noise_mu <- 0
noise_sig <- 20 # low value gets low p-value b/c low noise

size <- 96
set.seed(43)

results <- tibble(
  x0 = rnorm(size, intercept, intercept_sig),
  x1 = rnorm(size, x1_mu, x1_sig),
  x2 = rnorm(size, x2_mu, x2_sig),
  x3 = rnorm(size, x3_mu, x3_sig),
  x1_b = rep(x1_beta, size),
  x2_b = rep(x2_beta, size),
  x3_b = rep(x3_beta, size),
  noise = rnorm(size, 0, noise_sig)
)


results1 <- results %>% mutate(
  y = x0 +x1_b * x1 + x2_b * x2 + x3_b * x3 + noise
)

model <- lm(y ~ x1 + x2 + x3, data = results1)
summary(model)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = results1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.088 -13.508  -1.795  13.819  41.934 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 110.95667   76.98277   1.441    0.153    
## x1            0.29138    0.06283   4.638 1.16e-05 ***
## x2            9.53971    0.85291  11.185  < 2e-16 ***
## x3           -8.06268    0.48275 -16.702  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.28 on 92 degrees of freedom
## Multiple R-squared:  0.8378, Adjusted R-squared:  0.8325 
## F-statistic: 158.4 on 3 and 92 DF,  p-value: < 2.2e-16

model_tidy <- tidy(model)
model_tidy[2,1] <- "SQFEET"
model_tidy[3,1] <- "ROOMS"
model_tidy[4,1] <- "AGE"

gt_table_model <- gt(model_tidy)

gt_table_model <- 
  gt_table_model %>% 
  tab_options(
    table.font.size = 14
  ) %>% 
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body()
  ) %>% 
  tab_header(
    title = "House Price regressed against ft^2 (SQFEET) + ROOMS(#) + AGE(years)",
    subtitle = md("House Price in Thousands **($000)** of dollars")
  #) %>% tab_source_note(
  #   source_note = md("the source is ... FRED")
  ) %>% cols_label(
    term = "Coefficient",
    estimate = "Estimate",
    std.error = "Std Error",
    statistic = "t-stat",
    p.value = "p value"
  ) %>% fmt_number(
    columns = vars(estimate, std.error, statistic, p.value),
    decimals = 3
  ) %>% fmt_scientific(
    columns = vars(p.value),
  ) %>% tab_options(
    heading.title.font.size = 14,
    heading.subtitle.font.size = 12
  )

gt_table_model

House Price regressed against ft^2 (SQFEET) + ROOMS(#) + AGE(years)
House Price in Thousands ($000) of dollars
Coefficient	Estimate	Std Error	t-stat	p value
(Intercept)	110.957	76.983	1.441	1.53 × 10⁻¹
SQFEET	0.291	0.063	4.638	1.16 × 10⁻⁵
ROOMS	9.540	0.853	11.185	7.64 × 10⁻¹⁹
AGE	−8.063	0.483	−16.702	1.33 × 10⁻²⁹

mean(results1$y) # price

## [1] 386.051

mean(results1$x0) # intercept

## [1] 40.0002

mean(results1$x1) # sqfeet

## [1] 1203.809

mean(results1$x2) # rooms

## [1] 4.548471

mean(results1$x3) # age

## [1] 14.76695

Question 3: Insurance

library(tidyverse)
library(broom)
library(gt)

intercept <- 150
intercept_sig <- 40

# age
x1_mu <- 38
x1_sig <- 7
x1_beta <- 50

# bmi
x2_mu <- 22
x2_sig <- 4
x2_beta <- 100

# smoker
x3_mu <- 15
x3_sig <- 0.5
x3_beta <- 535

# spend
x4_mu <- 500
x4_sig <- 250
x4_beta <- -0.4


noise_mu <- 0
noise_sig <- 300 # low value gets low p-value b/c low noise

size <- 43
set.seed(12)

results <- tibble(
  x0 = rnorm(size, intercept, intercept_sig),
  x1 = round(rnorm(size, x1_mu, x1_sig)),
  x2 = rnorm(size, x2_mu, x2_sig),
  x3 = round(runif(size)-.35), # smoker = 1, non = 0
  x4 = rnorm(size, x4_mu, x4_sig),
  x1_b = rep(x1_beta, size),
  x2_b = rep(x2_beta, size),
  x3_b = rep(x3_beta, size),
  x4_b = rep(x4_beta, size),
  noise = rnorm(size, 0, noise_sig)
)


results1 <- results %>% mutate(
  y = x0 +x1_b * x1 + x2_b * x2 + x3_b * x3 + x4_b * x4 + noise
)

model <- lm(y ~ x1 + x2 + x3 + x4, data = results1)
summary(model)

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = results1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -635.96 -169.65   15.59  219.73  543.51 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -324.2133   415.7736  -0.780 0.440348    
## x1            56.9518     7.5517   7.542 4.60e-09 ***
## x2           111.7571    11.9374   9.362 2.06e-11 ***
## x3           454.0799   125.8293   3.609 0.000884 ***
## x4            -0.5485     0.1793  -3.058 0.004064 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 298.3 on 38 degrees of freedom
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8027 
## F-statistic: 43.71 on 4 and 38 DF,  p-value: 1.009e-13

model_tidy <- tidy(model)
model_tidy[2,1] <- "AGE"
model_tidy[3,1] <- "BMI"
model_tidy[4,1] <- "SMOKER"
model_tidy[5,1] <- "CHARITY"

gt_table_model <- gt(model_tidy)

gt_table_model <- 
  gt_table_model %>% 
  tab_options(
    table.font.size = 14
  ) %>% 
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body()
  ) %>% 
  tab_header(
    title = "Medical COST regressed against AGE + BMI + SMOKER(1/0) + CHARITY($)",
    subtitle = md("Simulated dataset")
  ) %>% tab_source_note(
    source_note = md("Residual standard error: 295.2 on 38 degrees of freedom")
  ) %>% tab_source_note(
    source_note = md("Multiple R-squared:  0.8343,  Adjusted R-squared:  0.8168")
  ) %>% tab_source_note(
    source_note = md("F-statistic: 47.82 on 4 and 38 DF,  p-value: 2.486e-14")
  ) %>% cols_label(
    term = "Coefficient",
    estimate = "Estimate",
    std.error = "Std Error",
    statistic = "t-stat",
    p.value = "p value"
  ) %>% fmt_number(
    columns = vars(estimate, std.error, statistic, p.value),
    decimals = 2
  ) %>% fmt_scientific(
    columns = vars(p.value),
  ) %>% tab_options(
    heading.title.font.size = 14,
    heading.subtitle.font.size = 12
  )


gt_table_model

Medical COST regressed against AGE + BMI + SMOKER(1/0) + CHARITY($)
Simulated dataset
Coefficient	Estimate	Std Error	t-stat	p value
(Intercept)	−324.21	415.77	−0.78	4.40 × 10⁻¹
AGE	56.95	7.55	7.54	4.60 × 10⁻⁹
BMI	111.76	11.94	9.36	2.06 × 10⁻¹¹
SMOKER	454.08	125.83	3.61	8.84 × 10⁻⁴
CHARITY	−0.55	0.18	−3.06	4.06 × 10⁻³
Residual standard error: 295.2 on 38 degrees of freedom
Multiple R-squared: 0.8343, Adjusted R-squared: 0.8168
F-statistic: 47.82 on 4 and 38 DF, p-value: 2.486e-14

mean(results1$y) # cost

## [1] 4150.588

mean(results1$x0) # intercept

## [1] 145.2152

mean(results1$x1) # age

## [1] 38.51163

mean(results1$x2) # bmi

## [1] 22.16166

mean(results1$x3) # smoker

## [1] 0.1627907

mean(results1$x4) # charity

## [1] 490.6912

David Harper

Founder & CEO of Bionic Turtle

I teach financial risk and enjoy learning data science

BT Question Set P1-T2-20-18: Multivariate regressions

Multiple regression

Question 1: Fama-french

Question 2: House Prices

Question 3: Insurance

David Harper

Founder & CEO of Bionic Turtle

Related