# BT Question Set P1-T2-20-18: Multivariate regressions

## Multiple regression

### Question 1: Fama-french

``````library(tidyverse)
library(broom)
library(gt)

intercept <- .03
intercept_sig <- .01

x1_mu <- .04
x1_sig <- .01
x1_beta <- 0.4

x2_mu <- .03
x2_sig <- .01
x2_beta <- -0.6

x3_mu <- .03
x3_sig <- .01
x3_beta <- -0.3

noise_mu <- 0
noise_sig <- 0 # low value gets low p-value b/c low noise

size <- 96
set.seed(18)

results <- tibble(
x0 = rnorm(size, intercept, intercept_sig),
x1 = rnorm(size, x1_mu, x1_sig),
x2 = rnorm(size, x2_mu, x2_sig),
x3 = rnorm(size, x3_mu, x3_sig),
x1_b = rep(x1_beta, size),
x2_b = rep(x2_beta, size),
x3_b = rep(x3_beta, size),
noise = rnorm(size, 0, noise_sig)
)

results1 <- results %>% mutate(
y = x0 +x1_b * x1 + x2_b * x2 + x3_b * x3 + noise
)

model <- lm(y ~ x1 + x2 + x3, data = results1)
summary(model)``````
``````##
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = results1)
##
## Residuals:
##        Min         1Q     Median         3Q        Max
## -0.0179934 -0.0063017 -0.0002194  0.0069123  0.0252648
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.026900   0.005785   4.650 1.11e-05 ***
## x1           0.501752   0.099805   5.027 2.44e-06 ***
## x2          -0.702932   0.096522  -7.283 1.09e-10 ***
## x3          -0.276534   0.102339  -2.702   0.0082 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.00983 on 92 degrees of freedom
## Multiple R-squared:  0.4619, Adjusted R-squared:  0.4444
## F-statistic: 26.33 on 3 and 92 DF,  p-value: 2.206e-12``````
``````model_tidy <- tidy(model)
model_tidy[2,1] <- "MKT"
model_tidy[3,1] <- "SMB"
model_tidy[4,1] <- "HML"

gt_table_model <- gt(model_tidy)

gt_table_model <-
gt_table_model %>%
tab_options(
table.font.size = 14
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body()
) %>%
tab_header(
title = "Portfolio excess returns regressed against MKT + SMB + HML",
subtitle ="i.e., Fama-French three-factor model"
#) %>% tab_source_note(
#   source_note = md("the source is ... FRED")
) %>% cols_label(
term = "Coefficient",
estimate = "Estimate",
std.error = "Std Error",
statistic = "t-stat",
p.value = "p value"
) %>% fmt_number(
columns = vars(estimate, std.error, statistic, p.value),
decimals = 3
) %>% fmt_scientific(
columns = vars(statistic, p.value),
) %>% tab_options(
heading.title.font.size = 14,
heading.subtitle.font.size = 12
)

gt_table_model``````
Portfolio excess returns regressed against MKT + SMB + HML
i.e., Fama-French three-factor model
Coefficient Estimate Std Error t-stat p value
(Intercept) 0.027 0.006 4.65 1.11 × 10−5
MKT 0.502 0.100 5.03 2.44 × 10−6
SMB −0.703 0.097 −7.28 1.09 × 10−10
HML −0.277 0.102 −2.70 8.20 × 10−3

### Question 2: House Prices

``````library(tidyverse)
library(broom)
library(gt)

intercept <- 40
intercept_sig <- .01

x1_mu <- 1200
x1_sig <- 30
x1_beta <- 0.35

x2_mu <- 4.5
x2_sig <- 2
x2_beta <- 10.0

x3_mu <- 15
x3_sig <- 4
x3_beta <- -8.0

noise_mu <- 0
noise_sig <- 20 # low value gets low p-value b/c low noise

size <- 96
set.seed(43)

results <- tibble(
x0 = rnorm(size, intercept, intercept_sig),
x1 = rnorm(size, x1_mu, x1_sig),
x2 = rnorm(size, x2_mu, x2_sig),
x3 = rnorm(size, x3_mu, x3_sig),
x1_b = rep(x1_beta, size),
x2_b = rep(x2_beta, size),
x3_b = rep(x3_beta, size),
noise = rnorm(size, 0, noise_sig)
)

results1 <- results %>% mutate(
y = x0 +x1_b * x1 + x2_b * x2 + x3_b * x3 + noise
)

model <- lm(y ~ x1 + x2 + x3, data = results1)
summary(model)``````
``````##
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = results1)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -40.088 -13.508  -1.795  13.819  41.934
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 110.95667   76.98277   1.441    0.153
## x1            0.29138    0.06283   4.638 1.16e-05 ***
## x2            9.53971    0.85291  11.185  < 2e-16 ***
## x3           -8.06268    0.48275 -16.702  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.28 on 92 degrees of freedom
## Multiple R-squared:  0.8378, Adjusted R-squared:  0.8325
## F-statistic: 158.4 on 3 and 92 DF,  p-value: < 2.2e-16``````
``````model_tidy <- tidy(model)
model_tidy[2,1] <- "SQFEET"
model_tidy[3,1] <- "ROOMS"
model_tidy[4,1] <- "AGE"

gt_table_model <- gt(model_tidy)

gt_table_model <-
gt_table_model %>%
tab_options(
table.font.size = 14
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body()
) %>%
tab_header(
title = "House Price regressed against ft^2 (SQFEET) + ROOMS(#) + AGE(years)",
subtitle = md("House Price in Thousands **(\$000)** of dollars")
#) %>% tab_source_note(
#   source_note = md("the source is ... FRED")
) %>% cols_label(
term = "Coefficient",
estimate = "Estimate",
std.error = "Std Error",
statistic = "t-stat",
p.value = "p value"
) %>% fmt_number(
columns = vars(estimate, std.error, statistic, p.value),
decimals = 3
) %>% fmt_scientific(
columns = vars(p.value),
) %>% tab_options(
heading.title.font.size = 14,
heading.subtitle.font.size = 12
)

gt_table_model``````
House Price regressed against ft^2 (SQFEET) + ROOMS(#) + AGE(years)
House Price in Thousands (\$000) of dollars
Coefficient Estimate Std Error t-stat p value
(Intercept) 110.957 76.983 1.441 1.53 × 10−1
SQFEET 0.291 0.063 4.638 1.16 × 10−5
ROOMS 9.540 0.853 11.185 7.64 × 10−19
AGE −8.063 0.483 −16.702 1.33 × 10−29
``mean(results1\$y) # price``
``## [1] 386.051``
``mean(results1\$x0) # intercept``
``## [1] 40.0002``
``mean(results1\$x1) # sqfeet``
``## [1] 1203.809``
``mean(results1\$x2) # rooms``
``## [1] 4.548471``
``mean(results1\$x3) # age``
``## [1] 14.76695``

### Question 3: Insurance

``````library(tidyverse)
library(broom)
library(gt)

intercept <- 150
intercept_sig <- 40

# age
x1_mu <- 38
x1_sig <- 7
x1_beta <- 50

# bmi
x2_mu <- 22
x2_sig <- 4
x2_beta <- 100

# smoker
x3_mu <- 15
x3_sig <- 0.5
x3_beta <- 535

# spend
x4_mu <- 500
x4_sig <- 250
x4_beta <- -0.4

noise_mu <- 0
noise_sig <- 300 # low value gets low p-value b/c low noise

size <- 43
set.seed(12)

results <- tibble(
x0 = rnorm(size, intercept, intercept_sig),
x1 = round(rnorm(size, x1_mu, x1_sig)),
x2 = rnorm(size, x2_mu, x2_sig),
x3 = round(runif(size)-.35), # smoker = 1, non = 0
x4 = rnorm(size, x4_mu, x4_sig),
x1_b = rep(x1_beta, size),
x2_b = rep(x2_beta, size),
x3_b = rep(x3_beta, size),
x4_b = rep(x4_beta, size),
noise = rnorm(size, 0, noise_sig)
)

results1 <- results %>% mutate(
y = x0 +x1_b * x1 + x2_b * x2 + x3_b * x3 + x4_b * x4 + noise
)

model <- lm(y ~ x1 + x2 + x3 + x4, data = results1)
summary(model)``````
``````##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = results1)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -635.96 -169.65   15.59  219.73  543.51
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -324.2133   415.7736  -0.780 0.440348
## x1            56.9518     7.5517   7.542 4.60e-09 ***
## x2           111.7571    11.9374   9.362 2.06e-11 ***
## x3           454.0799   125.8293   3.609 0.000884 ***
## x4            -0.5485     0.1793  -3.058 0.004064 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 298.3 on 38 degrees of freedom
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8027
## F-statistic: 43.71 on 4 and 38 DF,  p-value: 1.009e-13``````
``````model_tidy <- tidy(model)
model_tidy[2,1] <- "AGE"
model_tidy[3,1] <- "BMI"
model_tidy[4,1] <- "SMOKER"
model_tidy[5,1] <- "CHARITY"

gt_table_model <- gt(model_tidy)

gt_table_model <-
gt_table_model %>%
tab_options(
table.font.size = 14
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body()
) %>%
tab_header(
title = "Medical COST regressed against AGE + BMI + SMOKER(1/0) + CHARITY(\$)",
subtitle = md("Simulated dataset")
) %>% tab_source_note(
source_note = md("Residual standard error: 295.2 on 38 degrees of freedom")
) %>% tab_source_note(
source_note = md("Multiple R-squared:  0.8343,  Adjusted R-squared:  0.8168")
) %>% tab_source_note(
source_note = md("F-statistic: 47.82 on 4 and 38 DF,  p-value: 2.486e-14")
) %>% cols_label(
term = "Coefficient",
estimate = "Estimate",
std.error = "Std Error",
statistic = "t-stat",
p.value = "p value"
) %>% fmt_number(
columns = vars(estimate, std.error, statistic, p.value),
decimals = 2
) %>% fmt_scientific(
columns = vars(p.value),
) %>% tab_options(
heading.title.font.size = 14,
heading.subtitle.font.size = 12
)

gt_table_model``````
Medical COST regressed against AGE + BMI + SMOKER(1/0) + CHARITY(\$)
Simulated dataset
Coefficient Estimate Std Error t-stat p value
(Intercept) −324.21 415.77 −0.78 4.40 × 10−1
AGE 56.95 7.55 7.54 4.60 × 10−9
BMI 111.76 11.94 9.36 2.06 × 10−11
SMOKER 454.08 125.83 3.61 8.84 × 10−4
CHARITY −0.55 0.18 −3.06 4.06 × 10−3
Residual standard error: 295.2 on 38 degrees of freedom
Multiple R-squared: 0.8343, Adjusted R-squared: 0.8168
F-statistic: 47.82 on 4 and 38 DF, p-value: 2.486e-14
``mean(results1\$y) # cost``
``## [1] 4150.588``
``mean(results1\$x0) # intercept``
``## [1] 145.2152``
``mean(results1\$x1) # age``
``## [1] 38.51163``
``mean(results1\$x2) # bmi``
``## [1] 22.16166``
``mean(results1\$x3) # smoker``
``## [1] 0.1627907``
``mean(results1\$x4) # charity``
``## [1] 490.6912``
##### David Harper
###### Founder & CEO of Bionic Turtle

I teach financial risk and enjoy learning data science