Exercise 1 (longley Macroeconomic Data)

The built-in dataset longley contains macroeconomic data for predicting employment. We will attempt to model the Employed variable.

View(longley)
?longley

(a) What is the largest correlation between any pair of predictors in the dataset?

round(cor(longley), 3)
##              GNP.deflator   GNP Unemployed Armed.Forces Population  Year
## GNP.deflator        1.000 0.992      0.621        0.465      0.979 0.991
## GNP                 0.992 1.000      0.604        0.446      0.991 0.995
## Unemployed          0.621 0.604      1.000       -0.177      0.687 0.668
## Armed.Forces        0.465 0.446     -0.177        1.000      0.364 0.417
## Population          0.979 0.991      0.687        0.364      1.000 0.994
## Year                0.991 0.995      0.668        0.417      0.994 1.000
## Employed            0.971 0.984      0.502        0.457      0.960 0.971
##              Employed
## GNP.deflator    0.971
## GNP             0.984
## Unemployed      0.502
## Armed.Forces    0.457
## Population      0.960
## Year            0.971
## Employed        1.000

\(\color{blue}{\text{The largest correlation between any pair of predictors is between $Year$ and $GNP$, which is $0.995$.}}\)
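\(\color{blue}{\text{As a quick cross-check, the same maximum can be located programmatically by dropping the response column and masking the diagonal (a small sketch):}}\)

# Drop the response, zero out the diagonal, and locate the largest pairwise correlation
pred_cor = cor(subset(longley, select = -Employed))
diag(pred_cor) = 0
which(pred_cor == max(pred_cor), arr.ind = TRUE)
max(pred_cor)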

(b) Fit a model with Employed as the response and the remaining variables as predictors. Calculate and report the variance inflation factor (VIF) for each of the predictors. Which variable has the largest VIF? Do any of the VIFs suggest multicollinearity?

employed_model_add <- lm(Employed ~ ., data=longley)
summary(employed_model_add)
## 
## Call:
## lm(formula = Employed ~ ., data = longley)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4101 -0.1577 -0.0282  0.1016  0.4554 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -3.48e+03   8.90e+02   -3.91  0.00356 ** 
## GNP.deflator  1.51e-02   8.49e-02    0.18  0.86314    
## GNP          -3.58e-02   3.35e-02   -1.07  0.31268    
## Unemployed   -2.02e-02   4.88e-03   -4.14  0.00254 ** 
## Armed.Forces -1.03e-02   2.14e-03   -4.82  0.00094 ***
## Population   -5.11e-02   2.26e-01   -0.23  0.82621    
## Year          1.83e+00   4.55e-01    4.02  0.00304 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.305 on 9 degrees of freedom
## Multiple R-squared:  0.995,  Adjusted R-squared:  0.992 
## F-statistic:  330 on 6 and 9 DF,  p-value: 4.98e-10
library(faraway)
vif(employed_model_add)
## GNP.deflator          GNP   Unemployed Armed.Forces   Population         Year 
##      135.532     1788.513       33.619        3.589      399.151      758.981

\(\color{blue}{\text{The VIFs for all of the predictors other than $Armed.Forces$ are quite high, so there is likely multicollinearity.}}\)

vif(employed_model_add)[which.max(vif(employed_model_add))]
##  GNP 
## 1789

\(\color{blue}{\text{GNP has the largest VIF of 1788.5135.}}\)
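\(\color{blue}{\text{A quick way to see this is to flag every predictor whose VIF exceeds the common rule-of-thumb threshold of $5$:}}\)

# Flag predictors whose VIF exceeds the usual threshold of 5
employed_vifs = vif(employed_model_add)
employed_vifs[employed_vifs > 5]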

(c) What proportion of the observed variation in Population is explained by a linear relationship with the other predictors?

population_model_no_emp <- lm(Population ~ . - Employed, data=longley)
summary(population_model_no_emp)$r.squared
## [1] 0.9975

\(\color{blue}{\text{99.7495% of the observed variation in $Population$ is explained by a linear relationship with the other predictors.}}\)
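\(\color{blue}{\text{This ties back to the $Population$ VIF reported in (b), since $VIF_j = 1 / (1 - R_j^2)$; a quick check:}}\)

# The Population VIF is 1 / (1 - R^2) from the regression above
# (should agree with the 399.151 reported by vif() in part (b))
1 / (1 - summary(population_model_no_emp)$r.squared)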

(d) Calculate the partial correlation coefficient for Population and Employed with the effects of the other predictors removed.

employed_model_no_pop <- lm(Employed ~ . - Population, data = longley)
cor(resid(population_model_no_emp), resid(employed_model_no_pop))
## [1] -0.07514

\(\color{blue}{\text{The partial correlation coefficient for $Population$ and $Employed$ is $-0.0751$.}}\)
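\(\color{blue}{\text{As a sanity check, the same value can be recovered from the $t$ statistic of $Population$ in the full additive model, using $r = t / \sqrt{t^2 + df}$:}}\)

# Partial correlation from the t statistic of Population in the full additive model
t_pop = summary(employed_model_add)$coefficients["Population", "t value"]
t_pop / sqrt(t_pop ^ 2 + df.residual(employed_model_add))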

(e) Fit a new model with Employed as the response and the predictors from the model in (b) that were significant. (Use \(\alpha = 0.05\).) Calculate and report the variance inflation factor for each of the predictors. Which variable has the largest VIF? Do any of the VIFs suggest multicollinearity?

employed_model_small <- lm(Employed ~  Unemployed + Armed.Forces + Year, data = longley)
vif(employed_model_small)
##   Unemployed Armed.Forces         Year 
##        3.318        2.223        3.891

\(\color{blue}{\text{$Year$ has the largest VIF. None of the VIF values are high enough to suggest multicollinearity.}}\)

(f) Use an \(F\)-test to compare the models in parts (b) and (e). Report the test statistic, the p-value, the decision at \(\alpha = 0.05\), and which model you prefer.

anova(employed_model_small, employed_model_add)
## Analysis of Variance Table
## 
## Model 1: Employed ~ Unemployed + Armed.Forces + Year
## Model 2: Employed ~ GNP.deflator + GNP + Unemployed + Armed.Forces + Population + 
##     Year
##   Res.Df   RSS Df Sum of Sq    F Pr(>F)
## 1     12 1.323                         
## 2      9 0.836  3     0.487 1.75   0.23

\(\color{blue}{\text{The test statistic is $F = 1.75$ with a p-value of $0.23$. Since $0.23 > 0.05$, we fail to reject the null hypothesis, so we prefer the smaller model from part (e).}}\)
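\(\color{blue}{\text{The reported values can also be pulled from the comparison table programmatically if needed:}}\)

# Extract the F statistic and p-value from the model comparison
f_test = anova(employed_model_small, employed_model_add)
f_test[2, "F"]
f_test[2, "Pr(>F)"]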

(g) Check the assumptions of the model chosen in part (f). Do any assumptions appear to be violated?

plot_fitted_resid = function(model, pointcol = "dodgerblue", linecol = "darkorange") {
  plot(fitted(model), resid(model), 
       col = pointcol, pch = 20, cex = 1.5,
       xlab = "Fitted", ylab = "Residuals", main = "Fitted vs. Residuals")
       
  abline(h = 0, col = linecol, lwd = 2)
}

plot_qq = function(model, pointcol = "dodgerblue", linecol = "darkorange") {
  qqnorm(resid(model), col = pointcol, pch = 20, cex = 1.5)
  qqline(resid(model), col = linecol, lwd = 2)
}
par(mfrow = c(1, 2))
plot_fitted_resid(employed_model_small)
plot_qq(employed_model_small)

\(\color{blue}{\text{It does not appear that the assumptions are clearly violated.}}\) \(\color{blue}{\text{Given the small size of the dataset, it is hard to tell. There are a couple of outliers in the Fitted vs. Residuals plot that should be scrutinized.}}\) \(\color{blue}{\text{The tails of the Normal Q-Q plot drift slightly off the line, so normality could be questioned; however, the deviation is small and involves only a few points.}}\)
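\(\color{blue}{\text{As a numeric complement to the plots, the Breusch-Pagan and Shapiro-Wilk tests can be run on the chosen model (bptest comes from the lmtest package):}}\)

# Formal checks of the constant-variance and normality assumptions
library(lmtest)
bptest(employed_model_small)
shapiro.test(resid(employed_model_small))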


Exercise 2 (Credit Data)

For this exercise, use the Credit data from the ISLR package. Use the following code to remove the ID variable which is not useful for modeling.

library(ISLR)
data(Credit)
Credit = subset(Credit, select = -c(ID))

Use ?Credit to learn about this dataset.

#?Credit

(a) Find a “good” model for balance using the available predictors. Use any methods seen in class except transformations of the response. The model should: reach a LOOCV-RMSE below \(135\), reach an adjusted \(R^2\) above \(0.90\), fail to reject the Breusch-Pagan test with \(\alpha = 0.01\), and use fewer than \(10\) \(\beta\) parameters.

Store your model in a variable called mod_a. Run the two given chunks to verify your model meets the requested criteria. If you cannot find a model that meets all criteria, partial credit will be given for meeting at least some of the criteria.

library(lmtest)

get_bp_decision = function(model, alpha) {
  decide = unname(bptest(model)$p.value < alpha)
  ifelse(decide, "Reject", "Fail to Reject")
}

get_sw_decision = function(model, alpha) {
  decide = unname(shapiro.test(resid(model))$p.value < alpha)
  ifelse(decide, "Reject", "Fail to Reject")
}

get_num_params = function(model) {
  length(coef(model))
}

get_loocv_rmse = function(model) {
  sqrt(mean((resid(model) / (1 - hatvalues(model))) ^ 2))
}

get_adj_r2 = function(model) {
  summary(model)$adj.r.squared
}

\(\color{blue}{\text{Below is a helper function for checking all of the conditions required in the problem.}}\)

check_all_conditions =  function(model, model_name, test_type = "bp", loocv_limit = 135, adj_r2_limit = 0.90, max_param = 10) {
  df_conditions = data.frame(loocv = double(), loocv_accept = logical(),
                             adj_r2 = double(), adj_r2_accept = logical(),
                             test_decision=character(), test_decision_accept = logical(),
                             num_params = integer(), num_params_accept = logical(),
                             overall_accept = logical())
  df_conditions[model_name, "loocv"] = get_loocv_rmse(model)
  df_conditions[model_name, "loocv_accept"] = df_conditions[model_name, "loocv"] < loocv_limit
  df_conditions[model_name, "adj_r2"] = get_adj_r2(model)
  df_conditions[model_name, "adj_r2_accept"] = df_conditions[model_name, "adj_r2"] > adj_r2_limit
  if(test_type == "sw") {
    df_conditions["test_decision"] = get_sw_decision(model, alpha = 0.01)
    df_conditions[model_name, "test_decision_accept"] = grepl("Fail", df_conditions["test_decision"], fixed=TRUE)
  }else{
    df_conditions["test_decision"] = get_bp_decision(model, alpha = 0.01)
    df_conditions[model_name, "test_decision_accept"] = grepl("Fail", df_conditions["test_decision"], fixed=TRUE)
  }
  df_conditions[model_name, "num_params"] = get_num_params(model)
  df_conditions[model_name, "num_params_accept"] = get_num_params(model) < max_param
  df_conditions[model_name, "overall_accept"] = all(df_conditions[model_name, "loocv_accept"], df_conditions[model_name, "adj_r2_accept"], df_conditions[model_name, "test_decision_accept"], df_conditions[model_name, "num_params_accept"], na.rm = FALSE)
  return(df_conditions)
}
balance_add = lm(Balance ~ . , data = Credit)
check_all_conditions(balance_add, "balance_add", "bp")
##             loocv loocv_accept adj_r2 adj_r2_accept test_decision
## balance_add 100.4         TRUE 0.9538          TRUE        Reject
##             test_decision_accept num_params num_params_accept overall_accept
## balance_add                FALSE         12             FALSE          FALSE

\(\color{blue}{\text{Obviously, there are too many parameters, so we cannot use the full additive model.}}\) \(\color{blue}{\text{The additive model is also rejected by the Breusch-Pagan test with an $\alpha$ of $0.01$. Let's look for collinearity.}}\)

pairs(Credit, col = "dodgerblue")