
Adam J Sullivan
Assistant Professor of Biostatistics
Brown University
anscombe$x2sq <- anscombe$x2^2
mod2a <- lm(y2 ~ x2 + x2sq, data=anscombe)
tidy(mod2a, conf.int=T)[,-c(3,4)]
glance(mod2a)
plot(mod2a,1)
term | estimate | p.value | conf.low | conf.high |
---|---|---|---|---|
(Intercept) | -5.9957343 | 0 | -6.005719 | -5.9857494 |
x2 | 2.7808392 | 0 | 2.778441 | 2.7832375 |
x2sq | -0.1267133 | 0 | -0.126845 | -0.1265816 |
r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual |
---|---|---|---|---|---|---|---|---|---|---|
0.9999995 | 0.9999993 | 0.0016725 | 7378133 | 0 | 3 | 56.47107 | -104.9421 | -103.3506 | 2.24e-05 | 8 |
When fitting a regression model we will take these steps to verify the validity of the model:
\[E[Y|x] = \beta_0 + \beta_1 x + \beta_2 x^2\]
x2
has been transformed to a square term, the \(beta\) values are still linear. plot(mod2,1)
plot(mod2a, 2)
mean(mod2a$residuals)
## [1] 0
\[D_i= \dfrac{\sum_{j=1}^n \left( \hat{y}_j - \hat{y}_{j(i)}\right)^2}{(p+1)\hat{\sigma}^2}\]
\[DFFITS_i=\dfrac{\hat{Y_i} - \hat{Y_{i(i)}}}{\hat{\sigma}_{(i)}\sqrt{h_{ii}}}\]
set.seed(12345)
X = runif(150, .5, 3.5)
beta0 = 1.0
beta1 = 1.5
sigma = 0.7
Y = beta0 + beta1*X + sigma*rnorm(150) # The regular process
# Contaminated data: Four cases
X.suspect1 = 1.5; Y.suspect1 = 3.3
X.suspect2 = 1.5; Y.suspect2 = 9.7
X.suspect3 = 9.0; Y.suspect3 = 14.5
X.suspect4 = 9.3; Y.suspect4 = 0.6
Y.all1 = c(Y, Y.suspect1); X.all1 = c(X, X.suspect1)
Y.all2 = c(Y, Y.suspect2); X.all2 = c(X, X.suspect2)
Y.all3 = c(Y, Y.suspect3); X.all3 = c(X, X.suspect3)
Y.all4 = c(Y, Y.suspect4); X.all4 = c(X, X.suspect4)
out1 <- lm(data=data, Y.all1~X.all1 )
out2 <- lm(data=data, Y.all2~X.all2 )
out3 <- lm(data=data, Y.all3~X.all3 )
out4 <- lm(data=data, Y.all4~X.all4 )
library(olsrr)
ols_plot_cooksd_bar(out1)
ols_plot_dfbetas(out1)
ols_plot_dffits(out1)
library(olsrr)
ols_plot_cooksd_bar(out4)
ols_plot_dfbetas(out4)
ols_plot_dffits(out4)
library(broom)
out4a <- lm(data=data[-151,], Y.all4~X.all4 )
tidy4a <- tidy(out4a, conf.int = T)
tidy4 <- tidy(out4, conf.int = T)
knitr::kable(bind_rows(tidy4, tidy4a)[-c(3,4)])
glance4a <- glance(out4a)
glance4 <- glance(out4)
knitr::kable(bind_rows(glance4, glance4a))
term | estimate | p.value | conf.low | conf.high |
---|---|---|---|---|
(Intercept) | 2.3977868 | 0 | 1.9682861 | 2.827287 |
X.all4 | 0.8313212 | 0 | 0.6518475 | 1.010795 |
(Intercept) | 1.2495210 | 0 | 0.9491648 | 1.549877 |
X.all4 | 1.4092337 | 0 | 1.2773737 | 1.541094 |
r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual |
---|---|---|---|---|---|---|---|---|---|---|
0.3598977 | 0.3556017 | 1.1853180 | 83.7753 | 0 | 2 | -238.9247 | 483.8494 | 492.9013 | 209.34182 | 149 |
0.7508560 | 0.7491726 | 0.7272015 | 446.0339 | 0 | 2 | -164.0513 | 334.1026 | 343.1345 | 78.26566 | 148 |
library(car)
mmps(out4)
mmps(out4a)
x <- dataframe$variable_of)interest
qnt <- quantile(x, probs=c(.25, .75), na.rm = T)
caps <- quantile(x, probs=c(.05, .95), na.rm = T)
H <- 1.5 * IQR(x, na.rm = T)
x[x < (qnt[1] - H)] <- caps[1]
x[x > (qnt[2] + H)] <- caps[2]