
Adam J Sullivan
Assistant Professor of Biostatistics
Brown University
library(readr)
ree <- read_csv("ree.csv")
ree
## # A tibble: 13 x 2
## CF Healthy
## <dbl> <dbl>
## 1 1153 996
## 2 1132 1080
## 3 1165 1182
## 4 1460 1452
## 5 1634 1162
## 6 1493 1619
## 7 1358 1140
## 8 1453 1123
## 9 1185 1113
## 10 1824 1463
## 11 1793 1632
## 12 1930 1614
## 13 2075 1836
BDSA
PackageSIGN.test(x ,y, md=0, alternative = "two.sidesd",
conf.level=0.95)
x
is a vector of valuesy
is an optional vector of values.md
is median and defaults to 0. alternative
is way to specific type of test.conf.level
specifies \(1-\alpha\).# install.packages("devtools")
# devtools::install_github('alanarnholt/BSDA')
library(BSDA)
attach(ree)
SIGN.test(CF, Healthy)
detach()
##
## Dependent-samples Sign-Test
##
## data: CF and Healthy
## S = 10, p-value = 0.02
## alternative hypothesis: true median difference is not equal to 0
## 95 percent confidence interval:
## 25.4 324.5
## sample estimates:
## median of x-y
## 161
##
## Achieved and Interpolated Confidence Intervals:
##
## Conf.Level L.E.pt U.E.pt
## Lower Achieved CI 0.908 52.0 316
## Interpolated CI 0.950 25.4 324
## Upper Achieved CI 0.978 8.0 330
library(tidyverse)
ree <- ree %>%
mutate(diff = CF - Healthy)
ree
## # A tibble: 13 x 3
## CF Healthy diff
## <dbl> <dbl> <dbl>
## 1 1153 996 157
## 2 1132 1080 52
## 3 1165 1182 -17
## 4 1460 1452 8
## 5 1634 1162 472
## 6 1493 1619 -126
## 7 1358 1140 218
## 8 1453 1123 330
## 9 1185 1113 72
## 10 1824 1463 361
## 11 1793 1632 161
## 12 1930 1614 316
## 13 2075 1836 239
binom.test(2,13)
##
## Exact binomial test
##
## data: 2 and 13
## number of successes = 2, number of trials = 10, p-value = 0.02
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
## 0.0192 0.4545
## sample estimates:
## probability of success
## 0.154
8
-17
52
-76
Subtraction | Positive Ranks | Negative Ranks |
---|---|---|
8 | 1 | |
-17 | -2 | |
52 | 3 | |
-76 | -4 | |
Sum | 4 | -6 |
\[ z =\dfrac{W_{smaller}- \dfrac{n(n+1)}{4}}{\sqrt{\dfrac{n(n+1)(2n+1)}{24}-t^3-\dfrac{t}{48}}}\]
attach(ree)
wilcox.test(CF, Healthy, paired=T)
detach(ree)
##
## Wilcoxon signed rank test
##
## data: CF and Healthy
## V = 80, p-value = 0.005
## alternative hypothesis: true location shift is not equal to 0
\[ z = \dfrac{W_s-\dfrac{n_s(n_s+n_L + 1)}{2}}{\sqrt{\dfrac{n_sn_L(n_s+n_L+1)}{12}}}\]
mtcars
library(tidyverse)
cars <- as_data_frame(mtcars)
cars
## # A tibble: 32 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
## 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
## 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
## 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
## 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
## 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
## 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
## 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
## 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
## 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
## # ... with 22 more rows
mpg
and am
mpg
: Miles Per Gallon on Averageam
attach(cars)
wilcox.test(mpg, am)
detach(cars)
##
## Wilcoxon rank sum test with continuity correction
##
## data: mpg and am
## W = 1000, p-value = 3e-12
## alternative hypothesis: true location shift is not equal to 0
kruskal.test(formula, data, subset, ...)
formula
is y~x
or can enter outcome,group
instead.data
is the dataframe of interest.subset
if you wish to test subset of data. BSDA
package. Arthriti
Variable | Description |
---|---|
time |
Time in Days until patient felt relief |
treatment |
Factor with three levels A , B , and C |
library(BSDA)
Arthriti
## # A tibble: 51 x 2
## time treatment
## <int> <fct>
## 1 40 A
## 2 35 A
## 3 47 A
## 4 52 A
## 5 31 A
## 6 61 A
## 7 92 A
## 8 46 A
## 9 50 A
## 10 49 A
## # ... with 41 more rows
kruskal.test(time~treatment, data=Arthriti)
##
## Kruskal-Wallis rank sum test
##
## data: time by treatment
## Kruskal-Wallis chi-squared = 2, df = 2, p-value = 0.4
cor()
function. #Pearson from Monotonic Decreasing
cor(x2,y2, method="pearson")
#Spearman from Monotonic Decreasing
cor(x2,y2, method="spearman")
## [1] -0.887
## [1] -0.996
library(tidyverse)
library(dslabs)
data("polls_2008")
qplot(day, margin, data = polls_2008)
resid <- ifelse(lm(margin~day, data = polls_2008)$resid > 0, "+", "-")
polls_2008 %>%
mutate(resid = resid) %>%
ggplot(aes(day, margin)) +
geom_smooth(method = "lm", se = FALSE, color = "black") +
geom_point(aes(color = resid), size = 3)
poll_2008
data is to assume that public opinion remained approximately the same within a week's time. With this assumption in place, we have several data points with the same expected value. span <- 3.5
tmp <- polls_2008 %>%
crossing(center = polls_2008$day) %>%
mutate(dist = abs(day - center)) %>%
filter(dist <= span)
tmp %>% filter(center %in% c(-125, -55)) %>%
ggplot(aes(day, margin)) +
geom_point(data = polls_2008, size = 3, alpha = 0.5, color = "grey") +
geom_point(size = 2) +
geom_smooth(aes(group = center),
method = "lm", formula=y~1, se = FALSE) +
facet_wrap(~center)
span <- 7
fit <- with(polls_2008,
ksmooth(day, margin, x.points = day, kernel="box", bandwidth = span))
polls_2008 %>% mutate(smooth = fit$y) %>%
ggplot(aes(day, margin)) +
geom_point(size = 3, alpha = .5, color = "grey") +
geom_line(aes(day, smooth), color="red")
\[ \hat{f}(x_0) = \sum_{i=1}^N w_0(x_i) Y_i \]
kernel=box
in our call to the function ksmooth
. ksmooth
function provides a "smoother" option which uses the normal density to assign weights:x_0 <- -125
tmp <- with(data.frame(day = seq(min(polls_2008$day), max(polls_2008$day), .25)),
ksmooth(day, 1*I(day == x_0), kernel = "normal", x.points = day, bandwidth = span))
data.frame(x = tmp$x, w_0 = tmp$y) %>%
mutate(w_0 = w_0/sum(w_0)) %>%
ggplot(aes(x, w_0)) +
geom_line()
The final code and resulting plot look like this:
span <- 7
fit <- with(polls_2008,
ksmooth(day, margin, x.points = day, kernel="normal", bandwidth = span))
polls_2008 %>% mutate(smooth = fit$y) %>%
ggplot(aes(day, margin)) +
geom_point(size = 3, alpha = .5, color = "grey") +
geom_line(aes(day, smooth), color="red")
ksmooth
, shown above. library(broom)
fit <- loess(margin ~ day, degree=1, span = span, data=polls_2008)
loess_fit <- augment(fit)
if(!file.exists(file.path("loess-animation.gif"))){
p <- ggplot(tmp, aes(day, margin)) +
scale_size(range = c(0, 3)) +
geom_smooth(aes(group = center, frame = center, weight = weight), method = "lm", se = FALSE) +
geom_point(data = polls_2008, size = 3, alpha = .5, color = "grey") +
geom_point(aes(size = weight, frame = center)) +
geom_line(aes(x=day, y = .fitted, frame = day, cumulative = TRUE),
data = loess_fit, color = "red")
gganimate(p, filename = file.path("loess-animation.gif"), interval= .1)
}
if(knitr::is_html_output()){
knitr::include_graphics(file.path("loess-animation.gif"))
} else{
centers <- quantile(tmp$center, seq(1,6)/6)
tmp_loess_fit <- crossing(center=centers, loess_fit) %>%
group_by(center) %>%
filter(day <= center) %>%
ungroup()
tmp %>% filter(center %in% centers) %>%
ggplot() +
geom_smooth(aes(day, margin), method = "lm", se = FALSE) +
geom_point(aes(day, margin, size = weight), data = polls_2008, size = 3, alpha = .5, color = "grey") +
geom_point(aes(day, margin)) +
geom_line(aes(x=day, y = .fitted), data = tmp_loess_fit, color = "red") +
facet_wrap(~center, nrow = 2)
}
total_days <- diff(range(polls_2008$day))
span <- 21/total_days
fit <- loess(margin ~ day, degree=1, span = span, data=polls_2008)
polls_2008 %>% mutate(smooth = fit$fitted) %>%
ggplot(aes(day, margin)) +
geom_point(size = 3, alpha = .5, color = "grey") +
geom_line(aes(day, smooth), color="red")
spans <- c(.66, 0.25, 0.15, 0.10)
fits <- data_frame(span = spans) %>%
group_by(span) %>%
do(broom::augment(loess(margin ~ day, degree=1, span = .$span, data=polls_2008)))
tmp <- fits %>%
crossing(span = spans, center = polls_2008$day) %>%
mutate(dist = abs(day - center)) %>%
filter(rank(dist) / n() <= span) %>%
mutate(weight = (1 - (dist / max(dist)) ^ 3) ^ 3)
if(!file.exists(file.path( "loess-multi-span-animation.gif"))){
p <- ggplot(tmp, aes(day, margin)) +
scale_size(range = c(0, 2)) +
geom_smooth(aes(group = center, frame = center, weight = weight), method = "lm", se = FALSE) +
geom_point(data = polls_2008, size = 2, alpha = .5, color = "grey") +
geom_line(aes(x=day, y = .fitted, frame = day, cumulative = TRUE),
data = fits, color = "red") +
geom_point(aes(size = weight, frame = center)) +
facet_wrap(~span)
gganimate(p, filename = file.path( "loess-multi-span-animation.gif"), interval= .1)
}
if(knitr::is_html_output()){
knitr::include_graphics(file.path("loess-multi-span-animation.gif"))
} else{
centers <- quantile(tmp$center, seq(1,4)/4)
tmp_fits <- crossing(center=centers, fits) %>%
group_by(center) %>%
filter(day <= center) %>%
ungroup()
tmp %>% filter(center %in% centers) %>%
ggplot() +
geom_smooth(aes(day, margin), method = "lm", se = FALSE) +
geom_point(aes(day, margin, size = weight), data = polls_2008, size = 3, alpha = .5, color = "grey") +
geom_point(aes(day, margin)) +
geom_line(aes(x=day, y = .fitted), data = tmp_fits, color = "red") +
facet_grid(span ~ center)
}
## Error in fortify(data): object 'fits' not found
loess
and the typical bin smoother.
loess
keeps the number of points used in the local fit the same. loess
uses a weighted approach. Basically, instead of using least squares, we minimize a weighted version. loess
has the option of fitting the local model robustly. An iterative algorithm is implemented in which, after fitting a model in one iteration, outliers are detected and down-weighted for the next iteration. To use this option, we use the argument family="symmetric"
.--- .class #id
span
argument, which expects a proportion. N
is the number of data points and span=0.5
, then for a given \(x\), loess
will use the 0.5 * N
closest points to \(x\) for the fit. -- .class #id
\[ \sum_{i=1}^N w_0(x_i) \left[Y_i - \left\{\beta_0 + \beta_1 (x_i-x_0)\right\}\right]^2 \]
-- .class #id
ggplot
uses loess in its geom_smooth
function: polls_2008 %>% ggplot(aes(day, margin)) +
geom_point() +
geom_smooth()
polls_2008 %>% ggplot(aes(day, margin)) +
geom_point() +
geom_smooth(color="red", span = 0.15,
method.args = list(degree=1))