I have used codes from the book Dalgaard (2008) for this lab.
Descriptive statistics
x <- rnorm(50)
mean_x <- mean(x)
sd_x <- sd(x)
var_x <- var(x)
median_x <- median(x)
# Moments
library(moments)
skewness_x <- skewness(x)
kurtosis_x <- kurtosis(x)
list(mean = mean_x, sd = sd_x, var = var_x, median = median_x,
skewness = skewness_x, kurtosis = kurtosis_x)
## $mean
## [1] 0.1626365
##
## $sd
## [1] 0.9969765
##
## $var
## [1] 0.9939621
##
## $median
## [1] 0.2662041
##
## $skewness
## [1] -0.3337814
##
## $kurtosis
## [1] 5.17371
quantile_x <- quantile(x)
pvec <- seq(0, 1, 0.1)
quantile_values <- quantile(x, pvec)
list(quantiles = quantile_x, pvec = pvec, quantile_values = quantile_values)
## $quantiles
## 0% 25% 50% 75% 100%
## -3.2331521 -0.3589715 0.2662041 0.6624521 2.9191401
##
## $pvec
## [1] 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0
##
## $quantile_values
## 0% 10% 20% 30% 40% 50%
## -3.23315213 -1.07956513 -0.45383165 -0.28370927 0.04854668 0.26620409
## 60% 70% 80% 90% 100%
## 0.41526247 0.60485247 0.71271795 1.19187526 2.91914013
data()
head(Nile)
## [1] 1120 1160 963 1210 1160 1160
summary(Nile)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 456.0 798.5 893.5 919.4 1032.5 1370.0
library('ISwR')
attach(juul)
names(juul)
## [1] "age" "menarche" "sex" "igf1" "tanner" "testvol"
mean(igf1)
## [1] NA
mean(igf1,na.rm=T)
## [1] 340.168
summary(igf1)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 25.0 202.2 313.5 340.2 462.8 915.0 321
summary(juul)
## age menarche sex igf1 tanner
## Min. : 0.170 No :369 M :621 Min. : 25.0 I :515
## 1st Qu.: 9.053 Yes :335 F :713 1st Qu.:202.2 II :103
## Median :12.560 NA's:635 NA's: 5 Median :313.5 III : 72
## Mean :15.095 Mean :340.2 IV : 81
## 3rd Qu.:16.855 3rd Qu.:462.8 V :328
## Max. :83.000 Max. :915.0 NA's:240
## NA's :5 NA's :321
## testvol
## Min. : 1.000
## 1st Qu.: 1.000
## Median : 3.000
## Mean : 7.896
## 3rd Qu.:15.000
## Max. :30.000
## NA's :859
detach(juul)
juul$sex <- factor(juul$sex,labels=c("M","F"))
juul$menarche <- factor(juul$menarche,labels=c("No","Yes"))
juul$tanner <- factor(juul$tanner,labels=c("I","II","III","IV","V"))
attach(juul)
summary(juul)
## age menarche sex igf1 tanner
## Min. : 0.170 No :369 M :621 Min. : 25.0 I :515
## 1st Qu.: 9.053 Yes :335 F :713 1st Qu.:202.2 II :103
## Median :12.560 NA's:635 NA's: 5 Median :313.5 III : 72
## Mean :15.095 Mean :340.2 IV : 81
## 3rd Qu.:16.855 3rd Qu.:462.8 V :328
## Max. :83.000 Max. :915.0 NA's:240
## NA's :5 NA's :321
## testvol
## Min. : 1.000
## 1st Qu.: 1.000
## Median : 3.000
## Mean : 7.896
## 3rd Qu.:15.000
## Max. :30.000
## NA's :859
juul <- transform(juul,sex=factor(sex,labels=c("M","F")),
menarche=factor(menarche,labels=c("No","Yes")),
tanner=factor(tanner,labels=c("I","II","III","IV","V")))
Graphics for single data
mid.age <- c(2.5, 7.5, 13, 16.5, 17.5, 19, 22.5, 44.5, 70.5)
acc.count <- c(28, 46, 58, 20, 31, 64, 149, 316, 103)
age.acc <- rep(mid.age, acc.count)
brk <- c(0, 5, 10, 16, 17, 18, 20, 25, 60, 80)
hist(age.acc, breaks = brk)
Q-Q plot
x <- rnorm(10000)
qqnorm(x)
qqline(x, col = 2,lwd=2)
library(ggplot2)
data <- data.frame(x)
ggplot(data, aes(sample = x)) +
stat_qq() +
stat_qq_line(col = "red")
sample_data <- ToothGrowth
qqnorm(sample_data$len)
qqline(sample_data$len, col = 2, lwd = 2)
Box plot
par(mfrow=c(1,2))
boxplot(IgM)
boxplot(log(IgM))
par(mfrow=c(1,1))
Summary statistics by group
xbar <- tapply(igf1, tanner, mean, na.rm=T)
s <- tapply(igf1, tanner, sd, na.rm=T)
n <- tapply(igf1, tanner, length)
cbind(mean=xbar, std.dev=s, n=n)
## mean std.dev n
## I 207.4727 90.27237 515
## II 352.6714 122.59332 103
## III 483.2222 152.28664 72
## IV 513.0172 119.09594 81
## V 465.3344 134.41867 328
aggregate(juul[c("age","igf1")], juul["sex"], mean, na.rm=T)
by(juul, juul["sex"], summary)
## sex: M
## age menarche sex igf1 tanner
## Min. : 0.17 No : 0 M:621 Min. : 29.0 I :291
## 1st Qu.: 8.85 Yes : 0 F: 0 1st Qu.:176.0 II : 55
## Median :12.38 NA's:621 Median :280.0 III : 34
## Mean :15.38 Mean :310.9 IV : 41
## 3rd Qu.:16.77 3rd Qu.:430.2 V :124
## Max. :83.00 Max. :915.0 NA's: 76
## NA's :145
## testvol
## Min. : 1.000
## 1st Qu.: 1.000
## Median : 3.000
## Mean : 7.896
## 3rd Qu.:15.000
## Max. :30.000
## NA's :141
## ------------------------------------------------------
## sex: F
## age menarche sex igf1 tanner
## Min. : 0.25 No :369 M: 0 Min. : 25.0 I :224
## 1st Qu.: 9.30 Yes :335 F:713 1st Qu.:233.0 II : 48
## Median :12.80 NA's: 9 Median :352.0 III : 38
## Mean :14.84 Mean :368.1 IV : 40
## 3rd Qu.:16.93 3rd Qu.:483.0 V :204
## Max. :75.12 Max. :914.0 NA's:159
## NA's :176
## testvol
## Min. : NA
## 1st Qu.: NA
## Median : NA
## Mean :NaN
## 3rd Qu.: NA
## Max. : NA
## NA's :713
Graphics for grouped data
attach(energy)
expend.lean <- expend[stature=="lean"]
expend.obese <- expend[stature=="obese"]
par(mfrow=c(2,1))
hist(expend.lean,breaks=10,xlim=c(5,13),ylim=c(0,4),col="white")
hist(expend.obese,breaks=10,xlim=c(5,13),ylim=c(0,4),col="grey")
par(mfrow=c(1,1))
boxplot(expend ~ stature)
Hypothesis Testing
# One-sample t-test
t_test_result <- t.test(igf1 ~ sex, data = juul, var.equal = TRUE)
t_test_result
##
## Two Sample t-test
##
## data: igf1 by sex
## t = -5.3949, df = 1011, p-value = 8.54e-08
## alternative hypothesis: true difference in means between group M and group F is not equal to 0
## 95 percent confidence interval:
## -78.02476 -36.40325
## sample estimates:
## mean in group M mean in group F
## 310.8866 368.1006
Tables
caff.marital <- matrix(c(652,1537,598,242,36,46,38,21,218,327,106,67),
nrow=3,byrow=T)
colnames(caff.marital) <- c("0","1-150","151-300",">300")
rownames(caff.marital) <- c("Married","Prev.married","Single")
caff.marital
## 0 1-150 151-300 >300
## Married 652 1537 598 242
## Prev.married 36 46 38 21
## Single 218 327 106 67
names(dimnames(caff.marital)) <- c("marital","consumption")
caff.marital
## consumption
## marital 0 1-150 151-300 >300
## Married 652 1537 598 242
## Prev.married 36 46 38 21
## Single 218 327 106 67
as.data.frame(as.table(caff.marital))
table(menarche,tanner)
## tanner
## menarche I II III IV V
## No 221 43 32 14 2
## Yes 1 1 5 26 202
xtabs(~ tanner + sex, data=juul)
## sex
## tanner M F
## I 291 224
## II 55 48
## III 34 38
## IV 41 40
## V 124 204
total.caff <- margin.table(caff.marital,2)
total.caff
## consumption
## 0 1-150 151-300 >300
## 906 1910 742 330
barplot(total.caff, col="white")
par(mfrow=c(2,2))
barplot(caff.marital, col="white")
barplot(t(caff.marital), col="white")
barplot(t(caff.marital), col="white", beside=T)
barplot(prop.table(t(caff.marital),2), col="white", beside=T)
par(mfrow=c(1,1))
barplot(prop.table(t(caff.marital),2),beside=T,
legend.text=colnames(caff.marital),
col=c("white","grey80","grey50","black"))
Piecharts
opar <- par(mfrow=c(2,2),mex=0.8, mar=c(1,1,2,1))
slices <- c("white","grey80","grey50","black")
pie(caff.marital["Married",], main="Married", col=slices)
pie(caff.marital["Prev.married",],
main="Previously married", col=slices)
pie(caff.marital["Single",], main="Single", col=slices)
par(opar)