### MT05 week 3 script

M <- 40; p <- 0.2
z <- rbinom(1000, size=M, p=p)
cdf.compare(z, distribution = "binom", size=M, p=p)
cdf.compare(z, distribution = "norm", mean=M*p, sd=sqrt(M*p*(1-p)))

# change M and p and repeat

x <- rexp(25); y <- rexp(25)
t.test(x, y)
t.test(x, y)$statistic
## let's assemble results from 1000 trials
res <- numeric(1000)
for(i in 1:1000) {
  x <- rexp(25); y <- rexp(25)
  res[i] <- t.test(x, y)$statistic
}
library(MASS)
truehist(res)
lines(density(res, width="sj"))
xx <- seq(-4, 4, len=100)
lines(xx, dt(xx, df=48), col=4)

cdf.compare(res, distribution="t", df=48)

quantile(res, p=c(0.025, 0.975))
qt(c(0.025, 0.975), df=48)
table(cut(res, breaks=c(-Inf, qt(0.025, df=48), qt(0.975, df=48), Inf)))


library(car)
?Davis

summary(Davis)
pairs(Davis)
attach(Davis)
plot(height, weight)
identify(height, weight)
detach("Davis")

nDavis <- Davis # make a copy
nDavis[31, 2:3] <- Davis[31, 3:2] # replace 31 by correct value!
pairs(nDavis)

sex <- nDavis[,1]
pairs(nDavis[-1],
      panel = function(x, y) {
         points(x[sex=="M"], y[sex=="M"], pch=25)
         points(x[sex=="F"], y[sex=="F"], pch=26)
      })

pairs(nDavis[-1],
      panel = function(x, y) {
         points(x[sex=="M"], y[sex=="M"], col=6)
         points(x[sex=="F"], y[sex=="F"], col=4)
      })

trellis.device()
splom(~nDavis[-1])
splom(~nDavis[-1], groups = sex, panel=panel.superpose)
splom(~nDavis[-1] | sex, strip=function(...) strip.default(style=1, ...))
dev.off()

cor(nDavis[2:5], na.method="available")

cor(nDavis[sex == "M", 2:5], na.method="available")
cor(nDavis[sex == "F", 2:5], na.method="available")

attach(nDavis)
cor.test(height, weight)
cor.CI <- function(x, y) {
  r <- cor.test(x, y)$estimate
  n <- length(x)
  c(tanh(c(atanh(r) + qnorm(0.025)/sqrt(n-3),
           atanh(r) + qnorm(0.975)/sqrt(n-3))))
}
cor.CI(height, weight)

mu <- sapply(nDavis[2:3], mean)
Sigma <- var(nDavis[2:3])

library(MASS)  # for mvrnorm
?mvrnorm
sim <- mvrnorm(200, mu=mu, Sigma=Sigma)
plot(sim)
cor.CI(sim[, "height"], sim[, "weight"])

set.seed(some_value)  # so you can repeat this
res <- matrix(, 1000, 2)
for(i in 1:1000) {
  sim <- mvrnorm(200, mu=mu, Sigma=Sigma)
  res[i, ] <- cor.CI(sim[, "height"], sim[, "weight"])
}

covered <- (res[, 1] < 0.771) & (res[, 2] > 0.771)
table(covered)

plot(c(0.6, 0.9), c(1, 100), xlab="confidence interval", ylab="", type="n")
abline(v = 0.771, lty=2)
res100 <- res[1:100, ]
res100 <- res100[sort.list(res100[, 1]), ]  # sort on left end
segments(res100[, 1], 1:100, res100[, 2], 1:100)

nDavis0 <- na.omit(nDavis)  #  to get rid of the missing values
attach(nDavis0)
t.test(height, repht)

M <- 20
(nos <- sample(seq(along=height), size = 20, replace = F))
t.test(height[nos], repht[nos])

tstat <- numeric(250)
for(i in 1:250) {
  nos <- sample(seq(along=height), size = M, replace = F)
  tstat[i] <- t.test(height[nos], repht[nos])$statistic
}
truehist(tstat)
lines(density(tstat, width="sj"))

### now change M

library(Sleuth)
attach(case0102)
bankF <- as.vector(SALARY[SEX=="FEMALE"])
bankM <- as.vector(SALARY[SEX=="MALE"])
detach()
bankM
bankF

var.test(bankM, bankF)
t.test(bankM, bankF, alternative = "greater")
t.test(bankM, bankF, alternative = "greater", var.equal = F)

wilcox.test(bankM, bankF, alternative="greater")

stem(bankF)
stem(bankM)
par(mfrow=c(2,1))
truehist(bankF, xlim=c(3000, 7000))
rug(jitter(bankF))
lines(density(bankF, n=500, width="sj"))
truehist(bankM, xlim=c(4000, 9000))
rug(jitter(bankM))
lines(density(bankM, n=500, width="sj"))
par(mfrow=c(1,1))

salary <- case0102[, "SALARY"]
M <- 1000
tstat <- numeric(M)
for(i in 1:M) {
   male <- sample(32+61, 32)
   tstat[i] <- t.test(salary[male], salary[-male],
     alternative = "greater", var.equal = F)$statistic
}
summary(tstat)
par(mfrow=c(1,2))
truehist(tstat)
xx <- seq(-4, 4, len=100); lines(xx, dt(xx, df=51))
cdf.compare(tstat, distribution = "t", df = 51)
par(mfrow=c(1,1))