#########################################################
###################### Recap  ###########################
#########################################################

load("../week3/20151013result.rdata")

subframe <- select(result, measurement, first_pulse, subject)

subframe <- filter(result, response_time < 5000) %>%
  select(measurement, first_pulse, subject)


sumframe <- group_by(result, subject) %>%
  summarise(right.perc = sum(accuracy == 1)/n(),
            mean.resp.time = mean(response_time, na.rm = T))

head(sumframe)

p <- ggplot(mtcars, aes(wt, mpg))
p + geom_point()

#########################################################
################### Exercises ###########################
#########################################################

## load the data and run the following four lines to
## create a new variable containing the sex of the
## person in the video


require(car)
require(stringr)

result$video2 <- str_replace_all(result$video,"\\.avi|[0-9]", "")
result$video.sex <- recode(result$video2,"c('Alexandra','Anahita','Anna','Birgit','Carolin','Charlotte','Franziska','Gabi','Hannelore','Isabel','Jana','Julia','Juliane','Kathrin','Katja','Klara','Lina','Mailin','Marie','Lara','Laura','Lena','Mona','Natalie','Nina','Olga','Sabine','Sabrina','Silke','Stephanie')='female';c('Achim','Anton','Aziz','Ben','Bernhard','Marius','Markus','Matthias','Michael','Nabeel','Paul','Peter','Richard','Roland','Rolf','Stephan','Thomas','Tim','Tobias','Uwe','Christoph','Daniel','David','Felix','Gerd','Hannes','Holger','Ivko','Klaus','Leo') = 'male'")


## use dplyr to summarize your data per time point
## and per person:
## calculate the 1. proportion of right answers and
## 2. the mean response time per
## person and time point
## useing group_by() and summarize()


## now visualize the proportion dependent on time:
## use ggplot() and geom_boxplot()
## map time to x and the proportion to y using aes()
## inside of ggplot()


## repeat the exercise,  but this time group additional by
## the sex of the person in the video


## visualize for each of the trials (1-48) the mean time
## and the percentage of right answers
## use facet_wrap to plot separate plots for each time point


#########################################################
################### Modelling ###########################
#########################################################


setwd("/media/mandy/Samsung_T1/mpicbs/2015kurs2/week4/")

require(dplyr)
## Anova
oneway <- read.table("data/anovagarden.txt",header = T)
names(oneway) <- c("ozone","garden")

oneway

t(oneway %>% group_by(garden) %>% summarise(mean=mean(ozone)))

mean(oneway$ozone)
sum((oneway$ozone-mean(oneway$ozone))**2)

(oneway$ozone-mean(oneway$ozone))**2


### Anova Grafiken
plot(oneway$ozone,pch=19)
abline(h=mean(oneway$ozone))
for(i in 1:14){lines(c(i,i),c(mean(oneway$ozone),oneway$ozone[i]))}

plot(oneway$ozone,pch=19,ylim=c(4,12))
abline(h=mean(oneway$ozone))
for(i in 1:14){lines(c(i,i),c(mean(oneway$ozone),oneway$ozone[i]))}
text(x=1:14,y=4,labels=(oneway$ozone-mean(oneway$ozone))**2,cex = 1.8)

oneway <- oneway %>% group_by(garden) %>% mutate(mean=mean(ozone))

par("mar")
par(mar=c(5,4,0,2)+0.1)
plot(oneway$ozone,pch=21,bg=oneway$garden)
abline(h=tapply(oneway$ozone,oneway$garden,mean),col=c(1,2))
for(i in 1:14){lines(c(i,i),c(oneway$mean[i],oneway$ozone[i]),col=oneway$garden[i])}


plot(oneway$ozone,pch=21,bg=oneway$garden,ylim = c(4,12))
abline(h=tapply(oneway$ozone,oneway$garden,mean),col=c(1,2))
for(i in 1:14){lines(c(i,i),c(oneway$mean[i],oneway$ozone[i]),col=oneway$garden[i])}
text(x=1:14,y=4,labels=(oneway$ozone-oneway$mean)**2,cex = 1.8)

mm <- aov(ozone ~ garden, data=oneway)

plot(seq(0.1,20,by=0.1),
     df(seq(0.1,20,by=0.1),1,12),
     type="l",
     xlab = "F",
     ylab = "density")
x <- seq(15.75,20,by=0.1)
polygon(c(x,rev(x)),c(df(x,1,12),rep(0,length(x))),col="firebrick2")
x <- seq(0.01,15.75,by=0.1)
polygon(c(x,rev(x)),c(df(x,1,12),rep(0,length(x))),col="darkgreen")

abline(v=15.75)

## anova syntax R
mm <- lm(ozone ~ garden, data=oneway)
anova(mm)

m2 <- aov(ozone ~ garden, data=oneway)
summary(m2)

########################################################################
######################## Exer Anova    #################################
########################################################################


## install and load the granovaGG package (a package for visualization
## of ANOVAs)


## load the arousal data frame and use the stack()} command to bring
## the data in the long form.


## Do a anova analysis. Is there a difference at least 2 of the gddroups?


## If indicated do a post-hoc test.


## Visualize your results


#########################################################
###################### lin regression  ##################
#########################################################


load("data/births.rdata")


# metric response...
m.1 <- lm(bweight ~ gestwks, data=births)
coef(m.1)

summary(m.1)

## exercise

## create a scatter plot using ggplot the independent
## variable on the x-axis and the dependent variable on
## the y-axis

## ggplot knows lm


# extractor funcions
summary(m.1)

coef(m.1)
confint(m.1)

coef(summary(m.1))

## other useful functions

## illustrate meaning of fitted and residuals
tmpdf <- data.frame(gestwks=births$gestwks, bweight=births$bweight , fitted=fitted(m), residuals=resid(m.1))

m.2 <- lm(bweight ~ gestwks, data=births, na.action = na.exclude)
tmpdf <- data.frame(gestwks=births$gestwks, bweight=births$bweight , fitted=fitted(m.2), residuals=resid(m.2))

ggplot(tmpdf, aes( x = gestwks, y = bweight)) +
  geom_point() +
  geom_line(aes(y = fitted))


## illustrate meaning of predict
ga <- 10:60
pred.birth.weight <- predict(m.1,newdata=data.frame(gestwks=ga))
tmpdf <- data.frame(gestwks=ga,bweight=pred.birth.weight)

## illustrate meaning of deviance
m0 <- lm(bweight~1,data=births)
deviance(m0) ## total sum of sqares
deviance(m.1) ## error sum of squares of our model

(deviance(m0)-deviance(m))/deviance(m0) ## r-squared: slightly different because there are some missing values

btmp <- births[complete.cases(births[,c("bweight","gestwks")]),]
m0 <- lm(bweight~1,data=btmp) ## repeat with this model (missing values excluded) you get the appropriate r-squared

(deviance(m0)-deviance(m))/deviance(m0) ## r-squared: slightly different because there are some missing values

## ANOVA
## slide explanatory v is factor
m.aov <- lm(bweight ~ hyp, data=births)
coef(m.aov) ## gives a mean and a difference of means

by(births$bweight,births$hyp,mean) ## gives the two means

m.aov2 <- lm(bweight ~ -1 + hyp, data=births) ## gives also the two means
coef(m.aov2)


summary(m.aov)

## What is the appropriate plot to visualize the effect of hyp?

ggplot(births, aes(x = hyp, y = bweight)) +
  geom_boxplot() +
  stat_summary(fun.data = "mean_cl_boot")


## What is the most common test to test these effect?
t.test(bweight ~ hyp, data = births)


## model with both gestwks and hyp
m.3 <- lm(bweight ~ hyp + gestwks, data=births)
summary(m.3)

coef(m.3)
confint(m.3)

m.4 <- lm(bweight ~ hyp + gestwks - 1, data=births)
summary(m.4)

coef(m.4)
confint(m.4)


## interaction models, scaled
m.5 <- lm(bweight ~ hyp * gestwks, data=births)


births$gwsc <- births$gestwks-40
m.6 <- lm(bweight ~ hyp * gwsc, data=births)

summary(m)

## aov
m <- lm(bweight ~ gestwks, data=births)
anova(m)

sum(anova(m)$Sum)

anova(m)$Sum[1]/sum(anova(m)$Sum)

summary(m)
summary(m)$r.squared

#########################################################
#################### Exercises  #########################
#########################################################

## gambling

## Fit a regression model with the expenditure on gambling
## as the response and the sex, status, income
## and verbal score as predictors. Present the output.

require(faraway)


## What percentage of variation in the response is explained
## by these predictors?


## Which observation has the largest (positive) residual?
## Give the case number.


## Compute the correlation of the residuals with the fitted values.
## use cor() or cor.test()


## Compute the correlation of the residuals with the income.


## For all other predictors held constant, what would be the
## difference in predicted expenditure on gambling for a male
## compared to a female?


##################### additional stuff ###########################
### jackknife

leverage <- function(x){1/length(x)+(x-mean(x))**2/sum((x-mean(x))**2)}
jtmp <- numeric()
for(i in 1:nrow(births)){
  m <- lm(bweight ~ hyp + gestwks, data=births[-i,])
  jtmp[i] <- summary(m)$r.squared
}

m1a <- lm(bweight ~ hyp + gestwks, data=births[-1,])


png("model1.png",width=1000,height=600)
par(cex.lab=2,cex.axis=1.5, mar=c(5,5,4,2)+0.1)
plot(births$gestwks,births$bweight,col=births$hyp,cex=2,pch=".",xlim=c(30,43),ylim=c(1500,4200))
abline(coef(m)[c(1,3)],lwd=2)
abline(coef(m)[1]+coef(m)[2],coef(m)[3],col="red",lwd=2)
text(x=30,y=1600,"A",cex=2)
text(x=32,y=1600,"B",col="red",cex=2)
text(36.5,2300,"hyp=hyper",col="red",cex=2)
text(34,2600,"hyp=normal",cex=2)
dev.off()

## slide a multivariable model
m <- lm(bweight ~ hyp + gestwks, data=births)
summary(m)

## slide interaction between
m <- lm(bweight ~ hyp * gestwks, data=births)

png("model2.png",width=1000,height=800)
par(cex.lab=2,cex.axis=1.5, mar=c(5,5,4,2)+0.1)
plot(births$gestwks,births$bweight,col=births$hyp,cex=2,pch=".",xlim=c(30,43),ylim=c(1500,4200))
abline(coef(m)[c(1,3)],lwd=2)
abline(coef(m)[1]+coef(m)[2],coef(m)[3]+coef(m)[4],col="red",lwd=2)
text(x=30,y=1680,"A",cex=2)
text(x=32,y=1500,"B",col="red",cex=2)
text(36.5,2300,"hyp=hyper",col="red",cex=2)
text(34,2600,"hyp=normal",cex=2)
dev.off()