#################################################################
###################### Recap Exercises ##########################
#################################################################

## use dir() with the pattern argument to get a list of all file
## names containing the number 25497; store the list in a variable
## named files or something similar


## use lapply() on this list to read in the respective files;
## again store the result in an appropriately named variable
## (remember: we had to set the header and skip arguments)


## on the resulting list, run dim() on every element (again using
## lapply()) to get the number of rows and columns: how many rows
## and columns do these data frames have?


## now use names() as the function to get the column names of each
## of the data frames


####################################################
##################### reduce #######################
####################################################

## Compare the column names of every data frame with those of the first one.
## seq_along()[-1] is empty for a one-element list, whereas 2:length() would
## count down to c(2, 1); identical() also copes with name vectors of
## different lengths, where `==` would recycle.
tmp <- lapply(data.list, names)
lapply(seq_along(data.list)[-1], function(x) identical(tmp[[1]], tmp[[x]]))

## number of columns of each data frame
lapply(data.list, ncol)

## fold the list from left to right, row-binding one data frame at a time
res <- Reduce(rbind, data.list)


## same fold, but first give the right-hand data frame the column names
## of the accumulated left-hand one
res <- Reduce(function(x, y) {
    names(y) <- names(x)
    rbind(x, y)
}, data.list)


## same fold, but rename the columns of both operands to col1, col2, ...
res <- Reduce(function(x, y) {
    names(x) <- names(y) <- paste0("col", seq_len(ncol(x)))
    rbind(x, y)
}, data.list)


## read Lines
## library() stops with an error if the package is missing; require() would
## only return FALSE and let the script fail later at mdy_hms()
library(lubridate)
files <- dir("../logfiles", full.names = TRUE)
filename <- files[1]

xx <- readLines(filename)
head(xx)

## Lines 1-2 are parsed as a small header table, everything after line 3 as
## the main table. xx[-(1:3)] drops the first three lines; unlike
## xx[4:length(xx)] it cannot count backwards when the file is shorter.
d1 <- read.table(text = xx[-(1:3)],
                 sep = "\t", header = TRUE)
d2 <- read.table(text = xx[1:2], sep = "\t", header = TRUE)

## Do the same for every file and copy two values of the header table into
## the main table.
## NOTE(review): the third column is renamed to "trial" by position, and the
## header columns are accessed as `timepoint.` and `date.` -- confirm against
## the actual log file layout.
tmp <- lapply(files, function(filename) {
    xx <- readLines(filename)
    d1 <- read.table(text = xx[-(1:3)], sep = "\t", fill = TRUE, header = TRUE)
    names(d1)[3] <- "trial"
    d2 <- read.table(text = xx[1:2], sep = "\t", fill = TRUE, header = TRUE)
    d1$timepoint <- as.character(d2$timepoint.)
    d1$datetime <- mdy_hms(d2$date.)
    d1
})

## combine all per-file data frames; system.time() reports how long the
## pairwise rbind fold takes
system.time(result <- Reduce(rbind, tmp))

## add the subject and the feedback duration to the resulting data frame
## save data frame result using the following syntax (change path and
## name to your needs)


## write the object named "result" to an .rdata file
save(list = "result", file = "../20151013result.rdata")

## load the fake subject data (fakepersdat.rdata)
## using load(), then merge them into the result data frame


####################################################
###################### dplyr  ######################
####################################################

## library() fails loudly if dplyr is missing; with require() the script
## would carry on and filter() would resolve to stats::filter instead
library(dplyr)
## filter
## keep only the rows of one subject
prob22029 <- filter(result, subject == "22029_39")
table(prob22029$subject)

## conditions separated by commas are combined with AND
prob22029 <- filter(result,
                    subject == "22029_39",
                    accuracy == 1)
table(prob22029$subject, prob22029$accuracy)


## same filter, followed by droplevels() to remove unused factor levels
prob22029 <- filter(result,
                    subject == "22029_39",
                    accuracy == 1) %>%
    droplevels()
table(prob22029$subject, prob22029$accuracy)

## | combines the conditions with OR
prob22029 <- filter(result, subject == "22029_39" | accuracy == 1)
table(prob22029$subject, prob22029$accuracy)


## rows with a response time below 5000
fast <- filter(result, response_time < 5000)
summary(fast$response_time)

## select
## keep three columns, all rows
subframe <- select(result, measurement, first_pulse, subject)
nrow(subframe)

## filter the rows first, then keep the same three columns
subframe <- filter(result, response_time < 5000) %>%
    select(measurement, first_pulse, subject)
nrow(subframe)

## arrange
## Preview of the unsorted data for comparison. arr.frame is only created
## further down, so this first preview has to be taken from result.
head(select(result, name, measurement, timepoint, response_time))
## head(result[, c("name", "measurement", "timepoint", "response_time")])

## sort by response_time in ascending order
arr.frame <- arrange(result, response_time)

head(select(arr.frame, name, measurement, timepoint, response_time))

## sort, then keep only rows with a positive response time
arr.frame <- arrange(result, response_time) %>%
    filter(response_time > 0)
head(select(arr.frame, name, measurement, timepoint, response_time))


## mutate/summariz(s)e
## library() stops immediately if stringr is not installed
library(stringr)
## For one subject, derive new columns:
##   video2       - video with the first ".avi" removed
##   video3       - video2 with the first digit removed
##   concern_time - difference between the end and start columns
## NOTE(review): str_replace() only replaces the first match; if every digit
## should be stripped from the video name, str_replace_all() would be needed
## -- confirm against the actual video names.
subframe <- filter(result, subject == "22074_39") %>%
    mutate(video2 = str_replace(video, "\\.avi", ""),
           video3 = str_replace(video2, "[0-9]", ""),
           concern_time = concern_time_ended - concern_time_started)

table(subframe$subject)

summary(subframe$concern_time)
table(subframe$video3)

## summariz(s)e
## One row for the whole data frame: share of rows with accuracy == 1 and
## the mean response time. TRUE is spelled out because T is an ordinary
## variable that can be reassigned.
sumframe <- summarise(result,
                      right.perc = sum(accuracy == 1) / n(),
                      mean.resp.time = mean(response_time, na.rm = TRUE))


## group_by and summarise
## the same two summaries, one row per subject
sumframe <- group_by(result, subject) %>%
    summarise(right.perc = sum(accuracy == 1) / n(),
              mean.resp.time = mean(response_time, na.rm = TRUE))

head(sumframe)


## one row per subject and timepoint
sumframe <- group_by(result, subject, timepoint) %>%
    summarise(right.perc = sum(accuracy == 1) / n(),
              mean.resp.time = mean(response_time, na.rm = TRUE))

head(sumframe)


## additionally sort by right.perc (ascending), then by mean.resp.time
## (descending)
sumframe <- group_by(result, subject, timepoint) %>%
    summarise(right.perc = sum(accuracy == 1) / n(),
              mean.resp.time = mean(response_time, na.rm = TRUE)) %>%
    arrange(right.perc, desc(mean.resp.time))
head(sumframe)


## same as before, but with the grouping removed before sorting
sumframe <- group_by(result, subject, timepoint) %>%
    summarise(right.perc = sum(accuracy == 1) / n(),
              mean.resp.time = mean(response_time, na.rm = TRUE)) %>%
    ungroup() %>%
    arrange(right.perc, desc(mean.resp.time))
head(sumframe)


###########################################################################
#########################  dplyr exercises ################################
###########################################################################
## library() raises an error if dplyr is missing instead of returning FALSE
library(dplyr)
## use select() and filter() in combination (%>%) to select
## all rows belonging to the T0 or the T3 test, keep subject,
## timepoint, accuracy, response_button, response_time,
## affect_time_ended and concern_pos_started column.
## Create a new data frame with an appropriate name.


## add three new variables containing the counts of each of the possible
## values of accuracy. Use mutate() and something like sum(accuracy==1).


## use group_by() and summarise() to extract the minimum and maximum
## response_time per person from the original data frame


## look at the resulting data frame. What is the problem, and what is
## the possible reason? And what is the solution?


## repeat the last exercise, but now group per person and time point


## reshaping
## library() stops immediately if reshape2 is not installed
library(reshape2)
## long -> wide with reshape2: one row per subject, one column per
## timepoint, filled with min.resp.time
sum.df2.wide <- dcast(sum.df2,
                      subject ~ timepoint,
                      value.var = "min.resp.time")


## long -> wide with base R; reshape() is handed a plain data frame here
## NOTE(review): presumably sum.df2 is a dplyr result (tibble) from the
## preceding exercise -- confirm.
sum.df2 <- as.data.frame(sum.df2)
sum.df2.wide <- reshape(sum.df2,
                        timevar = "timepoint",
                        idvar = "subject",
                        direction = "wide")


###########################################################################
#########################  graphics #######################################
###########################################################################

## traditional graphics

## 40 random values and a random two-level grouping factor
x <- rnorm(40)
y <- factor(sample(c("yes", "no"), 40, replace = TRUE))
## 2 x 2 plot grid; par() returns the previous setting so it can be restored
old.par <- par(mfrow = c(2, 2))
boxplot(x ~ y)
## boxwex scales the width of the boxes
boxplot(x ~ y, boxwex = 0.5)
## random permutation of 1..5 as bar heights
z <- sample(1:5)
barplot(z)
barplot(z, horiz = TRUE)
## restore the previous layout so later base plots are not squeezed
## into the 2 x 2 grid
par(old.par)


## library() stops immediately if ggplot2 is not installed
library(ggplot2)
## create a new object
po <- ggplot()
summary(po)

## show structure of the object
str(po)

## example data
x1 <- 1:10
y1 <- 1:10
z1 <- 10:1
l1 <- LETTERS[1:10]
a <- 10
## `:` binds tighter than `/`, so this is (0:-9) / (10:1), element by element
b <- (0:-9) / 10:1
## the scalar a is recycled to all ten rows
ex <- data.frame(x1 = x1, y1 = y1, z = z1, l = l1, a = a, b = b)

## create a new ggplot object containing the data
po <- ggplot(ex, aes(x = x1, y = y1))
summary(po)

## scatter plot
p1 <- po + geom_point()


## second example data frame
## 20 rows: x1 is a permutation of 1..20, y1 a permutation of 1..10 repeated
## twice (written out here instead of relying on data.frame() recycling)
ex2 <- data.frame(
    x1 = sample(1:20),
    y1 = rep(sample(1:10), times = 2),
    l = letters[1:20]
)
head(ex2, n = 10)

## replace data in po
pn <- p1 %+% ex2
pn + geom_line()


## add a text layer
my.text <- geom_text(
    mapping = aes(label = l),
    hjust = 1.1,
    vjust = -0.2
)
pn + geom_path() + my.text


### add lines
## one line
p1 + geom_abline(
    intercept = 10, slope = -1,
    colour = rgb(0.5, 0.5, 0.9)
)
## two lines
p1 + geom_abline(
    intercept = c(10, 1), slope = c(-1, -2),
    colour = rgb(0.5, 0.5, 0.9)
)
## more lines -> takes only the first intercept
## NOTE(review): the remark above is the original author's observation;
## confirm it against the installed ggplot2 version.
p1 + geom_abline(
    intercept = 10:1, slope = -(10:1) / 10,
    colour = rgb(0.5, 0.5, 0.9)
)

## slope, intercept and colour taken from columns of the plot data
p1 +
    geom_abline(aes(slope = b, intercept = a, colour = x1)) +
    scale_x_continuous(limits = c(0, 10))

## horizontal lines, then horizontal plus vertical lines
p1 + geom_hline(yintercept = 1:10)
p1 +
    geom_hline(yintercept = 1:10) +
    geom_vline(xintercept = 1:10)


###########################################################################
################## Exercises ggplot2 ######################################
###########################################################################
## library() stops immediately if ggplot2 is not installed
library(ggplot2)
load("../20151013sumdf.rdata")

## use the last summary data frame from the dplyr exercise (with
## subject and timepoint as grouping variables) to create a scatter
## plot (geom_point()) colouring the points differently per timepoint


library(ggplot2)
head(result)


## Map the category variable to the x-axis and the
## affect_pos_rated to the y-axis, use geom_boxplot()


## use facet_wrap as above to facet the plot per timepoint


## create a scatter plot with affect_pos_started on the x-axis
## and affect_pos_rated on the y-axis


## add a geom_smooth()


## add a colour aesthetic


###################################################################


## Share of rows with accuracy == 1 per timepoint: the logical is turned
## into 0/1 and averaged by the summary stat. `fun` replaces the deprecated
## `fun.y` argument (ggplot2 >= 3.3.0).
ggplot(result, aes(x = timepoint, y = as.numeric(accuracy == 1))) +
    geom_point(stat = "summary", fun = "mean")


library(dplyr)

## one data frame per subject; drop = TRUE skips factor levels without rows,
## which would otherwise yield empty plots
res.l <- split(result, result$subject, drop = TRUE)

## One plot per subject, titled with the subject id. `fun` replaces the
## deprecated `fun.y` argument (ggplot2 >= 3.3.0); as.character() keeps the
## title a plain string in case subject is a factor.
res.pl <- lapply(res.l, function(x) {
    ggplot(x, aes(x = timepoint, y = as.numeric(accuracy == 1))) +
        geom_point(stat = "summary", fun = "mean") +
        ggtitle(as.character(x$subject[1]))
})

## One plot per page. invisible() keeps the list returned by lapply() from
## being auto-printed at top level, which would draw every plot a second
## time into the open device.
pdf("all.pdf")
invisible(lapply(res.pl, print))
dev.off()

## Four plots per page. The plot list is passed through the `grobs` argument;
## spreading the named list with do.call() would not match `grobs`.
library(gridExtra)
ggsave("arrange2x2.pdf", marrangeGrob(grobs = res.pl, nrow = 2, ncol = 2))


## recode accuracy as a factor: 0 -> "wrong", 1 -> "right"
## (values outside the listed levels become NA)
result$accuracy <- factor(
    result$accuracy,
    levels = c(0L, 1L),
    labels = c("wrong", "right")
)