attachment:session4code.r von RstatisTik/RstatisTikPortal/RcourSe

   1 setwd("/media/mvogel/Volume/transcend/mpicbs/2015kurs/session4")
   2 
   3 ## load the data (file: session4data.rdata)
   4 ## make a new summary data frame (per subject and time) containing:
   5 ###### the number of trials
   6 ###### the number correct trials (absolute and relative)
   7 ###### the mean TTime and the standard deviation of TTime
   8 ###### the respective standard error of the mean
   9 ##  keep the information about Sex and Age\_PRETEST
  10 
  11 load("session4data.rdata")
  12 
  13 require(dplyr)
  14 sumdf <- data %>%
  15     group_by(Subject,Sex,Age_PRETEST,testid) %>%
  16     summarise(count=n(),
  17               n.corr = sum(Stim.Type=="hit"),
  18               perc.corr = n.corr/count,
  19               mean.ttime = mean(TTime),
  20               sd.ttime = sd(TTime),
  21               se.ttime = sd.ttime/sqrt(count))
  22 
  23 
  24 ## make a plot with time on the x-axis and TTime on the y-axis
  25 ## showing the means and the 95\% confidence intervals (geom_pointrange())
  26 ## hint: you can use operations inside aes()
  27 
  28 require(ggplot2)
  29 
  30 p1 <- ggplot(sumdf,aes(x=testid,
  31                  y=mean.ttime,
  32                  ymin=mean.ttime - 1.96*se.ttime,
  33                  ymax=mean.ttime + 1.96*se.ttime)) +
  34        geom_pointrange() +
  35        facet_wrap(~Subject)
  36 
  37 ## second variant
  38 
  39 sumdf$conlow <- sumdf$mean.ttime - 1.96 * sumdf$se.ttime
  40 sumdf$conup <- sumdf$mean.ttime + 1.96 * sumdf$se.ttime
  41 
  42 require(ggplot2)
  43 p1 <- ggplot(sumdf,aes(x=testid,
  44                  y=mean.ttime,
  45                  ymin=conlow,
  46                  ymax=conup)) +
  47        geom_pointrange() +
  48        facet_wrap(~Subject)
  49 
  50 
  51 ## add the number of trials and the percentage of
  52 ## correct ones using geom\_text()
  53 
  54 p1 + geom_text(y=0,aes(label=n.corr),angle=90,size=3) +
  55     geom_text(y=60000,aes(label=round(perc.corr,2)),angle=90,size=3) +
  56     scale_y_continuous(limits=c(0,75000))
  57 
  58 
  59 require(stringr)
  60 p1 + geom_text(y=0, aes(label=n.corr),size=4) +
  61     geom_text(y=70000,
  62               aes(label=str_pad(round(perc.corr,2),width = 4,side = "right",pad=0)),
  63               angle=90,hjust=1,size=4) +
  64     scale_y_continuous(limits=c(0,70000))
  65 
  66 
  67 ## Write a function which takes a vector, the poulation standard deviation
  68 ## and the population mean as arguments
  69 ## and which gives the Z score as result. 
  70 
  71 
  72 ztest <- function(x,x.sd,mu=0){
  73     sqrt(length(x)) * (mean(x)-mu)/x.sd
  74 }
  75 
  76 set.seed(1)
  77 ztest(rnorm(100),x.sd = 1)
  78 
  79 ## add a line to your function that allows you to also process numeric
  80 ## vectors containing missing values!
  81 
  82 ztest <- function(x,x.sd,mu=0){
  83     x <- x[!is.na(x)]
  84     if(length(x) < 3) stop("too few values in x")
  85     sqrt(length(x)) * (mean(x)-mu)/x.sd
  86 }
  87 
  88 set.seed(1)
  89 ztest(rnorm(100),x.sd = 1)
  90 
  91 ## the function pnorm(Z) gives the probability of x <= Z. Change your
  92 ## function so has the p-value as result.
  93 
  94 ztest <- function(x,x.sd,mu=0){
  95     x <- x[!is.na(x)]
  96     if(length(x) < 3) stop("too few values in x")
  97     z <- sqrt(length(x)) * (mean(x)-mu)/x.sd
  98     2*pnorm(-abs(z))
  99 }
 100 
 101 set.seed(1)
 102 ztest(rnorm(100),x.sd = 1)
 103 
 104 ## now let the result be a named vector containing the estimated
 105 ## difference, Z, p and the n.
 106 
 107 
 108 ztest <- function(x,x.sd,mu=0){
 109     x <- x[!is.na(x)]
 110     if(length(x) < 3) stop("too few values in x")
 111     est.diff <- mean(x)-mu
 112     z <- sqrt(length(x)) * (est.diff)/x.sd
 113     c(diff=est.diff,Z=z,pval=2*pnorm(-abs(z)),n=length(x))
 114 }
 115 
 116 set.seed(1)
 117 ztest(rnorm(100),x.sd = 1)
 118 
 119 ######################################################################
 120 ####################### Simulation exerc #############################
 121 ######################################################################
 122 
 123 ## Now sample 100 values from a Normal distribution with mean 10
 124 ## and standard deviation 2 and use a z-test to compare it against
 125 ## the population mean 10. What is the p-value?
 126 
 127 ztest(rnorm(100,mean=10,sd=2),x.sd=2,mu=10)["pval"]
 128 ztest(rnorm(100,mean=10,sd=2),x.sd=2,mu=10)["diff"]
 129 ztest(rnorm(100,mean=10,sd=2),x.sd=2,mu=10)[c("pval","diff")]
 130 
 131 
 132 
 133 ## Now do the sampling and the testing 1000 times, what would be the
 134 ## number of statistically significant results? Use replicate()
 135 ## (which is a wrapper of tapply()) or a for() loop! Record at
 136 ## least the p-values and estimated differences! Transform the
 137 ## result into a data frame.
 138 
 139 ### replicate()
 140 res <- replicate(1000, ztest(rnorm(100,mean=10,sd=2),x.sd=2,mu=10))
 141 res <- as.data.frame(t(res))
 142 
 143 
 144 ### replicate(,simplify=F) 
 145 res <- replicate(1000, ztest(rnorm(100,mean=10,sd=2),x.sd=2,mu=10),simplify = F)
 146 res <- as.data.frame(Reduce(rbind,res))
 147 
 148 
 149 ## for() loop
 150 res <- matrix(numeric(2000),ncol=2)
 151 for(i in seq.int(1000)){
 152     res[i,] <- ztest(rnorm(100,mean=10,sd=2),x.sd=2,mu=10)[c("pval","diff")] }
 153 res <- as.data.frame(res)
 154 xnames(res) <- c("pval","diff")
 155 
 156 ## Use table() to count the p-vals below 0.05.
 157 table(res$pval < 0.05)
 158 
 159 ## What is the smallest absolute difference with a p-value below 0.05?
 160 tapply(abs(res$diff),res$pval < 0.05,summary)
 161 min(abs(res$diff[res$pval<0.05]))
 162 
 163 ## Repeat the last exercise, only change the sample size to 1000 in each of the 1000 samples! How many p-value below 0.05? What is now the smallest absolute difference with a p-value below 0.05?
 164 res2 <- replicate(1000, ztest(rnorm(1000,mean=10,sd=2),
 165                              x.sd=2,mu=10))
 166 res2 <- as.data.frame(t(res2))
 167 
 168 ## Use table() to count the p-vals below 0.05.
 169 table(res2$pval < 0.05)
 170 
 171 ## What is the smallest absolute difference with a p-value below 0.05?
 172 tapply(abs(res2$diff),res$pval < 0.05,summary)
 173 
 174 ## Concatenate the both resulting data frames from above using rbind()
 175 res <- rbind(res,res2)
 176 
 177 ## Plot the distributions of the pvals and the difference per
 178 ## sample size. Use \texttt{ggplot2} with an appropriate
 179 ## geom (density/histogram)
 180 
 181 
 182 require(ggplot2)
 183 ggplot(res,aes(x=pval)) +
 184     geom_histogram(bin=0.1,fill="forestgreen") +
 185     facet_grid(~ n)
 186 
 187 ggsave("hist.png")
 188 
 189 ggplot(res,aes(x=diff,colour=factor(n))) +
 190     geom_density(size=3)
 191 
 192 ggsave("dens.png")
 193 
 194 
 195 ggplot(res,aes(x=diff,y=pval)) +
 196     geom_point() +
 197     geom_hline(yintercept=0.05) +
 198     facet_grid(~ n)
 199 
 200 ggsave("point.png")
 201 
 202 ggplot(res,aes(x=diff,y=pval)) +
 203     geom_hex() +
 204     geom_hline(yintercept=0.05) +
 205     facet_grid(~ n)
 206 
 207 ggsave("dens2d.png")
 208 
 209 ######################################################################
 210 ####################### T-Test #######################################
 211 ######################################################################
 212 
 213 
 214 ## one sample
 215 set.seed(1)
 216 x <- rnorm(12)
 217 
 218 t.test(x,mu=0)
 219 
 220 t.test(x,mu=1)
 221 
 222 
 223 ## two sample Welch or Satterthwaite test 
 224 set.seed(1)
 225 x <- rnorm(12)
 226 y <- rnorm(12)
 227 g <- sample(c("A","B"),12,replace = T)
 228 t.test(x, y)
 229 t.test(x ~ g)
 230 t.test(x, y, var.equal = T)
 231 
 232 
 233 ######################################################################
 234 ####################### Exercises  ###################################
 235 ######################################################################
 236 
 237 ## use a t-test to compare TTime according to Stim.Type,
 238 ## visualize it. What is the problem?
 239 
 240 t.test(data$TTime ~ data$Stim.Type)
 241 
 242 ggplot(data,aes(x=Stim.Type,y=TTime)) +
 243     geom_boxplot()
 244 
 245 ## now do the same for Subject 1 on pre and post test (use filter()
 246 ## or indexing to get the resp. subsets)
 247 
 248 t.test(data$TTime[data$Subject==1 & data$testid=="test1"] ~
 249        data$Stim.Type[data$Subject==1 & data$testid=="test1"])
 250 
 251 t.test(data$TTime[data$Subject==1 & data$testid=="test2"] ~
 252        data$Stim.Type[data$Subject==1 & data$testid=="test2"])
 253 
 254 
 255 
 256 
 257 ## use the following code to do the test on every subset Subject
 258 ## and testid, try to figure what is happening in each step:
 259 
 260 data$Stim.Type <- droplevels(data$Stim.Type)
 261 
 262 data.l <- split(data,list(data$Subject,data$testid))
 263 
 264 tmp.l <- lapply(data.l,function(x) {
 265     if(min(table(x$Stim.Type)) < 5) return(NULL)
 266     tob <- t.test(x$TTime ~ x$Stim.Type)
 267     tmp <- data.frame(
 268         Subject = unique(x$Subject),
 269         testid = unique(x$testid),
 270         mean.group.1 = tob$estimate[1],
 271         mean.group.2 = tob$estimate[2],
 272         name.test.stat = tob$statistic,
 273         conf.lower = tob$conf.int[1],
 274         conf.upper = tob$conf.int[2],
 275         pval = tob$p.value,
 276         alternative = tob$alternative,
 277         tob$method)})
 278 
 279 res <- Reduce(rbind,tmp.l)
 280 
 281 ## plots
 282 
 283 ggplot(data,aes(x=testid,y=TTime)) +
 284     geom_boxplot(aes(fill=Stim.Type)) +
 285     facet_wrap(~Subject)
 286 
 287 
 288 ggplot(data,aes(x=factor(Subject),y=TTime)) +
 289     geom_boxplot(aes(fill=Stim.Type)) +
 290     facet_wrap(~testid)
 291 
 292 
 293 ## how many tests have a statistically significant result?
 294 table(res$pval < 0.05)
 295 
 296 prop.table(table(res$pval < 0.05))
 297 
 298 ## Is there a tendency? What could be the next step?
 299 
 300 
 301 ## hypothesis: if the child tries to answer correctly it thinks about it 
 302 ## if it does not know the answer immediately. So the difference of
 303 ## TTime resp. correct/incorrect should be higher as the percentage of
 304 ## correct answers is higher
 305 
 306 
 307 tmp.l <- lapply(data.l,function(x) {
 308     if(min(table(x$Stim.Type)) < 5) return(NULL)
 309     tob <- t.test(x$TTime ~ x$Stim.Type)
 310     tmp <- data.frame(
 311         Subject = unique(x$Subject),
 312         testid = unique(x$testid),
 313         perc.corr = sum(x$Stim.Type=="hit")/sum(!is.na(x$Stim.Type)),
 314         mean.group.1 = tob$estimate[1],
 315         mean.group.2 = tob$estimate[2],
 316         name.test.stat = tob$statistic,
 317         conf.lower = tob$conf.int[1],
 318         conf.upper = tob$conf.int[2],
 319         pval = tob$p.value,
 320         alternative = tob$alternative,
 321         tob$method)})
 322 
 323 res <- Reduce(rbind,tmp.l)
 324 
 325 ggplot(res,aes(x=perc.corr,y=mean.group.1 - mean.group.2)) +
 326     geom_point() +
 327     geom_smooth(se=F) +
 328     geom_smooth(data = res[res$perc.corr < 0.65,],se=F, method="lm",colour="black") +
 329     geom_smooth(data = res[res$perc.corr > 0.65,],se=F, method="lm",colour="black") +
 330     annotate("rect",xmin=0.65,xmax=Inf,ymin=-Inf,ymax = Inf,fill="blue",alpha=0.1) + annotate("segment",x=0.65,xend=0.65,y=-20000,yend=-5000,size=3,arrow=arrow())
 331 
 332 
 333 ########################################################################
 334 ############## Exercises stats ggplot2    ##############################
 335 ########################################################################
 336 
 337 
 338 require(Hmisc)
 339 ggplot(data,aes(x=testid,y=TTime)) +
 340     stat_summary(fun.data="mean_se",mult=1.96,geom="pointrange") +
 341     stat_bin(y=0,aes(label=..count..),geom="text",position="identity") +
 342     scale_y_continuous(limits=c(0,75000)) +
 343     facet_wrap(~Subject)
 344 
 345 tdf <- data.frame(x=runif(1000),y=runif(1000))
 346 
 347 ggplot(tdf,aes(x=x,y=y)) + stat_quantile()
 348
Quick Links

Search Wiki

Page Tools

Dateianhang 'session4code.r'

Gespeicherte Dateianhänge