welcome: please sign in
location: attachment:session4code.r von RstatisTik/RstatisTikPortal/RcourSe

Dateianhang 'session4code.r'

Herunterladen

   1 setwd("/media/mvogel/Volume/transcend/mpicbs/2015kurs/session4")
   2 
   3 ## load the data (file: session4data.rdata)
   4 ## make a new summary data frame (per subject and time) containing:
   5 ###### the number of trials
   6 ###### the number correct trials (absolute and relative)
   7 ###### the mean TTime and the standard deviation of TTime
   8 ###### the respective standard error of the mean
   9 ##  keep the information about Sex and Age\_PRETEST
  10 
  11 load("session4data.rdata")
  12 
  13 require(dplyr)
  14 sumdf <- data %>%
  15     group_by(Subject,Sex,Age_PRETEST,testid) %>%
  16     summarise(count=n(),
  17               n.corr = sum(Stim.Type=="hit"),
  18               perc.corr = n.corr/count,
  19               mean.ttime = mean(TTime),
  20               sd.ttime = sd(TTime),
  21               se.ttime = sd.ttime/sqrt(count))
  22 
  23 
  24 ## make a plot with time on the x-axis and TTime on the y-axis
  25 ## showing the means and the 95\% confidence intervals (geom_pointrange())
  26 ## hint: you can use operations inside aes()
  27 
  28 require(ggplot2)
  29 
  30 p1 <- ggplot(sumdf,aes(x=testid,
  31                  y=mean.ttime,
  32                  ymin=mean.ttime - 1.96*se.ttime,
  33                  ymax=mean.ttime + 1.96*se.ttime)) +
  34        geom_pointrange() +
  35        facet_wrap(~Subject)
  36 
  37 ## second variant
  38 
  39 sumdf$conlow <- sumdf$mean.ttime - 1.96 * sumdf$se.ttime
  40 sumdf$conup <- sumdf$mean.ttime + 1.96 * sumdf$se.ttime
  41 
  42 require(ggplot2)
  43 p1 <- ggplot(sumdf,aes(x=testid,
  44                  y=mean.ttime,
  45                  ymin=conlow,
  46                  ymax=conup)) +
  47        geom_pointrange() +
  48        facet_wrap(~Subject)
  49 
  50 
  51 ## add the number of trials and the percentage of
  52 ## correct ones using geom\_text()
  53 
  54 p1 + geom_text(y=0,aes(label=n.corr),angle=90,size=3) +
  55     geom_text(y=60000,aes(label=round(perc.corr,2)),angle=90,size=3) +
  56     scale_y_continuous(limits=c(0,75000))
  57 
  58 
  59 require(stringr)
  60 p1 + geom_text(y=0, aes(label=n.corr),size=4) +
  61     geom_text(y=70000,
  62               aes(label=str_pad(round(perc.corr,2),width = 4,side = "right",pad=0)),
  63               angle=90,hjust=1,size=4) +
  64     scale_y_continuous(limits=c(0,70000))
  65 
  66 
  67 ## Write a function which takes a vector, the poulation standard deviation
  68 ## and the population mean as arguments
  69 ## and which gives the Z score as result. 
  70 
  71 
  72 ztest <- function(x,x.sd,mu=0){
  73     sqrt(length(x)) * (mean(x)-mu)/x.sd
  74 }
  75 
  76 set.seed(1)
  77 ztest(rnorm(100),x.sd = 1)
  78 
  79 ## add a line to your function that allows you to also process numeric
  80 ## vectors containing missing values!
  81 
  82 ztest <- function(x,x.sd,mu=0){
  83     x <- x[!is.na(x)]
  84     if(length(x) < 3) stop("too few values in x")
  85     sqrt(length(x)) * (mean(x)-mu)/x.sd
  86 }
  87 
  88 set.seed(1)
  89 ztest(rnorm(100),x.sd = 1)
  90 
  91 ## the function pnorm(Z) gives the probability of x <= Z. Change your
  92 ## function so has the p-value as result.
  93 
  94 ztest <- function(x,x.sd,mu=0){
  95     x <- x[!is.na(x)]
  96     if(length(x) < 3) stop("too few values in x")
  97     z <- sqrt(length(x)) * (mean(x)-mu)/x.sd
  98     2*pnorm(-abs(z))
  99 }
 100 
 101 set.seed(1)
 102 ztest(rnorm(100),x.sd = 1)
 103 
 104 ## now let the result be a named vector containing the estimated
 105 ## difference, Z, p and the n.
 106 
 107 
 108 ztest <- function(x,x.sd,mu=0){
 109     x <- x[!is.na(x)]
 110     if(length(x) < 3) stop("too few values in x")
 111     est.diff <- mean(x)-mu
 112     z <- sqrt(length(x)) * (est.diff)/x.sd
 113     c(diff=est.diff,Z=z,pval=2*pnorm(-abs(z)),n=length(x))
 114 }
 115 
 116 set.seed(1)
 117 ztest(rnorm(100),x.sd = 1)
 118 
 119 ######################################################################
 120 ####################### Simulation exerc #############################
 121 ######################################################################
 122 
 123 ## Now sample 100 values from a Normal distribution with mean 10
 124 ## and standard deviation 2 and use a z-test to compare it against
 125 ## the population mean 10. What is the p-value?
 126 
 127 ztest(rnorm(100,mean=10,sd=2),x.sd=2,mu=10)["pval"]
 128 ztest(rnorm(100,mean=10,sd=2),x.sd=2,mu=10)["diff"]
 129 ztest(rnorm(100,mean=10,sd=2),x.sd=2,mu=10)[c("pval","diff")]
 130 
 131 
 132 
 133 ## Now do the sampling and the testing 1000 times, what would be the
 134 ## number of statistically significant results? Use replicate()
 135 ## (which is a wrapper of tapply()) or a for() loop! Record at
 136 ## least the p-values and estimated differences! Transform the
 137 ## result into a data frame.
 138 
 139 ### replicate()
 140 res <- replicate(1000, ztest(rnorm(100,mean=10,sd=2),x.sd=2,mu=10))
 141 res <- as.data.frame(t(res))
 142 
 143 
 144 ### replicate(,simplify=F) 
 145 res <- replicate(1000, ztest(rnorm(100,mean=10,sd=2),x.sd=2,mu=10),simplify = F)
 146 res <- as.data.frame(Reduce(rbind,res))
 147 
 148 
 149 ## for() loop
 150 res <- matrix(numeric(2000),ncol=2)
 151 for(i in seq.int(1000)){
 152     res[i,] <- ztest(rnorm(100,mean=10,sd=2),x.sd=2,mu=10)[c("pval","diff")] }
 153 res <- as.data.frame(res)
 154 xnames(res) <- c("pval","diff")
 155 
 156 ## Use table() to count the p-vals below 0.05.
 157 table(res$pval < 0.05)
 158 
 159 ## What is the smallest absolute difference with a p-value below 0.05?
 160 tapply(abs(res$diff),res$pval < 0.05,summary)
 161 min(abs(res$diff[res$pval<0.05]))
 162 
 163 ## Repeat the last exercise, only change the sample size to 1000 in each of the 1000 samples! How many p-value below 0.05? What is now the smallest absolute difference with a p-value below 0.05?
 164 res2 <- replicate(1000, ztest(rnorm(1000,mean=10,sd=2),
 165                              x.sd=2,mu=10))
 166 res2 <- as.data.frame(t(res2))
 167 
 168 ## Use table() to count the p-vals below 0.05.
 169 table(res2$pval < 0.05)
 170 
 171 ## What is the smallest absolute difference with a p-value below 0.05?
 172 tapply(abs(res2$diff),res$pval < 0.05,summary)
 173 
 174 ## Concatenate the both resulting data frames from above using rbind()
 175 res <- rbind(res,res2)
 176 
 177 ## Plot the distributions of the pvals and the difference per
 178 ## sample size. Use \texttt{ggplot2} with an appropriate
 179 ## geom (density/histogram)
 180 
 181 
 182 require(ggplot2)
 183 ggplot(res,aes(x=pval)) +
 184     geom_histogram(bin=0.1,fill="forestgreen") +
 185     facet_grid(~ n)
 186 
 187 ggsave("hist.png")
 188 
 189 ggplot(res,aes(x=diff,colour=factor(n))) +
 190     geom_density(size=3)
 191 
 192 ggsave("dens.png")
 193 
 194 
 195 ggplot(res,aes(x=diff,y=pval)) +
 196     geom_point() +
 197     geom_hline(yintercept=0.05) +
 198     facet_grid(~ n)
 199 
 200 ggsave("point.png")
 201 
 202 ggplot(res,aes(x=diff,y=pval)) +
 203     geom_hex() +
 204     geom_hline(yintercept=0.05) +
 205     facet_grid(~ n)
 206 
 207 ggsave("dens2d.png")
 208 
 209 ######################################################################
 210 ####################### T-Test #######################################
 211 ######################################################################
 212 
 213 
 214 ## one sample
 215 set.seed(1)
 216 x <- rnorm(12)
 217 
 218 t.test(x,mu=0)
 219 
 220 t.test(x,mu=1)
 221 
 222 
 223 ## two sample Welch or Satterthwaite test 
 224 set.seed(1)
 225 x <- rnorm(12)
 226 y <- rnorm(12)
 227 g <- sample(c("A","B"),12,replace = T)
 228 t.test(x, y)
 229 t.test(x ~ g)
 230 t.test(x, y, var.equal = T)
 231 
 232 
 233 ######################################################################
 234 ####################### Exercises  ###################################
 235 ######################################################################
 236 
 237 ## use a t-test to compare TTime according to Stim.Type,
 238 ## visualize it. What is the problem?
 239 
 240 t.test(data$TTime ~ data$Stim.Type)
 241 
 242 ggplot(data,aes(x=Stim.Type,y=TTime)) +
 243     geom_boxplot()
 244 
 245 ## now do the same for Subject 1 on pre and post test (use filter()
 246 ## or indexing to get the resp. subsets)
 247 
 248 t.test(data$TTime[data$Subject==1 & data$testid=="test1"] ~
 249        data$Stim.Type[data$Subject==1 & data$testid=="test1"])
 250 
 251 t.test(data$TTime[data$Subject==1 & data$testid=="test2"] ~
 252        data$Stim.Type[data$Subject==1 & data$testid=="test2"])
 253 
 254 
 255 
 256 
 257 ## use the following code to do the test on every subset Subject
 258 ## and testid, try to figure what is happening in each step:
 259 
 260 data$Stim.Type <- droplevels(data$Stim.Type)
 261 
 262 data.l <- split(data,list(data$Subject,data$testid))
 263 
 264 tmp.l <- lapply(data.l,function(x) {
 265     if(min(table(x$Stim.Type)) < 5) return(NULL)
 266     tob <- t.test(x$TTime ~ x$Stim.Type)
 267     tmp <- data.frame(
 268         Subject = unique(x$Subject),
 269         testid = unique(x$testid),
 270         mean.group.1 = tob$estimate[1],
 271         mean.group.2 = tob$estimate[2],
 272         name.test.stat = tob$statistic,
 273         conf.lower = tob$conf.int[1],
 274         conf.upper = tob$conf.int[2],
 275         pval = tob$p.value,
 276         alternative = tob$alternative,
 277         tob$method)})
 278 
 279 res <- Reduce(rbind,tmp.l)
 280 
 281 ## plots
 282 
 283 ggplot(data,aes(x=testid,y=TTime)) +
 284     geom_boxplot(aes(fill=Stim.Type)) +
 285     facet_wrap(~Subject)
 286 
 287 
 288 ggplot(data,aes(x=factor(Subject),y=TTime)) +
 289     geom_boxplot(aes(fill=Stim.Type)) +
 290     facet_wrap(~testid)
 291 
 292 
 293 ## how many tests have a statistically significant result?
 294 table(res$pval < 0.05)
 295 
 296 prop.table(table(res$pval < 0.05))
 297 
 298 ## Is there a tendency? What could be the next step?
 299 
 300 
 301 ## hypothesis: if the child tries to answer correctly it thinks about it 
 302 ## if it does not know the answer immediately. So the difference of
 303 ## TTime resp. correct/incorrect should be higher as the percentage of
 304 ## correct answers is higher
 305 
 306 
 307 tmp.l <- lapply(data.l,function(x) {
 308     if(min(table(x$Stim.Type)) < 5) return(NULL)
 309     tob <- t.test(x$TTime ~ x$Stim.Type)
 310     tmp <- data.frame(
 311         Subject = unique(x$Subject),
 312         testid = unique(x$testid),
 313         perc.corr = sum(x$Stim.Type=="hit")/sum(!is.na(x$Stim.Type)),
 314         mean.group.1 = tob$estimate[1],
 315         mean.group.2 = tob$estimate[2],
 316         name.test.stat = tob$statistic,
 317         conf.lower = tob$conf.int[1],
 318         conf.upper = tob$conf.int[2],
 319         pval = tob$p.value,
 320         alternative = tob$alternative,
 321         tob$method)})
 322 
 323 res <- Reduce(rbind,tmp.l)
 324 
 325 ggplot(res,aes(x=perc.corr,y=mean.group.1 - mean.group.2)) +
 326     geom_point() +
 327     geom_smooth(se=F) +
 328     geom_smooth(data = res[res$perc.corr < 0.65,],se=F, method="lm",colour="black") +
 329     geom_smooth(data = res[res$perc.corr > 0.65,],se=F, method="lm",colour="black") +
 330     annotate("rect",xmin=0.65,xmax=Inf,ymin=-Inf,ymax = Inf,fill="blue",alpha=0.1) + annotate("segment",x=0.65,xend=0.65,y=-20000,yend=-5000,size=3,arrow=arrow())
 331 
 332 
 333 ########################################################################
 334 ############## Exercises stats ggplot2    ##############################
 335 ########################################################################
 336 
 337 
 338 require(Hmisc)
 339 ggplot(data,aes(x=testid,y=TTime)) +
 340     stat_summary(fun.data="mean_se",mult=1.96,geom="pointrange") +
 341     stat_bin(y=0,aes(label=..count..),geom="text",position="identity") +
 342     scale_y_continuous(limits=c(0,75000)) +
 343     facet_wrap(~Subject)
 344 
 345 tdf <- data.frame(x=runif(1000),y=runif(1000))
 346 
 347 ggplot(tdf,aes(x=x,y=y)) + stat_quantile()
 348 

Gespeicherte Dateianhänge

Um Dateianhänge in eine Seite einzufügen sollte unbedingt eine Angabe wie attachment:dateiname benutzt werden, wie sie auch in der folgenden Liste der Dateien erscheint. Es sollte niemals die URL des Verweises ("laden") kopiert werden, da sich diese jederzeit ändern kann und damit der Verweis auf die Datei brechen würde.
  • [laden | anzeigen] (2015-03-01 15:22:07, 5.1 KB) [[attachment:data.zip]]
  • [laden | anzeigen] (2015-04-14 06:00:29, 2.4 KB) [[attachment:extractpundreshape.r]]
  • [laden | anzeigen] (2025-04-29 01:36:53, 1.0 KB) [[attachment:latex_052ab3c2b91917f116074bdf465a245c9a74ebb6_p1.png]]
  • [laden | anzeigen] (2015-04-14 05:59:46, 0.5 KB) [[attachment:readpresent2.r]]
  • [laden | anzeigen] (2015-03-15 20:12:10, 3.2 KB) [[attachment:readpresentation_0.1.tar.gz]]
  • [laden | anzeigen] (2015-03-31 09:57:32, 10.1 KB) [[attachment:session4code.r]]
 Alle Dateien | Ausgewählte Dateien: löschen verschieben auf Seite copy to page

Sie dürfen keine Anhänge an diese Seite anhängen!