Dateianhang 'session4code.r'
Herunterladen 1 setwd("/media/mvogel/Volume/transcend/mpicbs/2015kurs/session4")
2
3 ## load the data (file: session4data.rdata)
4 ## make a new summary data frame (per subject and time) containing:
5 ###### the number of trials
6 ###### the number correct trials (absolute and relative)
7 ###### the mean TTime and the standard deviation of TTime
8 ###### the respective standard error of the mean
9 ## keep the information about Sex and Age\_PRETEST
10
11 load("session4data.rdata")
12
13 require(dplyr)
14 sumdf <- data %>%
15 group_by(Subject,Sex,Age_PRETEST,testid) %>%
16 summarise(count=n(),
17 n.corr = sum(Stim.Type=="hit"),
18 perc.corr = n.corr/count,
19 mean.ttime = mean(TTime),
20 sd.ttime = sd(TTime),
21 se.ttime = sd.ttime/sqrt(count))
22
23
24 ## make a plot with time on the x-axis and TTime on the y-axis
25 ## showing the means and the 95\% confidence intervals (geom_pointrange())
26 ## hint: you can use operations inside aes()
27
28 require(ggplot2)
29
30 p1 <- ggplot(sumdf,aes(x=testid,
31 y=mean.ttime,
32 ymin=mean.ttime - 1.96*se.ttime,
33 ymax=mean.ttime + 1.96*se.ttime)) +
34 geom_pointrange() +
35 facet_wrap(~Subject)
36
37 ## second variant
38
39 sumdf$conlow <- sumdf$mean.ttime - 1.96 * sumdf$se.ttime
40 sumdf$conup <- sumdf$mean.ttime + 1.96 * sumdf$se.ttime
41
42 require(ggplot2)
43 p1 <- ggplot(sumdf,aes(x=testid,
44 y=mean.ttime,
45 ymin=conlow,
46 ymax=conup)) +
47 geom_pointrange() +
48 facet_wrap(~Subject)
49
50
51 ## add the number of trials and the percentage of
52 ## correct ones using geom\_text()
53
54 p1 + geom_text(y=0,aes(label=n.corr),angle=90,size=3) +
55 geom_text(y=60000,aes(label=round(perc.corr,2)),angle=90,size=3) +
56 scale_y_continuous(limits=c(0,75000))
57
58
59 require(stringr)
60 p1 + geom_text(y=0, aes(label=n.corr),size=4) +
61 geom_text(y=70000,
62 aes(label=str_pad(round(perc.corr,2),width = 4,side = "right",pad=0)),
63 angle=90,hjust=1,size=4) +
64 scale_y_continuous(limits=c(0,70000))
65
66
67 ## Write a function which takes a vector, the poulation standard deviation
68 ## and the population mean as arguments
69 ## and which gives the Z score as result.
70
71
72 ztest <- function(x,x.sd,mu=0){
73 sqrt(length(x)) * (mean(x)-mu)/x.sd
74 }
75
76 set.seed(1)
77 ztest(rnorm(100),x.sd = 1)
78
79 ## add a line to your function that allows you to also process numeric
80 ## vectors containing missing values!
81
82 ztest <- function(x,x.sd,mu=0){
83 x <- x[!is.na(x)]
84 if(length(x) < 3) stop("too few values in x")
85 sqrt(length(x)) * (mean(x)-mu)/x.sd
86 }
87
88 set.seed(1)
89 ztest(rnorm(100),x.sd = 1)
90
91 ## the function pnorm(Z) gives the probability of x <= Z. Change your
92 ## function so has the p-value as result.
93
94 ztest <- function(x,x.sd,mu=0){
95 x <- x[!is.na(x)]
96 if(length(x) < 3) stop("too few values in x")
97 z <- sqrt(length(x)) * (mean(x)-mu)/x.sd
98 2*pnorm(-abs(z))
99 }
100
101 set.seed(1)
102 ztest(rnorm(100),x.sd = 1)
103
104 ## now let the result be a named vector containing the estimated
105 ## difference, Z, p and the n.
106
107
108 ztest <- function(x,x.sd,mu=0){
109 x <- x[!is.na(x)]
110 if(length(x) < 3) stop("too few values in x")
111 est.diff <- mean(x)-mu
112 z <- sqrt(length(x)) * (est.diff)/x.sd
113 c(diff=est.diff,Z=z,pval=2*pnorm(-abs(z)),n=length(x))
114 }
115
116 set.seed(1)
117 ztest(rnorm(100),x.sd = 1)
118
119 ######################################################################
120 ####################### Simulation exerc #############################
121 ######################################################################
122
123 ## Now sample 100 values from a Normal distribution with mean 10
124 ## and standard deviation 2 and use a z-test to compare it against
125 ## the population mean 10. What is the p-value?
126
127 ztest(rnorm(100,mean=10,sd=2),x.sd=2,mu=10)["pval"]
128 ztest(rnorm(100,mean=10,sd=2),x.sd=2,mu=10)["diff"]
129 ztest(rnorm(100,mean=10,sd=2),x.sd=2,mu=10)[c("pval","diff")]
130
131
132
133 ## Now do the sampling and the testing 1000 times, what would be the
134 ## number of statistically significant results? Use replicate()
135 ## (which is a wrapper of tapply()) or a for() loop! Record at
136 ## least the p-values and estimated differences! Transform the
137 ## result into a data frame.
138
139 ### replicate()
140 res <- replicate(1000, ztest(rnorm(100,mean=10,sd=2),x.sd=2,mu=10))
141 res <- as.data.frame(t(res))
142
143
144 ### replicate(,simplify=F)
145 res <- replicate(1000, ztest(rnorm(100,mean=10,sd=2),x.sd=2,mu=10),simplify = F)
146 res <- as.data.frame(Reduce(rbind,res))
147
148
149 ## for() loop
150 res <- matrix(numeric(2000),ncol=2)
151 for(i in seq.int(1000)){
152 res[i,] <- ztest(rnorm(100,mean=10,sd=2),x.sd=2,mu=10)[c("pval","diff")] }
153 res <- as.data.frame(res)
154 xnames(res) <- c("pval","diff")
155
156 ## Use table() to count the p-vals below 0.05.
157 table(res$pval < 0.05)
158
159 ## What is the smallest absolute difference with a p-value below 0.05?
160 tapply(abs(res$diff),res$pval < 0.05,summary)
161 min(abs(res$diff[res$pval<0.05]))
162
163 ## Repeat the last exercise, only change the sample size to 1000 in each of the 1000 samples! How many p-value below 0.05? What is now the smallest absolute difference with a p-value below 0.05?
164 res2 <- replicate(1000, ztest(rnorm(1000,mean=10,sd=2),
165 x.sd=2,mu=10))
166 res2 <- as.data.frame(t(res2))
167
168 ## Use table() to count the p-vals below 0.05.
169 table(res2$pval < 0.05)
170
171 ## What is the smallest absolute difference with a p-value below 0.05?
172 tapply(abs(res2$diff),res$pval < 0.05,summary)
173
174 ## Concatenate the both resulting data frames from above using rbind()
175 res <- rbind(res,res2)
176
177 ## Plot the distributions of the pvals and the difference per
178 ## sample size. Use \texttt{ggplot2} with an appropriate
179 ## geom (density/histogram)
180
181
182 require(ggplot2)
183 ggplot(res,aes(x=pval)) +
184 geom_histogram(bin=0.1,fill="forestgreen") +
185 facet_grid(~ n)
186
187 ggsave("hist.png")
188
189 ggplot(res,aes(x=diff,colour=factor(n))) +
190 geom_density(size=3)
191
192 ggsave("dens.png")
193
194
195 ggplot(res,aes(x=diff,y=pval)) +
196 geom_point() +
197 geom_hline(yintercept=0.05) +
198 facet_grid(~ n)
199
200 ggsave("point.png")
201
202 ggplot(res,aes(x=diff,y=pval)) +
203 geom_hex() +
204 geom_hline(yintercept=0.05) +
205 facet_grid(~ n)
206
207 ggsave("dens2d.png")
208
209 ######################################################################
210 ####################### T-Test #######################################
211 ######################################################################
212
213
214 ## one sample
215 set.seed(1)
216 x <- rnorm(12)
217
218 t.test(x,mu=0)
219
220 t.test(x,mu=1)
221
222
223 ## two sample Welch or Satterthwaite test
224 set.seed(1)
225 x <- rnorm(12)
226 y <- rnorm(12)
227 g <- sample(c("A","B"),12,replace = T)
228 t.test(x, y)
229 t.test(x ~ g)
230 t.test(x, y, var.equal = T)
231
232
233 ######################################################################
234 ####################### Exercises ###################################
235 ######################################################################
236
237 ## use a t-test to compare TTime according to Stim.Type,
238 ## visualize it. What is the problem?
239
240 t.test(data$TTime ~ data$Stim.Type)
241
242 ggplot(data,aes(x=Stim.Type,y=TTime)) +
243 geom_boxplot()
244
245 ## now do the same for Subject 1 on pre and post test (use filter()
246 ## or indexing to get the resp. subsets)
247
248 t.test(data$TTime[data$Subject==1 & data$testid=="test1"] ~
249 data$Stim.Type[data$Subject==1 & data$testid=="test1"])
250
251 t.test(data$TTime[data$Subject==1 & data$testid=="test2"] ~
252 data$Stim.Type[data$Subject==1 & data$testid=="test2"])
253
254
255
256
257 ## use the following code to do the test on every subset Subject
258 ## and testid, try to figure what is happening in each step:
259
260 data$Stim.Type <- droplevels(data$Stim.Type)
261
262 data.l <- split(data,list(data$Subject,data$testid))
263
264 tmp.l <- lapply(data.l,function(x) {
265 if(min(table(x$Stim.Type)) < 5) return(NULL)
266 tob <- t.test(x$TTime ~ x$Stim.Type)
267 tmp <- data.frame(
268 Subject = unique(x$Subject),
269 testid = unique(x$testid),
270 mean.group.1 = tob$estimate[1],
271 mean.group.2 = tob$estimate[2],
272 name.test.stat = tob$statistic,
273 conf.lower = tob$conf.int[1],
274 conf.upper = tob$conf.int[2],
275 pval = tob$p.value,
276 alternative = tob$alternative,
277 tob$method)})
278
279 res <- Reduce(rbind,tmp.l)
280
281 ## plots
282
283 ggplot(data,aes(x=testid,y=TTime)) +
284 geom_boxplot(aes(fill=Stim.Type)) +
285 facet_wrap(~Subject)
286
287
288 ggplot(data,aes(x=factor(Subject),y=TTime)) +
289 geom_boxplot(aes(fill=Stim.Type)) +
290 facet_wrap(~testid)
291
292
293 ## how many tests have a statistically significant result?
294 table(res$pval < 0.05)
295
296 prop.table(table(res$pval < 0.05))
297
298 ## Is there a tendency? What could be the next step?
299
300
301 ## hypothesis: if the child tries to answer correctly it thinks about it
302 ## if it does not know the answer immediately. So the difference of
303 ## TTime resp. correct/incorrect should be higher as the percentage of
304 ## correct answers is higher
305
306
307 tmp.l <- lapply(data.l,function(x) {
308 if(min(table(x$Stim.Type)) < 5) return(NULL)
309 tob <- t.test(x$TTime ~ x$Stim.Type)
310 tmp <- data.frame(
311 Subject = unique(x$Subject),
312 testid = unique(x$testid),
313 perc.corr = sum(x$Stim.Type=="hit")/sum(!is.na(x$Stim.Type)),
314 mean.group.1 = tob$estimate[1],
315 mean.group.2 = tob$estimate[2],
316 name.test.stat = tob$statistic,
317 conf.lower = tob$conf.int[1],
318 conf.upper = tob$conf.int[2],
319 pval = tob$p.value,
320 alternative = tob$alternative,
321 tob$method)})
322
323 res <- Reduce(rbind,tmp.l)
324
325 ggplot(res,aes(x=perc.corr,y=mean.group.1 - mean.group.2)) +
326 geom_point() +
327 geom_smooth(se=F) +
328 geom_smooth(data = res[res$perc.corr < 0.65,],se=F, method="lm",colour="black") +
329 geom_smooth(data = res[res$perc.corr > 0.65,],se=F, method="lm",colour="black") +
330 annotate("rect",xmin=0.65,xmax=Inf,ymin=-Inf,ymax = Inf,fill="blue",alpha=0.1) + annotate("segment",x=0.65,xend=0.65,y=-20000,yend=-5000,size=3,arrow=arrow())
331
332
333 ########################################################################
334 ############## Exercises stats ggplot2 ##############################
335 ########################################################################
336
337
338 require(Hmisc)
339 ggplot(data,aes(x=testid,y=TTime)) +
340 stat_summary(fun.data="mean_se",mult=1.96,geom="pointrange") +
341 stat_bin(y=0,aes(label=..count..),geom="text",position="identity") +
342 scale_y_continuous(limits=c(0,75000)) +
343 facet_wrap(~Subject)
344
345 tdf <- data.frame(x=runif(1000),y=runif(1000))
346
347 ggplot(tdf,aes(x=x,y=y)) + stat_quantile()
348
Gespeicherte Dateianhänge
Um Dateianhänge in eine Seite einzufügen sollte unbedingt eine Angabe wie attachment:dateiname benutzt werden, wie sie auch in der folgenden Liste der Dateien erscheint. Es sollte niemals die URL des Verweises ("laden") kopiert werden, da sich diese jederzeit ändern kann und damit der Verweis auf die Datei brechen würde.Sie dürfen keine Anhänge an diese Seite anhängen!