#!/usr/bin/Rscript # Genomik und Bioinformatik II # Projekt 1 (Microarray analysis and simulation of data) # written by Loibl, Norgall, Stemmer library(Biobase) library(ALL) data(ALL) # load data e <- exprs(ALL) p <- pData(ALL) summary(pData(ALL)) # plot different simulated distributions (and original data to compare) dev.new() plot(density(rlnorm(5000,1.65,0.35)), main="Plot of different simulated distributions"); lines(density(e), col=2) lines(density(rnorm(50000,5.6,sd(e))), col=3) lines(density(rgamma(5000,6)), col=4) legend(10,0.15,c("log normal distribution", "orignal data", "normal distribution", "gamma distribution"), col=1:4, pch=19) # simulate data (using log normal distribution) sim <- rlnorm(prod(dim(e)),1.65,0.35) dim(sim) <- dim(e) dev.new() qqplot(sample(e, 10000),sample(sim, 10000),main="qq-plot of original and simulated data") #qqplot(e,sim,main="qq-plot of original and simulated data") # full data set - grab a coffee, this will take some time... ;-) abline(c(1,1), col=3) # correlation coefficient corr <- function(x,y) sum((x-mean(x))*(y-mean(y))) / sqrt(sum((x-mean(x))^2)*sum((y-mean(y))^2)) correlatedGenes <- sapply(1:dim(e)[1], function(row) corr(e[row,], sim[row,])) dev.new() plot(density(correlatedGenes), main="Distribution of the pairwise correlation between\nthe original and the simulated data for all genes") abline(v=mean(correlatedGenes), col=3) # plot expressions of a gene for all patients plot_expressions <- function(genes, legend=FALSE, fn=FALSE, xlab="Patient", ylab="Expression Level") { if(!is.function(fn)) fn <- function(x) x for(gene in genes) { plot(fn(e[gene,]), col=1+(substr(p$BT,1,1)=="T"), pch=(p$sex=="F")*18+1, xlab=xlab, ylab=ylab, main=paste("Gene", gene)) if(legend) { legend(100,5,c("male","female"), pch=c(1,19)) legend(100,6.5,c("B-cell ALL","T-cell ALL"), col=c(1,2), pch=19) } } } # plot the given sample probesets dev.new() plot_expressions("38355_at", TRUE) dev.new() plot_expressions("38319_at", TRUE) # density dev.new() plot_expressions("38355_at", legend=FALSE, fn=density, ylab="Density", xlab="") dev.new() plot_expressions("38319_at", legend=FALSE, fn=density, ylab="Density", xlab="") # find interesting genes without knowledge about the data interestingGenes <- names(tail(sort(apply(e,1,var)), 16)) dev.new() par(mfrow=c(4,4)) plot_expressions(interestingGenes) # find similar genes: # genes that differ between male and female patients cat("Genes that differ between male and female patients:", paste(names(tail(sort(abs(apply(e[,which(p$sex=="F")], 1, mean) - apply(e[,which(p$sex=="M")], 1, mean)))))), "\n") dev.new(width=7, height=12) par(mfrow=c(4,2)) plot_expressions(names(tail(sort(abs(apply(e[,which(substr(p$BT,1,1)=="B")], 1, mean) - apply(e[,which(substr(p$BT,1,1)=="T")], 1, mean)))))) par(xpd=NA) legend(0,-2,c("male","female"), pch=c(1,19)) legend(50,-2,c("B-cell ALL","T-cell ALL"), col=c(1,2), pch=19) # genes that differ between B-cell ALL and T-cell ALL cat("Genes that differ between B-cell ALL and T-cell ALL:", paste(names(tail(sort(abs(apply(e[,which(substr(p$BT,1,1)=="B")], 1, mean) - apply(e[,which(substr(p$BT,1,1)=="T")], 1, mean)))))), "\n") dev.new(width=7, height=12) par(mfrow=c(4,2)) plot_expressions(names(tail(sort(abs(apply(e[,which(p$sex=="F")], 1, mean) - apply(e[,which(p$sex=="M")], 1, mean)))))) par(xpd=NA) legend(0,-2,c("male","female"), pch=c(1,19)) legend(50,-2,c("B-cell ALL","T-cell ALL"), col=c(1,2), pch=19)