#**************************************************************************************** # # Gene co-Expression Network Analysis Package # # # Objective: # Automatic construction of gene co-expression networks & # Decomposition of a network into tightly co-expressed modules # # Input: DNA microarray data file # # Output: Topological overlap matrix (TOM) and gene co-expressed modules # # Authors: Bin Zhang, Steve Horvath # Array Data Analysis Group (ADAG) # Departments of Human Genetics and Biostatistics # University of California at Los Angeles # # Contact: shorvath@mednet.ucla.edu and Bin Zhang [binzhang.ucla@gmail.com] # # Copyright @2004-2008 ADAG,UCLA # # Comment by Steve Horvath: This code is very similar to the network code posted in the tutorial of our network webpage: # http://www.genetics.ucla.edu/labs/horvath/GeneralFramework/ # # Main differences: # 1) this code is not interactive. It allows you to run a batch analysis of the network. # And it outputs all graphs and figures into a directory. # 2) The code is older than the one at our main webpage. # 3) This code underlies the figures and tables in the following manuscript # Carlson MRJ, Zhang B, Fang Z, Mischel PS, Horvath S, and Nelson SF (2006) # Gene Connectivity, Function, and Sequence Conservation: Predictions from Modular Yeast Co-Expression Networks. # Biomed Central Genomics. # 4) It contains the function cutreeDynamic for dynamic tree cutting see function cutreeDynamic. #**************************************************************************************** #------------------------------------------------------------------------------------------------------------------ # Function: cut hierarchical clusering tree, base don internal structure of ordered dendrogram # use the information of dendrogram height with the mean height of a module to determine the cut points, # if this doesn't split the module, try the differences with (max_height+mean_height)/2 # the whole process can be iterated until number of clusters become stable (if deepSplit is set as TRUE) # A description of the details of the algorithm could be found at # http://www.genetics.ucla.edu/labs/horvath/binzhang/DynamicTreeCut. # The algorithm makes use of the structure of the dendrogram. The algorithm is based on an adaptive process of cluster decomposition # and combination. The process is iterated until the number of clusters becomes stable. # Input: # hierclust ~ hierarchical clustering object # deepSplit ~ perform iterative searching of sub clusters if set to be TRUE, otherwise, do # minModuleSize ~ min size of module # minAttachModuleSize ~ min size of module to be attached, as a major module with larger size # startpoint of current module # Output: module colors for every node # #------------------------------------------------------------------------------------------------------------------ cutreeDynamic = function(hierclust, maxTreeHeight=1, deepSplit=TRUE, minModuleSize=50, minAttachModuleSize=100) { #dim(hierclust$merge) #length(hierclust$height) #length(hierclust$order) #hierclust$merge[1:10, ] #round(hierclust$height[1:20], 2) #orderHei = hierclust$height[hierclust$order] #orderMerge = hierclust$merge[, ] #plclust(hierclust, labels=F, xlab="",ylab="",main="",sub="") if(maxTreeHeight >=1){ staticCutCluster = cutTreeStatic(hiercluster=hierclust, heightcutoff=0.99, minsize1=minModuleSize) }else{ staticCutCluster = cutTreeStatic(hiercluster=hierclust, heightcutoff=maxTreeHeight, minsize1=minModuleSize) } #get tree height for every singleton #node_index tree_height demdroHeiAll= rbind( cbind(hierclust$merge[,1], hierclust$height), cbind(hierclust$merge[,2], hierclust$height) ) #singletons will stand at the front of the list myorder = order(demdroHeiAll[,1]) #get # of singletons no.singletons = length(hierclust$order) #> demdroHei.sort[1:10,] # [,1] [,2] #[1,] -3000 0.7943783 #[2,] -2999 0.7863851 demdroHeiAll.sort = demdroHeiAll[myorder, ] demdroHei.sort = demdroHeiAll.sort[c(1:no.singletons), ] #finally, we got tree height for each of the singleton inorder of 1 to no.singletons # [,1] [,2] #[1,] 1 0.8389184 #[2,] 2 0.8772433 #[3,] 3 0.8308482 demdroHei = demdroHei.sort[seq(no.singletons, 1, by=-1), ] demdroHei[,1] = -demdroHei[,1] # combine with prelimilary cluster-cutoff results demdroHei = cbind(demdroHei, as.integer(staticCutCluster)) # re-order the order based on the dendrogram order hierclust$order demdroHei.order = demdroHei[hierclust$order, ] # get start and end posiiton of every cluster # [1,] 173 315 # [2,] 676 793 static.clupos = locateCluster(demdroHei.order[, 3]) static.no = dim(static.clupos)[1] static.clupos2 = static.clupos static.no2 = static.no #split individual cluster if there are sub clusters embedded mcycle=1 while(1==1){ clupos = NULL for (i in c(1:static.no)){ mydemdroHei.order = demdroHei.order[ c(static.clupos[i,1]:static.clupos[i,2]), ] #index to [1, clusterSize] mydemdroHei.order[, 1] = mydemdroHei.order[, 1] - static.clupos[i, 1] + 1 #cat("Cycle ", as.character(mcycle), "cluster (", static.clupos[i,1], static.clupos[i,2], ")\n") #cat("i=", as.character(i), "\n") iclupos = processInvididualCluster(mydemdroHei.order, cminModuleSize = minModuleSize, cminAttachModuleSize = minAttachModuleSize) iclupos[,1] = iclupos[,1] + static.clupos[i, 1] -1 #recover the original index iclupos[,2] = iclupos[,2] + static.clupos[i, 1] -1 clupos = rbind(clupos, iclupos) #put in the final output buffer } if(deepSplit==FALSE){ break } if(dim(clupos)[1] != static.no) { static.clupos = clupos static.no = dim(static.clupos)[1] }else{ break } mcycle = mcycle + 1 #static.clupos } final.cnt = dim(clupos)[1] #assign colors for modules module.assign = rep(0, no.singletons) module.cnt=1 for (i in c(1:final.cnt )) { sdx = clupos[i, 1] #module start point edx = clupos[i, 2] #module end point module.size = edx - sdx +1 if(module.size 0) cuts.bool[1] = TRUE cuts.bool[no.cnodes] = TRUE if(sum(cuts.bool)==2){ if (useMean==0){ new.clupos=processInvididualCluster(clusterDemdroHei=clusterDemdroHei, cminModuleSize=cminModuleSize, cminAttachModuleSize=cminAttachModuleSize, useMean=1) }else if(useMean==1){ new.clupos=processInvididualCluster(clusterDemdroHei=clusterDemdroHei, cminModuleSize=cminModuleSize, cminAttachModuleSize=cminAttachModuleSize, useMean=-1) }else{ new.clupos = rbind(c(1, no.cnodes)) } return (new.clupos) } #a good candidate cluster-end point should have significant # of ahead nodes with head < meanHei cutindex =c(1:no.cnodes)[cuts.bool] no.cutps = length(cutindex) runlens = rep(999, no.cutps) cuts.bool2 = cuts.bool for(i in c(2:(no.cutps-1)) ){ seq = c( (cutindex[i-1]+1):cutindex[i] ) runlens[i] = runlengthSign(heidiff[seq], leftOrright=-1, mysign=-1) if(runlens[i] < minTailRunlength){ #cat("run length=", runlens[i], "\n") cuts.bool2[ cutindex[i] ] = FALSE } } #attach SMALL cluster to the left-side BIG cluster if the small one has smaller mean height cuts.bool3=cuts.bool2 if(sum(cuts.bool2) > 3) { curj = 2 while (1==1){ cutindex2 =c(1:no.cnodes)[cuts.bool2] no.clus = length(cutindex2) -1 if (curj>no.clus){ break } pre.sdx = cutindex2[ curj-1 ]+1 #previous module start point pre.edx = cutindex2[ curj ] #previous module end point pre.module.size = pre.edx - pre.sdx +1 pre.module.hei = mean(clusterDemdroHei[c(pre.sdx:pre.edx) , 2]) cur.sdx = cutindex2[ curj ]+1 #previous module start point cur.edx = cutindex2[ curj+1 ] #previous module end point cur.module.size = cur.edx - cur.sdx +1 cur.module.hei = mean(clusterDemdroHei[c(cur.sdx:cur.edx) , 2]) #merge to the leftside major module, don't change the current index "curj" #if( (pre.module.size >minAttachModuleSize)&(cur.module.hei 2){ if( (cutindex2[no.cutps] - cutindex2[no.cutps-1]+1) < cminModuleSize ){ cuts.bool2[ cutindex2[no.cutps-1] ] =FALSE } } if(1==2){ myseqnce = c(2300:3000) cutdisp = ifelse(cuts.bool2==T, "red","grey" ) #re-order to the normal one with sequential signleton index par(mfrow=c(3,1), mar=c(0,0,0,0) ) plot(h1row, labels=F, xlab="",ylab="",main="",sub="",axes = F) barplot(heidiff[myseqnce], col= "black", space=0, border=F,main="", axes = F, axisnames = F) barplot(height=rep(1, length(cutdisp[myseqnce])), col= as.character(cutdisp[myseqnce]), space=0, border=F,main="", axes = F, axisnames = F) par(mfrow=c(1,1), mar=c(5, 4, 4, 2) + 0.1) } cutindex2 = c(1:no.cnodes)[cuts.bool2] cutindex2[1]=cutindex2[1]-1 #the first no.cutps2 = length(cutindex2) if(no.cutps2 > 2){ new.clupos = cbind( cutindex2[c(1:(no.cutps2-1))]+1, cutindex2[c(2:no.cutps2)] ) }else{ new.clupos = cbind( 1, no.cnodes) } if ( dim(new.clupos)[1] == 1 ){ if (useMean==0){ new.clupos=processInvididualCluster(clusterDemdroHei=clusterDemdroHei, cminModuleSize=cminModuleSize, cminAttachModuleSize=cminAttachModuleSize, useMean=1) }else if(useMean==1){ new.clupos=processInvididualCluster(clusterDemdroHei=clusterDemdroHei, cminModuleSize=cminModuleSize, cminAttachModuleSize=cminAttachModuleSize, useMean=-1) } } new.clupos } findClustersSignificant=function(mysequence, modulecolor) { modnames= names( table(modulecolor) ) mysize = length(modulecolor) validseq = rep(TRUE, mysize) for (each in modnames ){ mybool = (modulecolor==each) mymodulesig = mysequence[mybool] mydiff = abs(mymodulesig - mymodulesig[1]) if(sum(mydiff)==0){ validseq = ifelse(mybool==TRUE, FALSE, validseq) } } validseq } #leftOrright >0 : running length (with same sign) to right, otherwise to the left #mysign = -1: negative value, mysign = -1: positive value runlengthSign = function(mysequence, leftOrright=-1, mysign=-1){ seqlen = length(mysequence) if(leftOrright<0){ pseq = rev(mysequence) }else{ pseq = mysequence } if(mysign<0){ #see where the first POSITIVE number occurs nonezero.bool = (pseq > 0) }else{ #see where the first NEGATIVE number occur nonezero.bool = (pseq < 0) } if( sum(nonezero.bool) > 0){ runlength = min( c(1:seqlen)[nonezero.bool] ) - 1 }else{ runlength = 0 } } #delta >0 : shift to right, otherwise to the left shiftSequence = function(mysequence, delta){ seqlen = length(mysequence) if(delta>0){ finalseq=c(mysequence[1:delta], mysequence[1:(seqlen-delta)]) }else{ posdelta = -delta finalseq=c(mysequence[(posdelta+1):seqlen], mysequence[(seqlen-posdelta+1):seqlen]) } finalseq } #no of neighbors behind and before the point used for average averageSequence=function(mysequence, noneighbors){ sumseq = mysequence for(i in c(1:noneighbors)){ iseq = shiftSequence(mysequence, i) sumseq = sumseq + iseq iseq = shiftSequence(mysequence, -i) sumseq = sumseq + iseq } sumseq = sumseq/(1+2*noneighbors) } #delta >0 : shift to right, otherwise to the left shiftSequence = function(mysequence, delta){ seqlen = length(mysequence) if(delta>0){ finalseq=c(mysequence[1:delta], mysequence[1:(seqlen-delta)]) }else{ posdelta = -delta finalseq=c(mysequence[(posdelta+1):seqlen], mysequence[(seqlen-posdelta+1):seqlen]) } finalseq } #find the middle of each cluster and label the middle position with the corrsponding color getDisplayColorSequence=function(colordered){ mylen = length(colordered) colordered2 = c(colordered[1], colordered[1:(mylen-1)] ) colordiff = (colordered != colordered2) colordiff[1] = TRUE colordiff[mylen] = TRUE #mydispcolor = ifelse(colordiff==TRUE, colordered, "") mydispcolor = rep("", mylen) mytrueseq = c(1:mylen)[colordiff] for (i in c(1:(length(mytrueseq)-1)) ){ midi = (mytrueseq[i] + mytrueseq[i+1])/2 mydispcolor[midi] = colordered[midi] } fdispcolor = ifelse(mydispcolor=="grey", "", mydispcolor) fdispcolor } #use height cutoff to remove cutTreeStatic = function(hiercluster,heightcutoff=0.99, minsize1=50) { # here we define modules by using a height cut-off for the branches labelpred= cutree(hiercluster,h=heightcutoff) sort1=-sort(-table(labelpred)) sort1 modulename= as.numeric(names(sort1)) modulebranch= sort1 > minsize1 no.modules=sum(modulebranch) colorhelp = rep(-1, length(labelpred) ) if ( no.modules==0){ print("No mudule detected\n") } else{ for (i in c(1:no.modules)) { colorhelp=ifelse(labelpred==modulename[i],i ,colorhelp) } } colorhelp } #locate the start/end positions of each cluster in the ordered cluster label sequence #where "-1" indicating no cluster #3-1 -1 1 1 1 1 2 2 2 #3 3 -1-1 1 1 1 1 2 2 2 (shift) #--------------------------------- #0-4 0 2 0 0 0 1 0 0 0 (difference) # * * @ locateCluster = function(clusterlabels) { no.nodes = length(clusterlabels) clusterlabels.shift = c(clusterlabels[1], c(clusterlabels[1:(no.nodes-1)]) ) #a non-zero point is the start point of a cluster and it previous point is the end point of the previous cluster label.diff = abs(clusterlabels - clusterlabels.shift) #process the first and last positions as start/end points if they belong to a cluster instead of no cluster "-1" if(clusterlabels[1] >0) {label.diff[1]=1} if(clusterlabels[no.nodes]>0) {label.diff[no.nodes]=1} flagpoints.bool = label.diff > 0 flagpoints = c(1:no.nodes)[flagpoints.bool] no.points = length(flagpoints) myclupos=NULL for(i in c(1:(no.points-1)) ){ idx = flagpoints[i] if(clusterlabels[idx]>0){ myclupos = rbind(myclupos, c(idx, flagpoints[i+1]-1) ) } } myclupos } #row-wise reorder a matrix based on "orderedList", default key column in the #matrix is the first column orderMergedMatrix = function(disorderMatrix, orderedList, keyCol=1){ no.samples = length(orderedList) cnt = 1 seqc =c(1:no.samples) rightOrder = rep(0, no.samples) orderedMatrix = NULL for(each in orderedList){ whichTrue = ( as.character(each)==as.character(disorderMatrix[, keyCol]) ) idx = sum(whichTrue * seqc) #cat(as.character(each), " : ", as.character(disorderMatrix[idx,keyCol]),as.character(idx), "\n" ) rightOrder[idx] = cnt cnt = cnt + 1 orderedMatrix = rbind(orderedMatrix, disorderMatrix[idx,]) } colnames(orderedMatrix) <- colnames(disorderMatrix) orderedMatrix } #------------------------------------------------------------------------- #Function: convert the upper #Input: # mvector ~ adjacency vector with first column is # the ROW index (i) of the vector in the original matrix A=[a(i,j)] # mcutoff ~ color codes (modules) for genes # mgraphfname ~ file for storing output tripples of (i, j, a(i,j)), j>i # mfudgefactor~ fudge factor to spread (<1) or compress (>1) a network #Output: tripples of (i, j, a(i,j)), j>i #------------------------------------------------------------------------- vectorToPairs=function(mvector, mcutoff, mgraphfname,mfudgefactor=1) { #mvector = kdatEdge[12, ] #mcutoff =cutoff #mgraphfname = graphfname i = as.integer(mvector[1]) #actual data vector datv = as.numeric(as.vector(mvector[-c(1)])) mno.nodes = length(datv) #node index index <- c((i+1):mno.nodes) av = datv[index] bv = av > mcutoff no.selected = sum(bv) ret=c(0,0,0) if(no.selected>0){ c1=rep(i, no.selected) c2=index[bv] c3=as.numeric(av[bv])^mfudgefactor ret = as.matrix(cbind(c1,c2,c3)) write.table(ret, mgraphfname, append=T, sep=" ", quote=F, col.names=F, row.names=F) } #cat(i," :",as.character(no.selected), "\n") ret } #orderMergedMatrix = function(disorderMatrix, orderedList, keyCol=1){ # orderM = order(disorderMatrix[keyCol, ]) # orderL = order(orderedList) # orderL2=order(orderL) # orderM2= orderM[orderL2] # disorderMatrix[orderM2, ] #} #------------------------------------------------------------------------- #Function: compute whithin-module cluster coefficient for each gene #Input: # adjMatrix ~ adjacency matrix # colorcodeC ~ color codes (modules) for genes #Output: in-module cluster coefficients #------------------------------------------------------------------------- computeModuleCC = function(adjMatrix, colorcodeC) { modnames= names( table(colorcodeC) ) no.genes = dim(adjMatrix)[1] clustercoeff = rep(0, no.genes) idxseq = c(1:no.genes) for (each in modnames ){ if(as.character(each)=="grey" ){ next } whichmod = each==colorcodeC icc = computeClusterCoefficient(adjMatrix[whichmod,whichmod]) #indices of genes in the current module idxmod = idxseq * whichmod #put the cc's to the clustercoeff[idxmod] = icc } clustercoeff } #------------------------------------------------------------------------- #Function: compute whithin-module number of connections for each gene #Input: # adjMatrix ~ adjacency matrix # colorcodeC ~ color codes (modules) for genes #Output: in-module number of connections of each gene #------------------------------------------------------------------------- computeModuleLinks = function(adjMatrix, colorcodeC, isAdjacency=TRUE) { modnames= names( table(colorcodeC) ) no.genes = dim(adjMatrix)[1] links = rep(0, no.genes) idxseq = c(1:no.genes) for (each in modnames ){ if(as.character(each)=="grey" ){ next } whichmod = each==colorcodeC module.size = sum(whichmod) modk <- apply(adjMatrix[whichmod,whichmod],2,sum, na.rm=TRUE) if(!isAdjacency){ modk <- (module.size -modk) } #indices of genes in the current module idxmod = idxseq * whichmod #put the links's to the buffer links[idxmod] = modk } links } coxSurvModel = function(exprvector, survector){ exprvector.num = as.numeric(exprvector) mpvalue=1 #for PM/MM truncated cases, a few changes across samples, cox model will crash levels = names( table(exprvector.num) ) if( (var(exprvector.num) >1) & (length(levels) >2) ) { cox1=coxph(survector ~ exprvector.num, na.action="na.omit") mpvalue=1-pchisq(cox1$score, 1) } mpvalue } coxSurvModelComplex = function(exprvector, survector){ exprvector.num = as.numeric(exprvector) mpvalue =1 sde = -1 lower.95=-1 upper.95=-1 hr =-1 #for PM/MM truncated cases, a few changes across samples, cox model will crash levels = names( table(exprvector.num) ) if( (var(exprvector.num) >1) & (length(levels) >2) ) { cox1=coxph(survector ~ exprvector.num, na.action="na.omit") mpvalue=1-pchisq(cox1$score, 1) groups=I(exprvector.num > median(exprvector.num) ) cox2 = coxph(survector ~ groups) sde= sqrt(cox2$var) lower.95=exp(cox2$coef-1.96*sde) upper.95=exp(cox2$coef+1.96*sde) hr =exp(cox2$coef) } outphr = c(mpvalue, hr, lower.95, upper.95, sde) } computeTOM= function(adjMatrix) { no.singletons <- dim(adjMatrix)[1] nolinks.reduced <- apply(adjMatrix, 2, sum, na.rm=TRUE) #Let’s calculate topological overlap matrix numTOM= adjMatrix %*% adjMatrix + adjMatrix dist1 = matrix(NA, no.singletons, no.singletons) diag(dist1) <- 0 for (i in 1:(no.singletons-1) ){ for (j in (i+1):no.singletons){ denomTOMij = min(nolinks.reduced[i], nolinks.reduced[j]) + 1 - adjMatrix[i,j] dist1[i,j] = 1- numTOM[i,j]/denomTOMij dist1[j,i] = dist1[i,j] } } dist1 } computeLinksInNeighbors <- function(x, imatrix) { y= x %*% imatrix %*% x y } computeClusterCoefficient = function(adjMatrix) { no.genes <- dim(adjMatrix)[1] nolinksNeighbors <- c(rep(-666,no.genes)) total.edge <- c(rep(-666,no.genes)) #for (i in 1:no.genes){ # nolinksNeighbors[i] <- adjMatrix[i,] %*% adjMatrix %*% adjMatrix[,i] # #total.edge[i] <- adjMatrix[i,] %*% Pmax %*% adjMatrix[,i] #} nolinksNeighbors <- apply(adjMatrix, 1, computeLinksInNeighbors, imatrix=adjMatrix) plainsum <- apply(adjMatrix, 1, sum) squaresum <- apply(adjMatrix^2, 1, sum) total.edge = plainsum^2 - squaresum cluster.coef = nolinksNeighbors/total.edge cluster.coef = ifelse(total.edge>0,cluster.coef, 0) cluster.coef } pajekColorcode = function(bincolorcode) { colorcodeC=c("turquoise","blue","brown","yellow","green","red","black","pink","magenta","purple","greenyellow","tan","salmon","cyan", "midnightblue", "lightcyan","grey60", "lightgreen", "lightyellow","grey") colorcodeN=c("28", "4", "14", "1", "21", "3", "13", "5", "17", "8", "24", "30", "22", "0", "18", "31", "33", "15", "16", "38" ) rcolors=bincolorcode clevels = length(colorcodeC) for (i in c(1:clevels) ){ whichcolor = rcolors==colorcodeC[i] rcolors = ifelse(whichcolor, colorcodeN[i], rcolors) } rcolors } panel.cor <- function(x, y, digits=2, prefix="", cex.cor) { usr <- par("usr"); on.exit(par(usr)) par(usr = c(0, 1, 0, 1)) r <- abs(cor(x, y)) txt <- format(c(r, 0.123456789), digits=digits)[1] txt <- paste(prefix, txt, sep="") if(missing(cex.cor)) cex <- 0.8/strwidth(txt) text(0.5, 0.5, txt, cex = cex * r) } cor1=function(x) { signif(cor(x,use="p"),2) } # this function computes the standard error stderror <- function(x){ sqrt( var(x)/length(x) ) } # Error bars for barplot # written by: Uli Flenker # Institute of Biochemistry # German Sports University # Cologne Carl-Diem-Weg 6 # 50933 Cologne / Germany # Phone 0049/0221/4982-493 # usage: err.bp(as.vector(means), as.vector(stderrs), two.side=F) err.bp<-function(daten,error,two.side=F){ if(!is.numeric(daten)) { stop("All arguments must be numeric") } if(is.vector(daten)){ xval<-(cumsum(c(0.7,rep(1.2,length(daten)-1)))) }else{ if (is.matrix(daten)){ xval<-cumsum(array(c(1,rep(0,dim(daten)[1]-1)), dim=c(1,length(daten))))+0:(length(daten)-1)+.5 }else{ stop("First argument must either be a vector or a matrix") } } MW<-0.25*(max(xval)/length(xval)) ERR1<-daten+error ERR2<-daten-error for(i in 1:length(daten)){ segments(xval[i],daten[i],xval[i],ERR1[i]) segments(xval[i]-MW,ERR1[i],xval[i]+MW,ERR1[i]) if(two.side){ segments(xval[i],daten[i],xval[i],ERR2[i]) segments(xval[i]-MW,ERR2[i],xval[i]+MW,ERR2[i]) } } } #works for binary outcomes krusktest=function( x ) { len1=dim(x)[[2]]-1 out1=rep(666, len1); for (i in c(1:len1) ) {out1[i]= signif( kruskal.test(x[,i+1], x[,1] )$p.value ,2) } data.frame( variable=names(x)[-1] , kruskP=out1) } #access the pvalues of correlation between contiunous vectors CORtest=function( x ) { len1=dim(x)[[2]]-1 out1=rep(666, len1); for (i in c(1:len1) ) {out1[i]= signif( cor.test(x[,i+1], x[,1] ,method="p",use="p")$p.value ,2) } data.frame( variable=names(x)[-1] , cor.Pvalue=out1) } #get the filename without extension getFileName=function(fullfname){ splitted=strsplit(fullfname,'') i= length( splitted[[1]] ) while ( splitted[[1]][i] != '.' ){ i=i-1 } mfname ='' for (j in c(1:(i-1))) mfname = paste(mfname,splitted[[1]][j],sep='') } replaceChars=function(fullfname, oldchar, newchar){ splitted=strsplit(fullfname,'') i= length( splitted[[1]] ) for (j in c(1:i) ){ if (splitted[[1]][j] == oldchar){ splitted[[1]][j] = newchar } } mfname ='' for (j in c(1:i) ) mfname = paste(mfname,splitted[[1]][j],sep='') mfname } appendStringToFile=function(fname, mstring){ fp <- file(fname, "a") cat(mstring, "\n", file=fp) close(fp) } appendListToFile=function(fname, mlist, listTitle=""){ fp <- file(fname, "a") if (length(listTitle)>0){ cat(as.character(listTitle), "\n", file=fp) } no.fields=length(mlist) #write column title for (z in 1:no.fields ){ cat(as.character(names(mlist[z])),"\t", file=fp) } cat("\n", file=fp) for (z in 1:no.fields ){ itab=mlist[z] cat(as.character(itab[[1]]),"\t", file=fp) } cat("\n", file=fp) close(fp) } appendTableToFile=function(fname, mtable, tableTitle=""){ fp <- file(fname, "a") if (length(tableTitle)>0){ cat(as.character(tableTitle), "\n", file=fp) } if ( (!is.na( dim(mtable)[1])) & is.na( dim(mtable)[2]) ) {#only one row in the table #write column title coltitles=names(mtable) for (z in 1:length(coltitles) ){ cat(as.character(coltitles[z]),"\t", file=fp) } cat("\n", file=fp) for (i in 1:(dim(mtable)[1]) ){ cat(as.character(mtable[i]), "\t", file=fp) } cat("\n", file=fp) }else{ # normal table cat(" \t", file=fp) #write column title coltitles=colnames(mtable) for (z in 1:length(coltitles) ){ cat(as.character(coltitles[z]),"\t", file=fp) } cat("\n", file=fp) rowsname = rownames(mtable) for (i in 1:(dim(mtable)[1]) ){ cat(as.character(rowsname[i]), "\t", file=fp) for(j in 1:(dim(mtable)[2])){ cat(as.character(mtable[i, j]), "\t", file=fp) } cat("\n", file=fp) } } cat("\n", file=fp) close(fp) } appendMultiTablesToFile=function(fname, multitables){ #table of tables if(is.na( dim(multitables)[2]) ) { titles=names(multitables) for (i in 1:(dim(multitables)[1]) ){ appendTableToFile(fname, multitables[[i]], as.character(titles[i])) } }else{#single table appendTableToFile(fname, multitables) } } openImgDev=function(imgname, imgtype="png", iwidth = 480, iheight = 480, ipointsize = 12, ibg = "white") { if (imgtype=="ps"){ postscript(imgname,width=iwidth, height=iheight, pointsize=ipointsize, bg=ibg) } #if(imgtype=="png"){ else{ png(imgname, width=iwidth, height=iheight, pointsize=ipointsize, bg=ibg) } trellis.device(new = FALSE, col = TRUE, bg = "white") } ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## ## Function: compute sigmoid functions, single/vector inputs ## ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #compute sigmoid function sigmoid <- function(x,alpha,tau) { y=1.0+exp( -alpha*(x-tau) ) y=1.0/y y } #compute sigmoid values for a matrix of inputs sigmoidMatrix <- function(xmatrix,alpha,tau0) { ey = exp( -alpha*(xmatrix - tau0) ) #ey = 3^( -alpha*(xmatrix - tau0) ) y= 1.0/(1.0+ey) rm(ey) y } ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## ## Function: correlation cutoff Computation ## ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #input is a matrix with Pearson correlations #iteratively call gc() untill there is no change in memory hit #i.e., all freed memory has been released to the system #Usage: 1. immediately call this function after you call a function or # 2. rm() rm(collect_garbage); collect_garbage=function(){ while (gc()[2,4] != gc()[2,4]){} } dichotcut = function(corrlMatrix,noOFsamples, mlogFname, samplingSize=3000, RsquaredCut=0.8) { orgSize = dim(corrlMatrix)[1] #cat('samplingSize', samplingSize) mincutval=0.4 maxcutval=0.9 # perform sampling subsetsize=min(samplingSize, orgSize) subset1=sample(c(1:orgSize),subsetsize,replace=F) cor1 <- corrlMatrix[subset1, subset1] cutvector=seq(mincutval, maxcutval, by=0.05) no.genes <- dim(cor1)[[2]] colname1=c("Cut","p-value", "Adj R^2","Truncated Adj R^2", "slope","mean(k)","median(k)","max(k)") datout=data.frame(matrix(666,nrow=length(cutvector),ncol=length(colname1) )) names(datout)=colname1 for (i in c(1:length(cutvector) ) ){ cut1=cutvector[i] dichotCorhelp <- I(abs(cor1)>cut1 )+0.0 diag(dichotCorhelp)<- 0 nolinkshelp <- apply(dichotCorhelp,2,sum, na.rm=TRUE) # let's check whether there is a scale free topology no.breaks=15 cut2=cut(nolinkshelp,no.breaks) freq1=tapply(nolinkshelp,cut2,length)/length(nolinkshelp) binned.k=tapply(nolinkshelp, cut2,mean) #remove NAs noNAs = !(is.na(freq1) | is.na(binned.k)) freq.noNA= freq1[noNAs] k.noNA = binned.k[noNAs] #remove Zeros noZeros = !(freq.noNA==0 | k.noNA==0) freq.log = as.numeric(log10(freq.noNA[noZeros])) k.log = as.numeric(log10(k.noNA[noZeros])) lm1=lm( freq.log ~ k.log, na.action=na.omit) lm2=lm( freq.log ~ k.log +I(10^freq.log) ); pvalue=2*(1-pt(sqrt(noOFsamples-1)*cut1/sqrt(1-cut1^2),noOFsamples-1)) datout[i,]=signif(c( cut1, pvalue, summary(lm1)$adj.r.squared, summary(lm2)$adj.r.squared, lm1$coef[[2]],mean(nolinkshelp), median(nolinkshelp), max(nolinkshelp) ), 3) } #output data #datout[length(cutvector)+1, ] = c( "Selected Cutoff = ", cutvector[indcut][[1]],"","","","","") print(datout); write.table(datout, mlogFname, sep="\t",quote=FALSE, col.names=TRUE, row.names=FALSE) corrlcutoff=NA msqrcut = RsquaredCut while(is.na(corrlcutoff)){ ind1 = datout[,3] > msqrcut indcut = NA indcut = ifelse(sum(ind1)>0,min(c(1:length(ind1))[ind1]),indcut) corrlcutoff = cutvector[indcut][[1]] msqrcut = msqrcut - 0.05 } corrlcutoff } ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## ## Function: correlation POWER Computation ## ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #input is a matrix with Pearson correlations powercut = function(corrlMatrix,noOFsamples, mlogFname, samplingSize=3000, RsquaredCut=0.8, mincut=1,bystep=0.5) { orgSize = dim(corrlMatrix)[1] #cat('samplingSize', samplingSize) mincutval=mincut maxcutval=10 # perform sampling subsetsize=min(samplingSize, orgSize) subset1=sample(c(1:orgSize),subsetsize,replace=F) cor1 <- corrlMatrix[subset1, subset1] cutvector=seq(mincutval, maxcutval, by=bystep) no.genes <- dim(cor1)[[2]] colname1=c("Cut","p-value", "Adj R^2","Truncated Adj R^2", "slope","mean(k)","median(k)","max(k)") datout=data.frame(matrix(666,nrow=length(cutvector),ncol=length(colname1) )) names(datout)=colname1 for (i in c(1:length(cutvector) ) ){ cut1=cutvector[i] dichotCorhelp <- abs(cor1)^cut1 diag(dichotCorhelp)<- 0 nolinkshelp <- apply(dichotCorhelp,2,sum, na.rm=TRUE) # let's check whether there is a scale free topology no.breaks=15 cut2=cut((nolinkshelp+1),no.breaks) freq1=tapply(nolinkshelp,cut2,length) binned.k=tapply(nolinkshelp+1,cut2,mean) #remove NAs noNAs = !(is.na(freq1) | is.na(binned.k)) freq.noNA= freq1[noNAs] k.noNA = binned.k[noNAs] #remove Zeros noZeros = !(freq.noNA==0 | k.noNA==0) freq.log = as.numeric(log10(freq.noNA[noZeros])) k.log = as.numeric(log10(k.noNA[noZeros])) lm1=lm( freq.log ~ k.log, na.action=na.omit) lm2=lm( freq.log ~ k.log +I(10^freq.log) ); #pvalue=2*(1-pt(sqrt(noOFsamples-1)*cut1/sqrt(1-cut1^2),noOFsamples-1)) pvalue=-1 datout[i,]=signif(c( cut1, pvalue, summary(lm1)$adj.r.squared, summary(lm2)$adj.r.squared, lm1$coef[[2]],mean(nolinkshelp), median(nolinkshelp), max(nolinkshelp) ), 3) } #in case that the largest R^2 is not bigger than RsquaredCut, #we have to reduce it gradually corrlcutoff=NA msqrcut = RsquaredCut while(is.na(corrlcutoff)){ ind1 = datout[,3] > msqrcut indcut = NA indcut = ifelse(sum(ind1)>0,min(c(1:length(ind1))[ind1]),indcut) corrlcutoff = cutvector[indcut][[1]] msqrcut = msqrcut - 0.05 } #output data #datout[length(cutvector)+1, ] = c( "Selected Cutoff = ", cutvector[indcut][[1]],"","","","","") print(datout); write.table(datout, mlogFname, sep="\t",quote=FALSE, col.names=TRUE, row.names=FALSE) corrlcutoff } ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## ## Function: correlation Sigmoid Computation, search for alpha ## ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #input is a matrix with Pearson correlations sigmoidcut = function(corrlMatrix,noOFsamples, mlogFname, samplingSize=3000, RsquaredCut=0.8, tau=0.5, out=0) { orgSize = dim(corrlMatrix)[1] #cat('samplingSize', samplingSize) mincutval=5 maxcutval=16 # perform sampling subsetsize=min(samplingSize, orgSize) subset1=sample(c(1:orgSize),subsetsize,replace=F) cor1 <- corrlMatrix[subset1, subset1] cutvector=seq(mincutval, maxcutval, by=0.2) no.genes <- dim(cor1)[[2]] colname1=c("Cut","p-value", "Adj R^2","Truncated Adj R^2", "slope","mean(k)","median(k)","max(k)") datout=data.frame(matrix(666,nrow=length(cutvector),ncol=length(colname1) )) names(datout)=colname1 for (i in c(1:length(cutvector) ) ){ cut1=cutvector[i] dichotCorhelp <- sigmoidMatrix( abs(cor1), cut1, tau) collect_garbage() diag(dichotCorhelp)<- 0 nolinkshelp <- apply(dichotCorhelp,2,sum, na.rm=TRUE) # let's check whether there is a scale free topology no.breaks=15 cut2=cut((nolinkshelp+1),no.breaks) freq1=tapply(nolinkshelp,cut2,length) binned.k=tapply(nolinkshelp+1,cut2,mean) #remove NAs noNAs = !(is.na(freq1) | is.na(binned.k)) freq.noNA= freq1[noNAs] k.noNA = binned.k[noNAs] #remove Zeros noZeros = !(freq.noNA==0 | k.noNA==0) freq.log = as.numeric(log10(freq.noNA[noZeros])) k.log = as.numeric(log10(k.noNA[noZeros])) lm1=lm( freq.log ~ k.log, na.action=na.omit) lm2=lm( freq.log ~ k.log +I(10^freq.log) ); #pvalue=2*(1-pt(sqrt(noOFsamples-1)*cut1/sqrt(1-cut1^2),noOFsamples-1)) pvalue=-1 datout[i,]=signif(c( cut1, pvalue, summary(lm1)$adj.r.squared, summary(lm2)$adj.r.squared, lm1$coef[[2]],mean(nolinkshelp), median(nolinkshelp), max(nolinkshelp) ), 3) rm(dichotCorhelp) collect_garbage() } #in case that the largest R^2 is not bigger than RsquaredCut, #we have to reduce it gradually corrlcutoff=NA msqrcut = RsquaredCut while(is.na(corrlcutoff)){ ind1 = datout[,4] > msqrcut indcut = NA indcut = ifelse(sum(ind1)>0,min(c(1:length(ind1))[ind1]),indcut) corrlcutoff = cutvector[indcut][[1]] msqrcut = msqrcut - 0.05 } #output data #datout[length(cutvector)+1, ] = c( "Selected Cutoff = ", cutvector[indcut][[1]],"","","","","") print(datout); if (out==0){ returnval = corrlcutoff write.table(datout, mlogFname, sep="\t",quote=FALSE, col.names=TRUE, row.names=FALSE) } else returnval = datout #return returnval } ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## ## Function: correlation Sigmoid Computation, search for alpha and tau ## ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sigmoidcutTauAlpha = function(mcorrlMatrix,mnoOFsamples, mmlogFname, msamplingSize=3000, mRsquaredCut=0.8) { mintau = 0.5 maxtau = 1.0 tauvector=seq(mintau, maxtau, by = 0.02) allData = c() for (i in tauvector ){ idata = sigmoidcut(corrlMatrix=mcorrlMatrix, noOFsamples=mnoOFsamples, mlogFname=mmlogFname, samplingSize=msamplingSize, RsquaredCut=mRsquaredCut, tau=i, out=1) allData = rbind(allData, idata) } write.table(allData, mmlogFname, sep="\t",quote=FALSE, col.names=TRUE, row.names=FALSE) corrlcutoff=NA msqrcut = mRsquaredCut while(is.na(corrlcutoff[1])){ ind1 = allData[,4] > msqrcut indcut = NA indcut = ifelse(sum(ind1)>0,min(c(1:length(ind1))[ind1]),indcut) corrlcutoff = allData[ indcut[[1]], c(1:2)] msqrcut = msqrcut - 0.05 } #a 2-d vector with [1]: tau and [2]: alpha corrlcutoff } ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## ## Function: automatically detect & label modules ## ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #The input is an hclust object. rm(moduleDetectLabel); moduleDetectLabel = function(hiercluster,heightcutoff=0.5,minsize1=20) { # here we define modules by using a height cut-off for the branches labelpred= cutree(hiercluster,h=heightcutoff) sort1=-sort(-table(labelpred)) sort1 modulename= as.numeric(names(sort1)) modulebranch= sort1 > minsize1 no.modules=sum(modulebranch) # now we assume that there are fewer than 10 modules colorcode=c("turquoise","blue","brown","yellow","green","red","black","pink","magenta","purple","greenyellow","tan","salmon","cyan", "midnightblue", "lightcyan","grey60", "lightgreen", "lightyellow") # “grey" means not in any module; colorhelp=rep("grey",length(labelpred)) if ( no.modules==0){ print("No mudule detected\n") } else{ if ( no.modules > length(colorcode) ){ print( paste("Too many modules \n", as.character(no.modules)) ) } labeledModules = min(no.modules, length(colorcode) ) for (i in c(1:labeledModules)) { colorhelp=ifelse(labelpred==modulename[i],colorcode[i],colorhelp) } colorhelp=factor(colorhelp,levels=c(colorcode[1:labeledModules],"grey")) } colorhelp } #"0" is for grey module assignModuleColor = function(labelpred, minsize1=50) { # here we define modules by using a height cut-off for the branches #labelpred= cutree(hiercluster,h=heightcutoff) #"0", grey module doesn't participate color assignment, directly assigned as "grey" labelpredNoZero = labelpred[ labelpred >0 ] sort1=-sort(-table(labelpredNoZero)) sort1 modulename= as.numeric(names(sort1)) modulebranch= sort1 > minsize1 no.modules=sum(modulebranch) # now we assume that there are fewer than 10 modules colorcode=c("turquoise","blue","brown","yellow","green","red","black","pink","magenta","purple","greenyellow","tan","salmon","cyan", "midnightblue", "lightcyan","grey60", "lightgreen", "lightyellow") #"grey" means not in any module; colorhelp=rep("grey",length(labelpred)) if ( no.modules==0){ print("No mudule detected\n") } else{ if ( no.modules > length(colorcode) ){ print( paste("Too many modules \n", as.character(no.modules)) ) #colorcode = c(colorcode, colorcode) } labeledModules = min(no.modules, length(colorcode) ) for (i in c(1:labeledModules)) { colorhelp=ifelse(labelpred==modulename[i],colorcode[i],colorhelp) } colorhelp=factor(colorhelp,levels=c(colorcode[1:labeledModules],"grey")) } colorhelp } #merge the minor cluster into the major cluster merge2Clusters = function(mycolorcode, mainclusterColor, minorclusterColor){ mycolorcode2 = ifelse(as.character(mycolorcode)==minorclusterColor, mainclusterColor, as.character(mycolorcode) ) fcolorcode =factor(mycolorcode2) fcolorcode } ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## ## Get Most Varying Genes ## ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## get nvars most varying genes, consider only genes with less than 20% missing values ## default use of standard deviation ## other option: coefficient of variation (CV), i.e., the standard deviation divided by mean rm(getMostVaryingGenes) getMostVaryingGenes=function(myinputfname, nvars, myHeadCol=8, missingRate=0.2, mysep="\t", vartype=0){ mfname =getFileName(myinputfname) mvgOutFname =paste(mfname, "_", nvars, "mvgenes.xls", sep='') allMatrix <- read.delim(myinputfname,sep=mysep, header=T) dims=dim(allMatrix) genesInfor <- allMatrix[,c(1:myHeadCol)] rowTitles=names(allMatrix) rowTitles=rowTitles[1:dims[2]] dat1 <-t(allMatrix[,-c(1:myHeadCol)]) ## --------- compute genes with missing values naMatrix <- is.na(dat1) nasumVector <- apply(naMatrix,2,sum, na.rm=TRUE) naprobVector <- nasumVector/dim(dat1)[[1]] dat1 <- dat1[,naprobVectorminNumberAnnotation no.annotation =sum( restannotation1) datout=data.frame(matrix(666,nrow=no.annotation,ncol=no.couleur) ) names(datout) =levelscouleur restlevelsannotation1= levelsannotation1[restannotation1] row.names(datout) = restlevelsannotation1 annotation1.chars = as.character(annotation1) for (i in c(1:no.annotation) ) { for (j in c(1:no.couleur) ){ bool.annoted= (annotation1.chars==restlevelsannotation1[i]) bool.colorj = (couleur==levelscouleur[j]) # some genes are not annoted, so we need remove those from our proportion computation bool.annoted.noNAs = bool.annoted[ !is.na(bool.annoted) ] bool.colorj.noNas = bool.colorj[ !is.na(bool.annoted) ] tab1=table(bool.annoted.noNAs, bool.colorj.noNas) pvalue = signif(fisher.test(tab1)$p.value,2) # rest module #bool.annoted.noNAs FALSE TRUE # nofunc FALSE 3257 292 # func TRUE 29 1 portion.rest = tab1[2,1]/(tab1[1,1]+tab1[2,1]) # portion of portion.colorj = tab1[2,2]/(tab1[1,2]+tab1[2,2])#=length(bool.colorj.noNas) #proportion ratioOFportions = portion.colorj/portion.rest if (outformat=="pvalue"){ datout[i,j]=pvalue }else if( outformat=="ratio" ){ datout[i,j]=signif(ratioOFportions,2) }else{ myout = paste(as.character(pvalue),"(", as.character(signif(portion.colorj,2)),",", as.character(signif(portion.rest,2)), ")", sep="") datout[i,j]= myout } }} datout } # end of function fisherPvector # this function computes the standard error rm(stderr1) stderr1 <- function(x){ sqrt( var(x,na.rm=T)/sum(!is.na(x)) ) } choosenew <- function(n,k){ n <- c(n) out1 <- rep(0,length(n)) for (i in c(1:length(n)) ){ if (n[i]1) { Cl[i] <- sample(mincluster, 1) } } # end for for over i in 1:no.test ## now we compute how often m samples are co-clustered tab0 <- table(Cl, clusttest[[kk]]$clustering) pshelp <- rep(10000000, kk) for (l in 1:kk){ ## marginals tab1=tab0 tab1[tab0=thres) {clusterNo <- nn} ##} ##clusterNo } # end of function pamPsClNo kpredictPS2=function(ps,span1=0.75,title1="prediction strength") { kk=c(1:length(ps)) kout=1 if (length(kk)>1) { lm1=loess(ps~ kk,span=span1,degree=2) rank1=rank(round(resid( lm1 ),digits=10)) kouthelp= kk[ rank1==max(rank1) ] kestimate=kouthelp[length(kouthelp)] plot(ps,main=title1,xlab="no. of clusters k",ylab="PSE",ylim=c(0,1)) lines(kk,predict(lm1)) abline(v=kestimate,col="red") abline(h=.8) points(kk,resid(lm1) ,pch="*",col="blue") kestimate } } ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## ## Function: Make a plot, run a couple tests and output everything to proper files ## ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ scatterCorrelTester=function(rawValA, rawValB, binaryvar=FALSE, PlotsTitle ="", myxlab="",myylab="",outputImage="tmp.png", outputFile=tests_Fname, numPoints=20 ){ if( (sd(rawValA, na.rm = T)==0) | (sd(rawValB, na.rm = T)==0) ){ titleinfor = paste(PlotsTitle, "error in scatterCorrelTester, sd()=0", paste="") appendListToFile(tests_Fname, titleinfor) return } #in general, we will have the thing being tested be rawValA, and K be rawValB cut_factor = cut(rawValB,numPoints) # Now give me the mean proportion of essentiality in each chunk of the essentiality vector mean_rawValA = tapply(rawValA, cut_factor, mean) # Now give me the mean proportion of k in each chunk of the essentiality vector mean_rawValB = tapply(rawValB, cut_factor, mean) #now we need a p-value for this: corrl = signif(cor(rawValB,rawValA),2) #if (binaryvar==FALSE){ corrl.p = signif( cor.test(rawValB,rawValA, method="p",use="p")$p.value ,2) #}else{ # kruskal = kruskal.test(rawValB,rawValA) # corrl.p = signif(kruskal$p.value, 2) #} titleinfor = paste(PlotsTitle, ": r=", as.character(corrl),",p=", as.character(corrl.p), paste="") #Now we need to just make a line fit line =lm(mean_rawValA ~ mean_rawValB) sum1 = summary(line) sum1 #Now put in a title for this part of the data: appendListToFile(tests_Fname, titleinfor) appendListToFile(tests_Fname, sum1) # do a plot() of these two vectors against each other. openImgDev(outputImage,imgtype="png") #openImgDev(outputImage,imgtype="ps") par(mfrow=c(1,1), mar=c(5, 5, 4, 2) + 0.1) plot(mean_rawValB,mean_rawValA, main=titleinfor, xlab=myxlab, ylab=myylab) # and then draw the line abline(line, col="red") par(mfrow=c(1,1), mar=c(5, 4, 4, 2) + 0.1) dev.off() } singlePlot=function(rawValA, rawValB, plotTitle="", myylab="",myxlab="", dataSet="", numPoints=20 ){ #in general, we will have the thing being tested be rawValA, and K be rawValB cut_factor = cut(rawValB,numPoints) # Now give me the mean proportion of essentiality in each chunk of the essentiality vector mean_rawValA = tapply(rawValA, cut_factor, mean) # Now give me the mean proportion of k in each chunk of the essentiality vector mean_rawValB = tapply(rawValB, cut_factor, mean) #Now we need to just make a line fit line =lm(mean_rawValA ~ mean_rawValB) corrl = signif(cor(rawValB,rawValA),2) corrl.p = signif( cor.test(rawValB,rawValA, method="p",use="p")$p.value ,2) titleinfor = paste(as.character(dataSet),",",as.character(plotTitle),",","r=", as.character(corrl),",p=", as.character(corrl.p),sep="") # do a plot() of these two vectors against each other. #openImgDev(outputImage) plot(mean_rawValB,mean_rawValA,main=titleinfor, xlab=myxlab, ylab=myylab) # and then draw the line abline(line, col="red") #dev.off() titleinfor } conservationVectorCleaner = function(rawVal, nasVal) { # I need a way to always filter out the NAs from the mean_e_val vector... no.nas = nasVal # variable log_mean_e_val must be defined for use in plotting conservation plots mean_e_valNoNA = rawVal[no.nas] # 2nd I need to replace the 0 values with extremely small numbers (smaller than you see so e-200) mean_e_valNoNANorZero = ifelse(mean_e_valNoNA==0, 10^(-200), mean_e_valNoNA) # 3rd I need to transform the e value since its range is just HUGE log(mean_e_valNoNANorZero) #R just returns the last "thing" listed in a function }