# This document contains functions that we find useful for gene co-expression network analysis # We recommend to read the tutorial first. # Steve Horvath, Bin Zhang, Jun Dong, Andy Yip # To cite this code or the statistical methods please use # Zhang B, Horvath S (2005) A General Framework for Weighted Gene Co-Expression Network Analysis. # Statistical Applications in Genetics and Molecular Biology. In Press. # Technical Report and software code at: www.genetics.ucla.edu/labs/horvath/CoexpressionNetwork. # CONTENTS # This document contains function for carrying out the following tasks # A) Assessing scale free topology and choosing the parameters of the adjacency function # using the scale free topology criterion (Zhang and Horvath 05) # B) Computing the topological overlap matrix # C) Defining gene modules using clustering procedures # D) Summing up modules by their first principal component (first eigengene) # E) Relating a measure of gene significance to the modules # F) Carrying out a within module analysis (computing intramodular connectivity) # and relating intramodular connectivity to gene significance. # G) Miscellaneous other functions, e.g. for computing the cluster coefficient. # H) Network functions from Bin Zhang for dynamic tree cutting of a hierarchical clustering tree (dendrogram) # I) General statistical functions, e.g. for scatterplots # The functions below are organized according into categories A-G. # One of these days, somebody should turn this into an R library... ##################################################################################################### ################################################################################################################################ # A) Assessing scale free topology and choosing the parameters of the adjacency function # using the scale free topology criterion (Zhang and Horvath 05) ################################################################################################################# # =================================================== #For hard thresholding, we use the signum (step) function if(exists("signum1") ) rm(signum1); signum1=function(corhelp,tau1) { adjmat1= as.matrix(abs(corhelp)>=tau1) dimnames(adjmat1) <- dimnames(corhelp) diag(adjmat1) <- 0 adjmat1} # =================================================== # For soft thresholding, one can use the sigmoid function # But we have focused on the power adjacency function in the tutorial... if (exists("sigmoid1") ) rm(sigmoid1); sigmoid1=function(ss,mu1=0.8,alpha1=20) { 1/(1+exp(-alpha1*(ss-mu1)))} #This function is useful for speeding up the connectivity calculation. #The idea is to partition the adjacency matrix into consecutive baches of a #given size. #In principle, the larger the batch size the faster is the calculation. But #smaller batchsizes require #less memory... # Input: gene expression data set where *rows* correspond to microarray samples #and columns correspond to genes. # If fewer than MinimumNoSamples contain gene expression information for a given # gene, then its connectivity is set to missing. if(exists("SoftConnectivity")) rm(SoftConnectivity); SoftConnectivity=function(datE, power=6,batchsize=1500,MinimumNoSamples=10) { no.genes=dim(datE)[[2]] no.samples=dim(datE)[[1]] sum1=function(x) sum(x,na.rm=T) k=rep(NA,no.genes) no.batches=as.integer(no.genes/ batchsize) if (no.batches>0) { for (i in 1:no.batches) { print(paste("batch number = ", i)) index1=c(1:batchsize)+(i-1)* batchsize ad1=abs(cor(datE[,index1], datE,use="p"))^power ad1[is.na(ad1)]=0 k[index1]=apply(ad1,1,sum1) # If fewer than MinimumNoSamples contain gene expression information for a given # gene, then we set its connectivity to 0. NoSamplesAvailable=apply(!is.na(datE[,index1]),2,sum) k[index1][NoSamplesAvailable< MinimumNoSamples]=NA } # end of for (i in 1:no.batches } # end of if (no.batches>0)... if (no.genes-no.batches*batchsize>0 ) { restindex=c((no.batches*batchsize+1):no.genes) ad1=abs(cor(datE[,restindex], datE,use="p"))^power ad1[is.na(ad1)]=0 k[restindex]=apply(ad1,1,sum1) NoSamplesAvailable=apply(!is.na(datE[,restindex]),2,sum) k[restindex][NoSamplesAvailable< MinimumNoSamples]=NA } # end of if k } # end of function # =================================================== # The function PickHardThreshold can help one to estimate the cut-off value # when using the signum (step) function. # The first column lists the threshold ("cut"), # the second column lists the corresponding p-value based on the Fisher Transform # of the correlation. # The third column reports the resulting scale free topology fitting index R^2. # The fourth column reports the slope of the fitting line, it shoud be negative for # biologically meaningul networks. # The fifth column reports the fitting index for the truncated exponential model. # Usually we ignore it. # The remaining columns list the mean, median and maximum resulting connectivity. # To pick a hard threshold (cut) with the scale free topology criterion: # aim for high scale free R^2 (column 3), high connectivity (col 6) and negative slope # (around -1, col 4). # The output is a list with 2 components. The first component lists a sugggested cut-off # while the second component contains the whole table. # The removeFirst option removes the first point (k=0, P(k=0)) from the regression fit. # no.breaks specifies how many intervals used to estimate the frequency p(k) i.e. the no. of points in the # scale free topology plot. if (exists("PickHardThreshold")) rm(PickHardThreshold); PickHardThreshold=function(datExpr1,RsquaredCut=0.85, cutvector=seq(0.1,0.9,by=0.05) ,removeFirst=FALSE,no.breaks=10) { no.genes <- dim(datExpr1)[[2]] no.genes <- dim(datExpr1)[[2]] no.samples= dim(datExpr1)[[1]] colname1=c("Cut","p-value", "scale law R^2", "slope=" ,"truncated R^2","mean(k)","median(k)","max(k)") datout=data.frame(matrix(666,nrow=length(cutvector),ncol=length(colname1) )) names(datout)=colname1 datout[,1]=cutvector for (i in c(1:length(cutvector) ) ){ cut1=cutvector[i] datout[i,2]=2*(1-pt(sqrt(no.samples-1)*cut1/sqrt(1-cut1^2),no.samples-1))} if(exists("fun1")) rm(fun1) fun1=function(x) { corx=abs(cor(x,datExpr1,use="p")) out1=rep(NA, length(cutvector) ) for (j in c(1:length(cutvector))) {out1[j]=sum(corx>cutvector[j])} out1 } # end of fun1 datk=t(apply(datExpr1,2,fun1)) for (i in c(1:length(cutvector) ) ){ nolinkshelp <- datk[,i]-1 cut2=cut(nolinkshelp,no.breaks) binned.k=tapply(nolinkshelp,cut2,mean) freq1=as.vector(tapply(nolinkshelp,cut2,length)/length(nolinkshelp)) # The following code corrects for missing values etc breaks1=seq(from=min(nolinkshelp),to=max(nolinkshelp),length=no.breaks+1) hist1=hist(nolinkshelp,breaks=breaks1,equidist=F,plot=FALSE,right=TRUE) binned.k2=hist1$mids binned.k=ifelse(is.na(binned.k),binned.k2,binned.k) binned.k=ifelse(binned.k==0,binned.k2,binned.k) freq1=ifelse(is.na(freq1),0,freq1) xx= as.vector(log10(binned.k)) if(removeFirst) {freq1=freq1[-1]; xx=xx[-1]} plot(xx,log10(freq1+.000000001),xlab="log10(k)",ylab="log10(p(k))" ) lm1= lm(as.numeric(log10(freq1+.000000001))~ xx ) lm2=lm(as.numeric(log10(freq1+.000000001))~ xx+I(10^xx) ) datout[i,3]=summary(lm1)$adj.r.squared datout[i,4]=summary(lm1)$coefficients[2,1] datout[i,5]=summary(lm2)$adj.r.squared datout[i,6]=mean(nolinkshelp) datout[i,7]= median(nolinkshelp) datout[i,8]= max(nolinkshelp) } datout=signif(datout,3) print(data.frame(datout)); # the cut-off is chosen as smallest cut with R^2>RsquaredCut ind1=datout[,3]>RsquaredCut indcut=NA indcut=ifelse(sum(ind1)>0,min(c(1:length(ind1))[ind1]),indcut) # this estimates the cut-off value that should be used. # Don't trust it. You need to consider slope and mean connectivity as well! cut.estimate=cutvector[indcut][[1]] list(cut.estimate, data.frame(datout)); } # end of function # =========================================================== # The function PickSoftThreshold allows one to estimate the power parameter when using # a soft thresholding approach with the use of the power function AF(s)=s^Power # The function PickSoftThreshold allows one to estimate the power parameter when using # a soft thresholding approach with the use of the power function AF(s)=s^Power # The removeFirst option removes the first point (k=1, P(k=1)) from the regression fit. if (exists("PickSoftThreshold")) rm(PickSoftThreshold); PickSoftThreshold=function(datExpr1,RsquaredCut=0.85, powervector=c(seq(1,10,by=1),seq(12,20,by=2)), removeFirst=FALSE,no.breaks=10) { no.genes <- dim(datExpr1)[[2]] no.genes <- dim(datExpr1)[[2]] no.samples= dim(datExpr1)[[1]] colname1=c("Power", "scale law R^2" ,"slope", "truncated R^2","mean(k)","median(k)","max(k)") datout=data.frame(matrix(666,nrow=length(powervector),ncol=length(colname1) )) names(datout)=colname1 datout[,1]=powervector if(exists("fun1")) rm(fun1) fun1=function(x) { corx=abs(cor(x,datExpr1,use="p")) out1=rep(NA, length(powervector) ) for (j in c(1:length(powervector))) {out1[j]=sum(corx^powervector[j])} out1 } # end of fun1 datk=t(apply(datExpr1,2,fun1)) for (i in c(1:length(powervector) ) ){ nolinkshelp <- datk[,i]-1 cut2=cut(nolinkshelp,no.breaks) binned.k=tapply(nolinkshelp,cut2,mean) freq1=as.vector(tapply(nolinkshelp,cut2,length)/length(nolinkshelp)) # The following code corrects for missing values etc breaks1=seq(from=min(nolinkshelp),to=max(nolinkshelp),length=no.breaks+1) hist1=hist(nolinkshelp,breaks=breaks1,equidist=F,plot=FALSE,right=TRUE) binned.k2=hist1$mids binned.k=ifelse(is.na(binned.k),binned.k2,binned.k) binned.k=ifelse(binned.k==0,binned.k2,binned.k) freq1=ifelse(is.na(freq1),0,freq1) xx= as.vector(log10(binned.k)) if(removeFirst) {freq1=freq1[-1]; xx=xx[-1]} plot(xx,log10(freq1+.000000001),xlab="log10(k)",ylab="log10(p(k))" ) lm1= lm(as.numeric(log10(freq1+.000000001))~ xx ) lm2=lm(as.numeric(log10(freq1+.000000001))~ xx+I(10^xx) ) datout[i,2]=summary(lm1)$adj.r.squared datout[i,3]=summary(lm1)$coefficients[2,1] datout[i,4]=summary(lm2)$adj.r.squared datout[i,5]=mean(nolinkshelp) datout[i,6]= median(nolinkshelp) datout[i,7]= max(nolinkshelp) } datout=signif(datout,3) print(data.frame(datout)); # the cut-off is chosen as smallest cut with R^2>RsquaredCut ind1=datout[,2]>RsquaredCut indcut=NA indcut=ifelse(sum(ind1)>0,min(c(1:length(ind1))[ind1]),indcut) # this estimates the power value that should be used. # Don't trust it. You need to consider slope and mean connectivity as well! power.estimate=powervector[indcut][[1]] list(power.estimate, data.frame(datout)); } # =================================================== # The function ScaleFreePlot1 creates a plot for checking scale free topology # when truncated1=T is specificed, it provides the R^2 measures for the following # degree distributions: a) scale free topology, b) log-log R^2 and c) truncated exponential R^2 # The function ScaleFreePlot1 creates a plot for checking scale free topology if(exists("ScaleFreePlot1")) rm(ScaleFreePlot1) ; ScaleFreePlot1=function(kk,no.breaks=10,AF1="" ,truncated1=FALSE, removeFirst=FALSE,cex.lab1=1){ cut1=cut(kk,no.breaks) binned.k=tapply(kk,cut1,mean) freq1=tapply(kk,cut1,length)/length(kk) # The following code corrects for missing values etc breaks1=seq(from=min(kk),to=max(kk),length=no.breaks+1) hist1=hist(kk,breaks=breaks1,equidist=F,plot=FALSE,right=TRUE) binned.k2=hist1$mids binned.k=ifelse(is.na(binned.k),binned.k2,binned.k) binned.k=ifelse(binned.k==0,binned.k2,binned.k) freq1=ifelse(is.na(freq1),0,freq1) plot(log10(binned.k),log10(freq1+.000000001),xlab="log10(k)",ylab="log10(p(k))",cex.lab=cex.lab1 ) xx= as.vector(log10(binned.k)) if(removeFirst) {freq1=freq1[-1]; xx=xx[-1]} lm1=lm(as.numeric(log10(freq1+.000000001))~ xx ) lines(xx,predict(lm1),col=1) OUTPUT=data.frame(ScaleFreeRsquared=round(summary(lm1)$adj.r.squared,2),Slope=round(lm1$coefficients[[2]],2)) if (truncated1==TRUE) { lm2=lm(as.numeric(log10(freq1+.000000001))~ xx+I(10^xx) ); OUTPUT=data.frame(ScaleFreeRsquared=round(summary(lm1)$adj.r.squared,2),Slope=round(lm1$coefficients[[2]],2), TruncatedRsquared=round(summary(lm2)$adj.r.squared,2)) print("the red line corresponds to the truncated exponential fit") lines(xx,predict(lm2),col=2); title(paste(AF1, ", scale free R^2=",as.character(round(summary(lm1)$adj.r.squared,2)), ", slope=", round(lm1$coefficients[[2]],2), ", trunc.R^2=",as.character(round(summary(lm2)$adj.r.squared,2))))} else { title(paste(AF1, ", scale R^2=",as.character(round(summary(lm1)$adj.r.squared,2)) , ", slope=", round(lm1$coefficients[[2]],2))) } OUTPUT } # end of function ################################################################################################################# ################################################################################################################################ # B) Computing the topological overlap matrix ################################################################################################################# ################################################################################################################# # =================================================== #The function TOMdist1 computes a dissimilarity # based on the topological overlap matrix (Ravasz et al) # Input: an Adjacency matrix with entries in [0,1] if(exists("TOMdist1")) rm(TOMdist1); TOMdist1=function(adjmat1, maxADJ=FALSE) { diag(adjmat1)=0; adjmat1[is.na(adjmat1)]=0; maxh1=max(as.dist(adjmat1) ); minh1=min(as.dist(adjmat1) ); if (maxh1>1 | minh1 < 0 ) {print(paste("ERROR: the adjacency matrix contains entries that are larger than 1 or smaller than 0!!!, max=",maxh1,", min=",minh1)) } else { if ( max(c(as.dist(abs(adjmat1-t(adjmat1)))))>0 ) {print("ERROR: non-symmetric adjacency matrix!!!") } else { kk=apply(adjmat1,2,sum) maxADJconst=1 if (maxADJ==TRUE) maxADJconst=max(c(as.dist(adjmat1 ))) Dhelp1=matrix(kk,ncol=length(kk),nrow=length(kk)) denomTOM= pmin(as.dist(Dhelp1),as.dist(t(Dhelp1))) +as.dist(maxADJconst-adjmat1); gc();gc(); numTOM=as.dist(adjmat1 %*% adjmat1 +adjmat1); #TOMmatrix=numTOM/denomTOM # this turns the TOM matrix into a dissimilarity out1=1-as.matrix(numTOM/denomTOM) diag(out1)=1 out1 }} } # =================================================== # This function computes a TOMk dissimilarity # which generalizes the topological overlap matrix (Ravasz et al) # Input: an Adjacency matrix with entries in [0,1] # WARNING: ONLY FOR UNWEIGHTED NETWORKS, i.e. the adjacency matrix contains binary entries... # This function is explained in Yip and Horvath (2005) # http://www.genetics.ucla.edu/labs/horvath/GTOM/ if(exists("TOMkdist1")) rm(TOMkdist1); TOMkdist1 = function(adjmat1,k=1){ maxh1=max(as.dist(adjmat1) ); minh1=min(as.dist(adjmat1) ); if (k!=round(abs(k))) { stop("k must be a positive integer!!!", call.=TRUE);} if (maxh1>1 | minh1 < 0 ){ print(paste("ERROR: entries of the adjacency matrix must be between inclusively 0 and 1!!!, max=",maxh1,", min=",minh1))} else { if ( max(c(as.dist(abs(adjmat1-t(adjmat1)))))>0 ) {print("ERROR: non-symmetric adjacency matrix!!!") } else { B <- adjmat1; if (k>=2) { for (i in 2:k) { diag(B) <- diag(B) + 1; B = B %*% adjmat1;}} # this gives the number of paths with length at most k connecting a pair B <- (B>0); # this gives the k-step reachability from a node to another diag(B) <- 0; # exclude each node being its own neighbor B <- B %*% B # this gives the number of common k-step-neighbor that a pair of nodes share Nk <- diag(B); B <- B +adjmat1; # numerator diag(B) <- 1; denomTOM=outer(Nk,Nk,FUN="pmin")+1-adjmat1; diag(denomTOM) <- 1; 1 - B/denomTOM # this turns the TOM matrix into a dissimilarity }} } # IGNORE THIS function... # The function TOMdistROW computes the TOM distance of a gene (node) # with that of all other genes in the network. # WhichRow is an integer that specifies which row of the adjacency matrix # corresponds to the gene of interest. # Output=vector of TOM distances. if (exists("TOMdistROW") ) rm(TOMdistROW) TOMdistROW=function(WhichRow=1, adjmat1, maxADJ=FALSE) { diag(adjmat1)=0; maxh1=max(as.dist(adjmat1) ); minh1=min(as.dist(adjmat1) ); if (maxh1>1 | minh1 < 0 ) {print(paste("ERROR: the adjacency matrix contains entries that are larger than 1 or smaller than 0!!!, max=",maxh1,", min=",minh1)) } else { kk=apply(adjmat1,2,sum) numTOM=adjmat1[WhichRow,] %*% adjmat1 +adjmat1[WhichRow,]; numTOM[WhichRow]=1 maxADJconst=1 if (maxADJ==TRUE) maxADJconst=max(c(as.dist(adjmat1 ))) denomTOM=pmin(kk[WhichRow],kk)+maxADJconst-adjmat1[WhichRow,]; denomTOM[WhichRow]=1 #TOMmatrix=numTOM/denomTOM # this turns the TOM matrix into a dissimilarity 1-numTOM/denomTOM } } ##################################################################################################### ################################################################################################################################ # C) Defining gene modules using clustering procedures ##################################################################################################### ################################################################################################################################ # =================================================== #The function modulecolor2 function assigns colors to the observations # in the branches of a dendrogram # we use it to define modules.... if (exists("modulecolor2")) rm(modulecolor2); modulecolor2=function(hier1, h1=0.9,minsize1=50) { # here we define modules by using a height cut-off for the branches labelpred= cutree(hier1,h=h1) sort1=-sort(-table(labelpred)) modulename= as.numeric(names(sort1)) modulebranch= sort1>minsize1 no.modules=sum(modulebranch) # now we assume that there are fewer than a certain number of colors colorcode=c("turquoise","blue","brown","yellow","green","red","black","purple","orange","pink", "greenyellow","lightcyan","salmon","midnightblue","lightyellow") # "grey" means not in any module; colorhelp=rep("grey",length(labelpred)) if ( no.modules==0 | no.modules >length(colorcode)){ print(paste("The number of modules is problematic! \! Number of modules = ", as.character(no.modules)))} else { for (i in c(1:no.modules)) {colorhelp=ifelse(labelpred==modulename[i],colorcode[i],colorhelp)}; colorhelp=factor(colorhelp,levels=c(colorcode[1:no.modules],"grey")) } factor(colorhelp, levels=unique(colorhelp[hier1$order] )) } # The function hclustplot1 creates a barplot where the colors of the bars are sorted according to # a hierarchical clustering tree (hclust object) if (exists("hclustplot1")) rm(hclustplot1); hclustplot1=function(hier1,couleur,title1="Colors sorted by hierarchical clustering") { if (length(hier1$order) != length(couleur) ) {print("ERROR: length of color vector not compatible with no. of objects in the hierarchical tree")}; if (length(hier1$order) == length(couleur) ) { barplot(height=rep(1, length(couleur)), col= as.character(couleur[hier1$order]),border=F, main=title1,space=0, axes=F)} } # =================================================== # The function TOMplot1 creates a TOM plot # Inputs: distance measure, hierarchical (hclust) object, color label=couleur if (exists("TOMplot1")) rm(TOMplot1); TOMplot1=function(disttom,hier1, couleur,terrainColors=FALSE) { no.nodes=length(couleur) if (no.nodes != dim(disttom)[[1]] ) {print("ERROR: number of color labels does not equal number of nodes in disttom")} else { labeltree=as.character(couleur) labelrow = labeltree labelrow[hier1$order[length(labeltree):1]]=labelrow[hier1$order] options(expressions = 10000) if (terrainColors) heatmap(as.matrix(disttom),Rowv=as.dendrogram(hier1),Colv= as.dendrogram(hier1), scale="none",revC=T, ColSideColors=as.character(labeltree),RowSideColors=as.character(labelrow), labRow=F, labCol=F, col = terrain.colors(1000)) else heatmap(as.matrix(disttom),Rowv=as.dendrogram(hier1),Colv= as.dendrogram(hier1), scale="none",revC=T, ColSideColors=as.character(labeltree),RowSideColors=as.character(labelrow), labRow=F, labCol=F) } } #end of function # =================================================== # The function TOMplot2 creates a TOM plot where the top and left color bars can be different # Inputs: distance measure, hierarchical (hclust) object, color label=couleurTop, couleurLeft if (exists("TOMplot2")) rm(TOMplot2); TOMplot2=function(disttom,hier1, couleurTop, couleurLeft) { no.nodes=length(couleurTop) if (no.nodes != length(couleurLeft)) {stop("ERROR: number of top color labels does not equal number of left color labels")} if (no.nodes != dim(disttom)[[1]] ) {stop("ERROR: number of color labels does not equal number of nodes in disttom")} else { labeltree = as.character(couleurTop) labelrow = as.character(couleurLeft) labelrow[hier1$order[length(labeltree):1]]=labelrow[hier1$order] options(expressions = 10000) heatmap(as.matrix(disttom),Rowv=as.dendrogram(hier1),Colv= as.dendrogram(hier1), scale="none", revC=T, ColSideColors=as.character(labeltree),RowSideColors=as.character(labelrow), labRow=F, labCol=F) } } #end of function # IGNORE THIS FUNCTION... # The function "BestHeightCut" allows one to find the best height cut-off # for a hierarchical clustering tree when external gene information is available # It computes a Kruskal Wallis-test p-value for each height cut-off # based on determining whether gene significance differs across branch membership. if(exists("BestHeightCut")) rm(BestHeightCut); BestHeightCut=function(hier1, GeneSignif, hcut=seq(0.1,.95,by=0.01) ) { pvalues=rep(NA, length(hcut)) for (i in c(1:length(hcut))) { colorhelp=modulecolor2(hier1,hcut[i]) if (length(unique(colorhelp))>1 ) {pvalues[i]=kruskal.test(GeneSignif, colorhelp)$p.value} data.frame(hcut,pvalues) }} ##################################################################################################### ################################################################################################################################ # D) Summing up modules using their first principal components (first eigengene) ##################################################################################################### ################################################################################################################################ # =================================================== #The function ModulePrinComps1 finds the first principal component (eigengene) in each # module defined by the colors of the input vector "couleur" (Pardon my French). # It also reports the variances explained by the first 5 principal components. # And it yields a measure of module conformity for each gene, # which is highly correlated to the within module connectivity. # The theoretical underpinnings are described in Horvath, Dong, Yip (2005) # http://www.genetics.ucla.edu/labs/horvath/ModuleConformity/ # This requires the R library impute # The output is a list with 3 components: # 1) a data frame of module eigengenes (PCs), # 2) a data frame that lists the percent variance explained by the first 5 PCs of a module # 3) a data frame that lists the module conformity for each gene. # The be used as alternative connectivity measure.... if(exists("ModulePrinComps1")) rm(ModulePrinComps1); ModulePrinComps1=function(datexpr,couleur) { modlevels=levels(factor(couleur)) PrinComps=data.frame(matrix(666,nrow=dim(datexpr)[[1]],ncol= length(modlevels))) varexplained= data.frame(matrix(666,nrow= 5,ncol= length(modlevels))) names(PrinComps)=paste("PC",modlevels,sep="") for(i in c(1:length(modlevels)) ){ print(i) modulename = modlevels[i] restrict1= as.character(couleur)== modulename # in the following, rows are genes and columns are samples datModule=t(datexpr[, restrict1]) datModule=impute.knn(as.matrix(datModule)) datModule=t(scale(t(datModule))) svd1=svd(datModule) mtitle=paste("PCs of ", modulename," module", sep="") varexplained[,i]= (svd1$d[1:5])^2/sum(svd1$d^2) # this is the first principal component pc1=svd1$v[,1] signh1=sign(sum(cor(pc1, t(datModule)))) if (signh1 != 0) pc1=signh1* pc1 PrinComps[,i]= pc1 } ModuleConformity= rep(666,length=dim(datexpr)[[2]]) for(i in 1:(dim(datexpr)[[2]])) ModuleConformity[i]=abs(cor(datexpr[,i], PrinComps[,match(couleur[i], modlevels)], use="pairwise.complete.obs")) list(PrinComps=PrinComps, varexplained=varexplained, ModuleConformity=ModuleConformity) } ##################################################################################################### ################################################################################################################################ # E) Relating a measure of gene significance to the modules ##################################################################################################### ################################################################################################################################ # =================================================== # The function ModuleEnrichment1 creates a bar plot that shows whether modules are enriched with # significant genes. # More specifically, it reports the mean gene significance for each module. # The gene significance can be a binary variable or a quantitative variable. # It also plots the 95% confidence interval of the mean (CI=mean +/- 1.96* standard error). # It also reports a Kruskal Wallis P-value. if( exists("ModuleEnrichment1") ) rm(ModuleEnrichment1); ModuleEnrichment1=function(genesignif1,couleur,title1="gene significance across modules",labely="Gene Significance",boxplot=F) { if (length(genesignif1) != length(couleur) ) print("Error: vectors donĄ¯t have the same lengths") else { if (boxplot != TRUE) { mean1=function(x) mean(x,na.rm=T) means1=as.vector(tapply(genesignif1,couleur,mean1)); se1= as.vector(tapply(genesignif1,couleur,stderr1)) par(mfrow=c(1,1)) barplot(means1, names.arg=names(table(couleur) ),col= names(table(couleur) ) ,ylab=labely) err.bp(as.vector(means1), as.vector(1.96*se1), two.side=T)} else { boxplot(split(genesignif1,couleur),notch=T,varwidth=T, col= names(table(couleur) ),ylab=labely)} title(paste(title1,", p-value=", signif(kruskal.test(genesignif1,factor(couleur))$p.value,2))) } } # end of function # IGNORE THIS... # =================================================== #The function fisherPvector allows one to compute Fisher exact p-values # Thus it allows one to carry out an EASE analysis # Output: a table of FisherĄ¯s exact p-values # Input: annotation1 is a vector of gene annotations # Input: couleur (French for color) denotes the module color of each gene # Only those gene functions (levels of annotation1) that occur a certain mininum number of times # (parameter= minNumberAnnotation) in the data set will be considered. if (exists("fisherPvector" ) ) rm(fisherPvector); fisherPvector=function(couleur,annotation1,minNumberAnnotation=50) { levelsannotation1=levels(annotation1) levelscouleur=levels(factor(couleur)) no.couleur=length(levelscouleur) restannotation1=table(annotation1)>minNumberAnnotation no.annotation=sum( restannotation1) datoutP=data.frame(matrix(666,nrow=no.annotation,ncol=no.couleur) ) #datoutProp=data.frame(matrix(666,nrow=no.annotation,ncol=2*no.couleur) ) #names(datoutProp)=paste("Prop",paste( rep(levelscouleur ,rep(2, length(levelscouleur))) ) , c("Y","N") ,sep=".") datoutProp=data.frame(matrix(666,nrow=no.annotation,ncol=no.couleur) ) names(datoutProp)=paste("Perc",levelscouleur , sep=".") names(datoutP)=paste("p",levelscouleur,sep=".") restlevelsannotation1= levelsannotation1[restannotation1] row.names(datoutP)= restlevelsannotation1 for (i in c(1:no.annotation) ) { for (j in c(1:no.couleur) ){ tab1=table( annotation1 !=restlevelsannotation1[i], couleur !=levelscouleur[j]) datoutP[i,j]=signif(fisher.test(tab1)$p.value,2) #datoutProp[i,2*j-1]=signif(tab1[1,1]/sum(tab1[,1] ),2) #datoutProp[i,2*j]= signif(tab1[1,2]/sum(tab1[,2]) ,2) } table2=table(annotation1 !=restlevelsannotation1[i], couleur) datoutProp[i,]= signif(table2[1,]/apply(table2,2,sum),2) } data.frame(datoutP,datoutProp) } # end of function fisherPvector ##################################################################################################### ################################################################################################################################ # F) Carrying out a within module analysis (computing intramodular connectivity etc) ##################################################################################################### ################################################################################################################################ # =================================================== #The function DegreeInOut computes for each gene #a) the total number of connections, #b) the number of connections with genes within its module, #c) the number of connections with genes outside its module # When scaledToOne=TRUE, the within module connectivities are scaled to 1, i.e. the max(K.Within)=1 for each module if (exists("DegreeInOut")) rm(DegreeInOut); DegreeInOut =function(adj1, couleur,scaledToOne=FALSE) { no.nodes=length(couleur) couleurlevels=levels(factor(couleur)) no.levels=length(couleurlevels) kWithin=rep(-666,no.nodes ) diag(adj1)=0 for (i in c(1:no.levels) ) { rest1=couleur==couleurlevels[i]; if (sum(rest1) <3 ) { kWithin[rest1]=0 } else { kWithin[rest1]=apply(adj1[rest1,rest1],2,sum) if (scaledToOne) kWithin[rest1]=kWithin[rest1]/max(kWithin[rest1])} } kTotal= apply(adj1,2,sum) kOut=kTotal-kWithin if (scaledToOne) kOut=NA kDiff=kWithin-kOut data.frame(kTotal,kWithin,kOut,kDiff) } # ======================================================================= # The function WithinModuleCindex1 relates the node measures (e.g. connectivities) # to "external" node significance information within each module, # i.e. it carries out a by module analysis. # Output: first column reports the spearman correlation p-value between the network variable and the # node significance. The next columns contain the Spearman correlations between the variables. if (exists("WithinModuleAnalysis1")) rm(WithinModuleAnalysis1); WithinModuleAnalysis1=function(datnetwork,nodesignif, couleur) { cortesthelp=function( x ) { len1=dim(x)[[2]]-1 out1=rep(666, len1); for (i in c(1:len1) ) {out1[i]= signif( cor.test(x[,i+1], x[,1], method="s",use="p" )$p.value ,2) } data.frame( variable=names(x)[-1] , NS.CorPval=out1, NS.cor=t(signif(cor (x[,1], x[,-1],use="p",method="s"),2)), signif(cor(x[,-1],use="p",method="s"),2) ) } #end of function cortesthelp print("IGNORE the warnings..."); by( data.frame(nodesignif, datnetwork),couleur,cortesthelp); } #end of function WithinModuleAnalysis # ======================================================================= # The function WithinModuleCindex1 relates the node measures (e.g. connectivities) # to "external" node significance information within each module, # i.e. it carries out a by module analysis. # BUT it focuses on the C-index also known as area under the ROC curve # This measure is related to Kendall's Tau statistic and Somer's D, # see F. Harrel (Regression Modeling Strategies). Springer. # It requires the following library library(Hmisc) # Output: the first column reports the C-index and the second, p-value if (exists("WithinModuleCindex1")) rm(WithinModuleCindex1); WithinModuleCindex1=function(datnetwork,nodesignif, couleur) { CindexFun=function( x ) { len1=dim(x)[[2]]-1 outC=rep(666, len1); outP=rep(666, len1); for (i in c(1:len1) ) {rcor1=rcorr.cens(x[,i+1], x[,1]) outC[i]=rcor1[[1]] outP[i]=1- pnorm(abs(rcor1[[2]]/rcor1[[3]])) } data.frame( variable=names(x)[-1] , C.index=outC, p.value=outP) } #end of function CindexFun #print("IGNORE the warnings..."); by( data.frame(nodesignif, datnetwork),couleur,CindexFun); } #end of function WithinModuleAnalysis # The following function allows on to plot a gene (node) significance measure versus # connectivity. if(exists("plotConnectivityGeneSignif1") ) rm( plotConnectivityGeneSignif1); plotConnectivityGeneSignif1=function(degree1,genesignif1,color1="black", title1="Gene Significance vs Connectivity" , xlab1="Connectivity", ylab1="GeneSignificance") { lm1=lm(genesignif1~degree1 ,na.action="na.omit") plot(degree1, genesignif1, col=color1,ylab=ylab1,xlab=xlab1,main=paste(title1, ", cor=", signif(cor( genesignif1,degree1, method="s",use="p" ) ,2) )) abline(lm1) } ##################################################################################################### ################################################################################################################################ # G) Miscellaneous other functions, e.g. for computing the cluster coefficient. ##################################################################################################### ################################################################################################################################ # =================================================== # The function ClusterCoef.fun computes the cluster coefficients. # Input is an adjacency matrix if(exists("ClusterCoef.fun")) rm(ClusterCoef.fun) ; ClusterCoef.fun=function(adjmat1) { diag(adjmat1)=0 no.nodes=dim(adjmat1)[[1]] computeLinksInNeighbors <- function(x, imatrix){x %*% imatrix %*% x} nolinksNeighbors <- c(rep(-666,no.nodes)) total.edge <- c(rep(-666,no.nodes)) maxh1=max(as.dist(adjmat1) ); minh1=min(as.dist(adjmat1) ); if (maxh1>1 | minh1 < 0 ) {print(paste("ERROR: the adjacency matrix contains entries that are larger than 1 or smaller than 0!!!, max=",maxh1,", min=",minh1)) } else { nolinksNeighbors <- apply(adjmat1, 1, computeLinksInNeighbors, imatrix=adjmat1) plainsum <- apply(adjmat1, 1, sum) squaresum <- apply(adjmat1^2, 1, sum) total.edge = plainsum^2 - squaresum CChelp=rep(-666, no.nodes) CChelp=ifelse(total.edge==0,0, nolinksNeighbors/total.edge) CChelp} } # end of function # =================================================== # The function err.bp is used to create error bars in a barplot # usage: err.bp(as.vector(means), as.vector(stderrs), two.side=F) err.bp<-function(daten,error,two.side=F){ if(!is.numeric(daten)) { stop("All arguments must be numeric")} if(is.vector(daten)){ xval<-(cumsum(c(0.7,rep(1.2,length(daten)-1)))) }else{ if (is.matrix(daten)){ xval<-cumsum(array(c(1,rep(0,dim(daten)[1]-1)), dim=c(1,length(daten))))+0:(length(daten)-1)+.5 }else{ stop("First argument must either be a vector or a matrix") } } MW<-0.25*(max(xval)/length(xval)) ERR1<-daten+error ERR2<-daten-error for(i in 1:length(daten)){ segments(xval[i],daten[i],xval[i],ERR1[i]) segments(xval[i]-MW,ERR1[i],xval[i]+MW,ERR1[i]) if(two.side){ segments(xval[i],daten[i],xval[i],ERR2[i]) segments(xval[i]-MW,ERR2[i],xval[i]+MW,ERR2[i]) } } } # =================================================== # this function computes the standard error if (exists("stderr1")) rm(stderr1) stderr1 <- function(x){ sqrt( var(x,na.rm=T)/sum(!is.na(x)) ) } # =================================================== # The following two functions are for displaying the pair-wise correlation in a panel when using the command "pairs()" # Typically, we use "pairs(DATA, upper.panel=panel.smooth, lower.panel=panel.cor, diag.panel=panel.hist)" to # put the correlation coefficients on the lower panel. panel.cor <- function(x, y, digits=2, prefix="", cex.cor){ usr <- par("usr"); on.exit(par(usr)) par(usr = c(0, 1, 0, 1)) r <- abs(cor(x, y)) txt <- format(c(r, 0.123456789), digits=digits)[1] txt <- paste(prefix, txt, sep="") if(missing(cex.cor)) cex <- 0.8/strwidth(txt) text(0.5, 0.5, txt, cex = cex * r) } panel.hist <- function(x, ...){ usr <- par("usr"); on.exit(par(usr)) par(usr = c(usr[1:2], 0, 1.5) ) h <- hist(x, plot = FALSE) breaks <- h$breaks; nB <- length(breaks) y <- h$counts; y <- y/max(y) rect(breaks[-nB], 0, breaks[-1], y, col="cyan", ...) } # =================================================== # this function computes the standard error if (exists("stderr1")) rm(stderr1); stderr1 <- function(x){ sqrt( var(x,na.rm=T)/sum(!is.na(x)) ) } # =================================================== # This function collects garbage if (exists("collect_garbage")) rm(collect_garbage); collect_garbage=function(){while (gc()[2,4] != gc()[2,4] | gc()[1,4] != gc()[1,4]){}} collect_garbage() # this function is used for computing the Rand index below... # =================================================== if (exists("choosenew") ) rm(choosenew) choosenew <- function(n,k){ n <- c(n) out1 <- rep(0,length(n)) for (i in c(1:length(n)) ){ if (n[i]=1){ staticCutCluster = cutTreeStatic(hiercluster=hierclust, heightcutoff=0.99, minsize1=minModuleSize) }else{ staticCutCluster = cutTreeStatic(hiercluster=hierclust, heightcutoff=maxTreeHeight, minsize1=minModuleSize) } #get tree height for every singleton #node_index tree_height demdroHeiAll= rbind( cbind(hierclust$merge[,1], hierclust$height), cbind(hierclust$merge[,2], hierclust$height) ) #singletons will stand at the front of the list myorder = order(demdroHeiAll[,1]) #get # of singletons no.singletons = length(hierclust$order) demdroHeiAll.sort = demdroHeiAll[myorder, ] demdroHei.sort = demdroHeiAll.sort[c(1:no.singletons), ] demdroHei = demdroHei.sort[seq(no.singletons, 1, by=-1), ] demdroHei[,1] = -demdroHei[,1] # combine with prelimilary cluster-cutoff results demdroHei = cbind(demdroHei, as.integer(staticCutCluster)) # re-order the order based on the dendrogram order hierclust$order demdroHei.order = demdroHei[hierclust$order, ] static.clupos = locateCluster(demdroHei.order[, 3]) if (is.null(static.clupos) ){ module.assign = rep(0, no.singletons) colcode.reduced = assignModuleColor(module.assign, minsize1=minModuleSize, anameallmodules=nameallmodules, auseblackwhite=useblackwhite) return ( colcode.reduced ) } static.no = dim(static.clupos)[1] static.clupos2 = static.clupos static.no2 = static.no #split individual cluster if there are sub clusters embedded mcycle=1 while(1==1){ clupos = NULL for (i in c(1:static.no)){ mydemdroHei.order = demdroHei.order[ c(static.clupos[i,1]:static.clupos[i,2]), ] #index to [1, clusterSize] mydemdroHei.order[, 1] = mydemdroHei.order[, 1] - static.clupos[i, 1] + 1 #cat("Cycle ", as.character(mcycle), "cluster (", static.clupos[i,1], static.clupos[i,2], ")\n") #cat("i=", as.character(i), "\n") iclupos = processInvididualCluster(mydemdroHei.order, cminModuleSize = minModuleSize, cminAttachModuleSize = minAttachModuleSize) iclupos[,1] = iclupos[,1] + static.clupos[i, 1] -1 #recover the original index iclupos[,2] = iclupos[,2] + static.clupos[i, 1] -1 clupos = rbind(clupos, iclupos) #put in the final output buffer } if(deepSplit==FALSE){ break } if(dim(clupos)[1] != static.no) { static.clupos = clupos static.no = dim(static.clupos)[1] }else{ break } mcycle = mcycle + 1 #static.clupos } final.cnt = dim(clupos)[1] #assign colors for modules module.assign = rep(0, no.singletons) module.cnt=1 for (i in c(1:final.cnt )) { sdx = clupos[i, 1] #module start point edx = clupos[i, 2] #module end point module.size = edx - sdx +1 if(module.size = minsize1 no.modules=sum(modulebranch) colorhelp = rep(-1, length(labelpred) ) if ( no.modules==0){ print("No module detected") } else{ for (i in c(1:no.modules)) { colorhelp=ifelse(labelpred==modulename[i],i ,colorhelp) } } colorhelp } #leftOrright >0 : running length (with same sign) to right, otherwise to the left #mysign = -1: negative value, mysign = -1: positive value runlengthSign = function(mysequence, leftOrright=-1, mysign=-1){ seqlen = length(mysequence) if(leftOrright<0){ pseq = rev(mysequence) }else{ pseq = mysequence } if(mysign<0){ #see where the first POSITIVE number occurs nonezero.bool = (pseq > 0) }else{ #see where the first NEGATIVE number occur nonezero.bool = (pseq < 0) } if( sum(nonezero.bool) > 0){ runlength = min( c(1:seqlen)[nonezero.bool] ) - 1 }else{ runlength = 0 } } #"0" is for grey module assignModuleColor = function(labelpred, minsize1=50, anameallmodules=FALSE, auseblackwhite=FALSE) { # here we define modules by using a height cut-off for the branches #labelpred= cutree(hiercluster,h=heightcutoff) #cat(labelpred) #"0", grey module doesn't participate color assignment, directly assigned as "grey" labelpredNoZero = labelpred[ labelpred >0 ] sort1=-sort(-table(labelpredNoZero)) sort1 modulename= as.numeric(names(sort1)) modulebranch= sort1 >= minsize1 no.modules=sum(modulebranch) # now we assume that there are fewer than 10 modules colorcode=c("turquoise","blue","brown","yellow","green","red","black","pink","magenta","purple","greenyellow","tan","salmon","cyan", "midnightblue", "lightcyan","grey60", "lightgreen", "lightyellow") #"grey" means not in any module; colorhelp=rep("grey",length(labelpred)) if ( no.modules==0){ print("No module detected\n") } else{ if ( no.modules > length(colorcode) ){ print( paste("Too many modules \n", as.character(no.modules)) ) } if ( (anameallmodules==FALSE) || (no.modules <=length(colorcode)) ){ labeledModules = min(no.modules, length(colorcode) ) for (i in c(1:labeledModules)) { colorhelp=ifelse(labelpred==modulename[i],colorcode[i],colorhelp) } colorhelp=factor(colorhelp,levels=c(colorcode[1:labeledModules],"grey")) }else{#nameallmodules==TRUE and no.modules >length(colorcode) maxcolors=length(colorcode) labeledModules = no.modules extracolors=NULL blackwhite=c("red", "black") for(i in c((maxcolors+1):no.modules)){ if(auseblackwhite==FALSE){ icolor=paste("module", as.character(i), sep="") }else{#use balck white alternatively represent extra colors, for display only #here we use the ordered label to avoid put the same color for two neighboring clusters icolor=blackwhite[1+(as.integer(modulename[i])%%2) ] } extracolors=c(extracolors, icolor) } #combine the true-color code and the extra colorcode into a uniform colorcode for #color assignment allcolorcode=c(colorcode, extracolors) for (i in c(1:labeledModules)) { colorhelp=ifelse(labelpred==modulename[i],allcolorcode[i],colorhelp) } colorhelp=factor(colorhelp,levels=c(allcolorcode[1:labeledModules],"grey")) } } colorhelp } #locate the start/end positions of each cluster in the ordered cluster label sequence #where "-1" indicating no cluster #3-1 -1 1 1 1 1 2 2 2 #3 3 -1-1 1 1 1 1 2 2 2 (shift) #--------------------------------- #0-4 0 2 0 0 0 1 0 0 0 (difference) # * * @ locateCluster = function(clusterlabels) { no.nodes = length(clusterlabels) clusterlabels.shift = c(clusterlabels[1], c(clusterlabels[1:(no.nodes-1)]) ) #a non-zero point is the start point of a cluster and it previous point is the end point of the previous cluster label.diff = abs(clusterlabels - clusterlabels.shift) #process the first and last positions as start/end points if they belong to a cluster instead of no cluster "-1" if(clusterlabels[1] >0) {label.diff[1]=1} if(clusterlabels[no.nodes]>0) {label.diff[no.nodes]=1} flagpoints.bool = label.diff > 0 if( sum(flagpoints.bool) ==0){ return(NULL) } flagpoints = c(1:no.nodes)[flagpoints.bool] no.points = length(flagpoints) myclupos=NULL for(i in c(1:(no.points-1)) ){ idx = flagpoints[i] if(clusterlabels[idx]>0){ myclupos = rbind(myclupos, c(idx, flagpoints[i+1]-1) ) } } myclupos } #input is the cluster demdrogram of an individual cluster, we want to find its embbeded subclusters #execution order: mean-height ==> (mean+max)/2 ==> (mean+min)/2 #useMean: =0 ~ use mean-height as calibation line # =1 ~ use (mean+max)/2 as calibation line to detect relatively a small cluster sitting on the head of a bigger one, # so mean-height is too low to detect the two modules. # =-1~ use (mean+min)/2 as calibation line to detect relatively a small cluster sitting on the tail of a bigger one, # so mean-height & (mean+max)/2 are too high to detect the two modules processInvididualCluster = function(clusterDemdroHei, cminModuleSize=50, cminAttachModuleSize=100, minTailRunlength=12, useMean=0){ #for debug: use all genes #clusterDemdroHei =demdroHei.order no.cnodes = dim(clusterDemdroHei)[1] cmaxhei = max(clusterDemdroHei[, 2]) cminhei = min(clusterDemdroHei[, 2]) cmeanhei = mean(clusterDemdroHei[, 2]) cmidhei = (cmeanhei + cmaxhei)/2.0 cdwnhei = (cmeanhei + cminhei)/2.0 if (useMean==1){ comphei = cmidhei }else if (useMean==-1){ comphei = cdwnhei }else{ #normal case comphei = cmeanhei } # compute height diffrence with mean height heidiff = clusterDemdroHei[,2] - comphei heidiff.shift = shiftSequence(heidiff, -1) # get cut positions # detect the end point of a cluster, whose height should be less than meanhei # and the node behind it is the start point of the next cluster which has a height above meanhei cuts.bool = (heidiff<0) & (heidiff.shift > 0) cuts.bool[1] = TRUE cuts.bool[no.cnodes] = TRUE if(sum(cuts.bool)==2){ if (useMean==0){ new.clupos=processInvididualCluster(clusterDemdroHei=clusterDemdroHei, cminModuleSize=cminModuleSize, cminAttachModuleSize=cminAttachModuleSize, useMean=1) }else if(useMean==1){ new.clupos=processInvididualCluster(clusterDemdroHei=clusterDemdroHei, cminModuleSize=cminModuleSize, cminAttachModuleSize=cminAttachModuleSize, useMean=-1) }else{ new.clupos = rbind(c(1, no.cnodes)) } return (new.clupos) } #a good candidate cluster-end point should have significant # of ahead nodes with head < meanHei cutindex =c(1:no.cnodes)[cuts.bool] no.cutps = length(cutindex) runlens = rep(999, no.cutps) cuts.bool2 = cuts.bool for(i in c(2:(no.cutps-1)) ){ seq = c( (cutindex[i-1]+1):cutindex[i] ) runlens[i] = runlengthSign(heidiff[seq], leftOrright=-1, mysign=-1) if(runlens[i] < minTailRunlength){ #cat("run length=", runlens[i], "\n") cuts.bool2[ cutindex[i] ] = FALSE } } #attach SMALL cluster to the left-side BIG cluster if the small one has smaller mean height cuts.bool3=cuts.bool2 if(sum(cuts.bool2) > 3) { curj = 2 while (1==1){ cutindex2 =c(1:no.cnodes)[cuts.bool2] no.clus = length(cutindex2) -1 if (curj>no.clus){ break } pre.sdx = cutindex2[ curj-1 ]+1 #previous module start point pre.edx = cutindex2[ curj ] #previous module end point pre.module.size = pre.edx - pre.sdx +1 pre.module.hei = mean(clusterDemdroHei[c(pre.sdx:pre.edx) , 2]) cur.sdx = cutindex2[ curj ]+1 #previous module start point cur.edx = cutindex2[ curj+1 ] #previous module end point cur.module.size = cur.edx - cur.sdx +1 cur.module.hei = mean(clusterDemdroHei[c(cur.sdx:cur.edx) , 2]) #merge to the leftside major module, don't change the current index "curj" #if( (pre.module.size >minAttachModuleSize)&(cur.module.hei 2){ if( (cutindex2[no.cutps] - cutindex2[no.cutps-1]+1) < cminModuleSize ){ cuts.bool2[ cutindex2[no.cutps-1] ] =FALSE } } if(1==2){ myseqnce = c(2300:3000) cutdisp = ifelse(cuts.bool2==T, "red","grey" ) #re-order to the normal one with sequential signleton index par(mfrow=c(3,1), mar=c(0,0,0,0) ) plot(h1row, labels=F, xlab="",ylab="",main="",sub="",axes = F) barplot(heidiff[myseqnce], col= "black", space=0, border=F,main="", axes = F, axisnames = F) barplot(height=rep(1, length(cutdisp[myseqnce])), col= as.character(cutdisp[myseqnce]), space=0, border=F,main="", axes = F, axisnames = F) par(mfrow=c(1,1), mar=c(5, 4, 4, 2) + 0.1) } cutindex2 = c(1:no.cnodes)[cuts.bool2] cutindex2[1]=cutindex2[1]-1 #the first no.cutps2 = length(cutindex2) if(no.cutps2 > 2){ new.clupos = cbind( cutindex2[c(1:(no.cutps2-1))]+1, cutindex2[c(2:no.cutps2)] ) }else{ new.clupos = cbind( 1, no.cnodes) } if ( dim(new.clupos)[1] == 1 ){ if (useMean==0){ new.clupos=processInvididualCluster(clusterDemdroHei=clusterDemdroHei, cminModuleSize=cminModuleSize, cminAttachModuleSize=cminAttachModuleSize, useMean=1) }else if(useMean==1){ new.clupos=processInvididualCluster(clusterDemdroHei=clusterDemdroHei, cminModuleSize=cminModuleSize, cminAttachModuleSize=cminAttachModuleSize, useMean=-1) } } new.clupos } findClustersSignificant=function(mysequence, modulecolor) { modnames= names( table(modulecolor) ) mysize = length(modulecolor) validseq = rep(TRUE, mysize) for (each in modnames ){ mybool = (modulecolor==each) mymodulesig = mysequence[mybool] mydiff = abs(mymodulesig - mymodulesig[1]) if(sum(mydiff)==0){ validseq = ifelse(mybool==TRUE, FALSE, validseq) } } validseq } #leftOrright >0 : running length (with same sign) to right, otherwise to the left #mysign = -1: negative value, mysign = -1: positive value runlengthSign = function(mysequence, leftOrright=-1, mysign=-1){ seqlen = length(mysequence) if(leftOrright<0){ pseq = rev(mysequence) }else{ pseq = mysequence } if(mysign<0){ #see where the first POSITIVE number occurs nonezero.bool = (pseq > 0) }else{ #see where the first NEGATIVE number occur nonezero.bool = (pseq < 0) } if( sum(nonezero.bool) > 0){ runlength = min( c(1:seqlen)[nonezero.bool] ) - 1 }else{ runlength = 0 } } #delta >0 : shift to right, otherwise to the left shiftSequence = function(mysequence, delta){ seqlen = length(mysequence) if(delta>0){ finalseq=c(mysequence[1:delta], mysequence[1:(seqlen-delta)]) }else{ posdelta = -delta finalseq=c(mysequence[(posdelta+1):seqlen], mysequence[(seqlen-posdelta+1):seqlen]) } finalseq } #no of neighbors behind and before the point used for average averageSequence=function(mysequence, noneighbors){ sumseq = mysequence for(i in c(1:noneighbors)){ iseq = shiftSequence(mysequence, i) sumseq = sumseq + iseq iseq = shiftSequence(mysequence, -i) sumseq = sumseq + iseq } sumseq = sumseq/(1+2*noneighbors) } #delta >0 : shift to right, otherwise to the left shiftSequence = function(mysequence, delta){ seqlen = length(mysequence) if(delta>0){ finalseq=c(mysequence[1:delta], mysequence[1:(seqlen-delta)]) }else{ posdelta = -delta finalseq=c(mysequence[(posdelta+1):seqlen], mysequence[(seqlen-posdelta+1):seqlen]) } finalseq } #find the middle of each cluster and label the middle position with the corrsponding color getDisplayColorSequence=function(colordered){ mylen = length(colordered) colordered2 = c(colordered[1], colordered[1:(mylen-1)] ) colordiff = (colordered != colordered2) colordiff[1] = TRUE colordiff[mylen] = TRUE #mydispcolor = ifelse(colordiff==TRUE, colordered, "") mydispcolor = rep("", mylen) mytrueseq = c(1:mylen)[colordiff] for (i in c(1:(length(mytrueseq)-1)) ){ midi = (mytrueseq[i] + mytrueseq[i+1])/2 mydispcolor[midi] = colordered[midi] } fdispcolor = ifelse(mydispcolor=="grey", "", mydispcolor) fdispcolor } #use height cutoff to remove cutTreeStatic = function(hiercluster,heightcutoff=0.99, minsize1=50) { # here we define modules by using a height cut-off for the branches labelpred= cutree(hiercluster,h=heightcutoff) sort1=-sort(-table(labelpred)) sort1 modulename= as.numeric(names(sort1)) modulebranch= sort1 >= minsize1 no.modules=sum(modulebranch) colorhelp = rep(-1, length(labelpred) ) if ( no.modules==0){ print("No module detected") } else{ for (i in c(1:no.modules)) { colorhelp=ifelse(labelpred==modulename[i],i ,colorhelp) } } colorhelp } #merge the minor cluster into the major cluster merge2Clusters = function(mycolorcode, mainclusterColor, minorclusterColor){ mycolorcode2 = ifelse(as.character(mycolorcode)==minorclusterColor, mainclusterColor, as.character(mycolorcode) ) fcolorcode =factor(mycolorcode2) fcolorcode } ############################################################################################################################### # I) GENERAL STATISTICAL FUNCTIONS mean1=function(x) mean(x,na.rm=T) var1=function(x) var(x,na.rm=T) if (exists("scatterplot1") ) rm(scatterplot1); scatterplot1=function(x,y, title1="",col1="black",xlab1="x",ylab1="y" , cex1=1, cex.axis1=1.5,cex.lab1=1.5, cex.main1=1.5 ,ylim1=-1 ){ cor1=signif(cor(x,y,use="p",method="s"),2) corp=signif(cor.test(x,y,use="p",method="s")$p.value,2) if (corp<10^(-20) ) corp="<10^{-20}" if ( length(ylim1)==2) {plot(x,y, main=paste(title1,"cor=", cor1,"p=",corp),col=col1,xlab=xlab1,ylab=ylab1, cex=cex1, cex.axis=cex.axis1,cex.lab=cex.lab1, cex.main=cex.main1,ylim=ylim1)} else { plot(x,y, main=paste(title1,"cor=", cor1,"p=",corp),col=as.character(col1),xlab=xlab1,ylab=ylab1, cex=cex1, cex.axis=cex.axis1,cex.lab=cex.lab1, cex.main=cex.main1)} } # Functions for Network Screening if(exists("HubGeneSignificance") ) rm(HubGeneSignificance) HubGeneSignificance=function(k,GS,module,NumberHubs=10){ colorlevels=levels(factor(module)) HGS=rep(NA, length(colorlevels) ) for (i in c(1:length(colorlevels) ) ) { restmodule= module==colorlevels[i] GSModule= GS[restmodule] kModule=k[restmodule] HGS[i]=mean(GSModule[rank(-kModule, ties.method="first")<=NumberHubs],na.rm=T) } # end of for loop barplot(HGS,col=colorlevels, ylab="Mean Hub Gene Significance", main=paste("Hub Gene Significance, top", NumberHubs, "hubs")) datout=data.frame(matrix(HGS,nrow=1,ncol=length(colorlevels))); names(datout)=colorlevels; datout } # end of function if(exists("StandardScreening1") ) rm(StandardScreening1); StandardScreening1=function( GS, LN=c(5,10) ) { datout=data.frame(matrix(F, nrow=length(GS),ncol=length(LN) )) names(datout)=paste("List", LN,sep="") for ( j in c(1:length(LN)) ) { datout[,j]=rank(-GS,ties.method="first")<=LN[j] } datout } # end of function StandardScreening1 if(exists("NetworkScreening1") ) rm(NetworkScreening1) NetworkScreening1=function(datE, GS, MLN=1000, LN=10,beta=6, minModuleSize=100,powerAllocation=3, NumberHubsHGS=50 , excludegrey=F,excludeturquoise=F, consider.sign.corKGS=F) { if (length(GS) != dim(datE)[[2]] ) print("Error: length(GS) not compatible with datE. Please check whether length(GS) = dim(datE)[[2]]?") if ( max(LN)>MLN ) print("Error: requested list number bigger than maximum list number. Please increase MLN or decrease LN") if ( MLNevery gene is grouped into the turquoise module") if (length(GS) == dim(datE)[[2]] & max(LN) <= MLN ){ GS=abs(GS) restMLN=rank(-GS, ties.method="first")<=MLN GSrest=GS[restMLN] ADJ= abs(cor(datE[, restMLN], use="p"))^beta h1=hclust( as.dist(TOMdist1(ADJ)) , method="average") colorGS=rep("turquoise", dim(ADJ)[[2]] ) if (MLN > minModuleSize ) {colorGS= factor(cutreeDynamic(h1, minModuleSize= minModuleSize)) } colorGSlevels=levels(factor(colorGS)) if ( length(colorGSlevels)==1 ) { HGS=1; k=apply(ADJ,2,sum) } else { k= DegreeInOut(ADJ ,colorGS,scaledToOne=FALSE)$kWithin HGS=as.vector(as.matrix(HubGeneSignificance(k=k, GS=GSrest,module=colorGS,NumberHubs=NumberHubsHGS)[1,])) # the following definition of HGS uses the correlation between k and GS... if ( consider.sign.corKGS==T ) { for (i in c(1:length(colorGSlevels))){ HGS[i]=(cor( k[colorGS==colorGSlevels[i]], GSrest[colorGS==colorGSlevels[i]], use="p" ))} } # end of for loop } # end of if (consider.sign ) if (excludegrey) HGS[colorGSlevels=="grey"]=0 if (excludeturquoise==T) HGS[colorGSlevels=="turquoise"]=0 weightHGS=abs(HGS)^powerAllocation weightHGS=weightHGS/sum(weightHGS) datout=data.frame(matrix(F, nrow=length(GS),ncol=length(LN) )) names(datout)=paste("List", LN,sep="") for ( j in c(1:length(LN)) ) { LNcolor=round(LN[j]*weightHGS) maxNoColorGS=tapply(colorGS,colorGS,length) # if there are more hubs than the module size, then we pick genes from the next module # with highest hub gene significance for (ii in c(1:length(colorGSlevels))) { ExcessNo=sum(c(LNcolor-maxNoColorGS)[LNcolor>maxNoColorGS]) indexHighestWeightHGS=rank(-weightHGS, ties.method="first")==ii LNcolor[LNcolor>maxNoColorGS]=maxNoColorGS[LNcolor>maxNoColorGS] LNcolor[indexHighestWeightHGS ]= LNcolor[indexHighestWeightHGS ]+ExcessNo LNcolor[indexHighestWeightHGS ]=LN[j]- sum(LNcolor[!indexHighestWeightHGS]) } LNcolor[LNcolor>maxNoColorGS]=maxNoColorGS[LNcolor>maxNoColorGS] PickHubs=rep(F,length(GS)) for (i in c(1:length(colorGSlevels))){ # the following takes the sign between K and GS into account signKGS=1 if ( consider.sign.corKGS==T ) {signKGS=sign(cor( k[colorGS==colorGSlevels[i]], GSrest[colorGS==colorGSlevels[i]], use="p" ))} PickHubs[restMLN][ colorGS==colorGSlevels[i]] =rank(-signKGS*k[colorGS==colorGSlevels[i]], ties.method="first")<=LNcolor[i] } # end of for loop datout[,j]=PickHubs print(paste("For list", j, "with LN=",LN[j], "the algorithm picks the following number of hubs in each module")) print( data.frame(colorGSlevels, modulesize=table(colorGS), HubGeneSignif=signif(HGS,2), weightHGS=signif(weightHGS,2), LNcolor )) } # end of for ( j in c(1:length(LN)) ) datout } # end of if if (length(GS) == dim(datE)[[2]] & max(LN) < MLN } # end of function NetworScreening1