#**************************************************************************************** # # Gene co-Expression Network Analysis Package # # # Objective: # Automatic construction of /dichotomized weighted gene co-expression # networks and identify tightly co-expressed modules in networks # # Input: DNA microarray data file # # Output: Topological overlap matrix (TOM) and gene co-expressed modules # # Authors: Bin Zhang and Steve Horvath # # Array Data Analysis Group (ADAG) # Departments of Human Genetics and Biostatistics # University of California at Los Angeles # # Contact: shorvath@mednet.ucla.edu and Bin Zhang [binzhang.ucla@gmail.com] # # Copyright @2004-2008 ADAG,UCLA # # Date: Nov. 18, 2004 # Comment by Steve Horvath: This code is very similar to the network code posted in the tutorial of our network webpage: # http://www.genetics.ucla.edu/labs/horvath/GeneralFramework/ # Main differences: # 1) this code is not interactive. It allows you to run a batch analysis of the network. # And it outputs all graphs and figures into a directory. # 2) The code is older than the one at our main webpage. # 3) This code underlies the figures and tables in the following manuscript # Carlson MRJ, Zhang B, Fang Z, Mischel PS, Horvath S, and Nelson SF (2006) # Gene Connectivity, Function, and Sequence Conservation: Predictions from Modular Yeast Co-Expression Networks. # Biomed Central Genomics. # 4) This code was adapted by Bin Zhang but is based on the code in our main webpage: www.genetics.ucla.edu/labs/horvath/GeneralFramework/ # 5) It contains the function cutreeDynamic for dynamic tree cutting. # #**************************************************************************************** #---------------------------------------------------------------------------------------- # Description of major variables # # INPUT: # # inputfname ~ DNA microarray data file # headCol ~ the number of columns holding gene information in the data file # # Output: # # outputDir ~ direcrory to output all results # logFname ~ logo file to record intermediate results # moduleFname ~ file including gene information, expression values and modules # imgScaleFree ~ the plot of frequency distribution p(k), k is # of links for a genes # imgCoeffK ~ scatter plot of gene cluster coefficient vs k # imgHierClust ~ plot of hierarchical clustering of topological overlap matrix (TOM) # imgExprBands ~ gene expression bands in top three modules discovered in the network # imgHeatMap ~ plot of clustering dendrogram, modules and heatmap of TOM # #---------------------------------------------------------------------------------------- #---------------------------------------------------------------------------------------- # Description of major parameters you have to mess with for this beast to run # # *inputfname: you have to adjust this to the right file # headCol: you have to specify the number of cols to skip # *corrlpower: you must adjust this to optimize the scale free-ness of the network # Notes on corrlpower:On this value you must have a reasoble and consistent mean no of links # mean(nolinks) should be about the same for each (30 is decent) # You should also have a good R square for the line fit (.7 or so should be decent) # # *rest1: set this to be the number of most connected genes to use in the network # (COULD try setting rest1 to 3500 for the env resp dataset to try and get better results... but lets just keep them all the same as much as possible...) # # oddFilter and evenFilter: make sure these are not used unless you want to thin the data... # *corhelp: may need to be tweaked in order to make the software ignore the anticorrelated values... # Note: when you change corhelp, you will have to definitely choose a new power (corrlpower) for getting the network to be scale free # (try double what you used before), and you probably ALSO have to adjust the cuttoff (myheightcutoff) # # *myheightcutoff: you must adjust this to choose the cutoff you wish to cut the modules out at # (I optimized this variable so that it hit a local minima of k for the module and also # reasonably sized modules) # *merge2Clusters() use this if you have two modules that clearly belong together (to merge them) (such as happened for the cell cycle dataset) # # *remember to re-enable imgHeatMap generation and the pajek code too. #---------------------------------------------------------------------------------------- rm(list=ls()) #destroys all the objects for a fresh start (better than restarting R) # --------------- Parameters to be changed ----------------------- setwd("C:/Documents and Settings/MCarlson/Desktop/Yeast Networks/Newest Yeast Network Stuff") #current working directory, #setwd("C:/NEW Yeast Network Stuff") #need to inform R of where the functions file is source("C:/Documents and Settings/MCarlson/Desktop/Yeast Networks/Newest Yeast Network Stuff/NetworkFunctionsForYEAST.txt") #inputfname = "DNADmg_ess_4000_OntTrms_BLST.xls" #gene expresion data, including gene information #inputfname = "CellCycle4000_OntTrms_BLST_noOut.xls" #gene expresion data, including gene information inputfname = "EnvResp_ess_4000_OntTrms_BLST.xls" #gene expresion data, including gene information headCol = 8 #number of columns for gene information (info. about the genes that is NOT expr data) outputDir ="Results/" #directory for holding results #Choose a beta value (or power for power func) (6 usually is a good place to start) #corrlpower = 6 #default value of beta for the scale free plot DNA Dmg. at 6, mean was about 30.37799, and R value was .83 #corrlpower = 5 #default value of beta for the scale free plot Cell Cyc. at 5, mean was about 24.88333, and R val was .71 #corrlpower = 9 #default value of beta for the scale free plot Env. Resp. at 9, mean was about 33.41891, and R value was .76 #Choose a beta value (or power for power func) THIS TIME for a corr-ONLY network (double what worked before is usually is a good place to start) #corrlpower = 12 #default value of beta for the scale free plot DNA Dmg. at 12, mean was about 32.09570, and R value was .79 #corrlpower = 10 #default value of beta for the scale free plot Cell Cyc. at 10, mean was about 33.70219, and R val was .62 corrlpower = 18 #default value of beta for the scale free plot Env. Resp. at 18, mean was about 26.43615, and R value was .77 #whichTOM =0: dichotomized networks #whichTOM =1: weighted networks by Power function #whichTOM =2: weighted networks by Sigmoid function whichTOM = 1 #default as weighted networks by Power function # --------------- End of Parameters to be changed ---------------- #library(MASS) library(class) library(mva) library(cluster) library(survival) library(rpart) library(sma) # this is needed for plot.mat below library(lattice) # require is design for use inside functions library(scatterplot3d) library(Hmisc) collect_garbage() memory.size(TRUE) # check the maximum memory that can be allocated memory.limit(size=2048) # increase the available memory to 2GB fname =getFileName(inputfname) logFname =paste(outputDir, fname, "_logo.txt", sep='') cutoffFname =paste(outputDir, fname, "_tomcutoff.xls", sep='') moduleFname =paste(outputDir, fname, "_tommodules.xls", sep='') imgScaleFree=paste(outputDir, fname, "_imgScaleFree.png",sep='') imgCoeffK =paste(outputDir, fname, "_imgCoeffK.png", sep='') imgHierClust=paste(outputDir, fname, "_imgHierCulst.png", sep='') imgExprBands=paste(outputDir, fname, "_imgExprBands.png",sep='') imgHeatMap =paste(outputDir, fname, "_imgHeatmap.png", sep='') cutoff3DR2 =paste(outputDir, fname, "_tomcutoff3D_R2.png", sep='') cutoff3DMk =paste(outputDir, fname, "_tomcutoff3D_Meank.png", sep='') #plots of the entire network AND just the essential module get saved into these filenames imgKvsEss_ALL=paste(outputDir, fname, "_imgKvsEss_ALL.png", sep='') imgKvsEss_EssMod=paste(outputDir, fname, "_imgKvsEss_EssMod.png", sep='') imgKvsCons_ALL=paste(outputDir, fname, "_imgKvsCons_ALL.png", sep='') imgKvsCons_EssMod=paste(outputDir, fname, "_imgKvsCons_EssMod.png", sep='') imgKvsCons_ProtMod=paste(outputDir, fname, "_imgKvsCons_ProtMod.png", sep='') imgKvsEss_ProtMod=paste(outputDir, fname, "_imgKvsEss_ProtMod.png", sep='') imgTOvsEss_ALL=paste(outputDir, fname, "_imgTOvsEss_ALL.png", sep='') imgTOvsEss_EssMod=paste(outputDir, fname, "_imgTOvsEss_EssMod.png", sep='') imgTOvsCons_ALL=paste(outputDir, fname, "_imgTOvsCons_ALL.png", sep='') imgTOvsCons_EssMod=paste(outputDir, fname, "_imgTOvsCons_EssMod.png", sep='') imgTOvsCons_ProtMod=paste(outputDir, fname, "_imgTOvsCons_ProtMod.png", sep='') imgTOvsEss_ProtMod=paste(outputDir, fname, "_imgTOvsEss_ProtMod.png", sep='') imgMDS_Plot=paste(outputDir, fname, "_imgMDS_Plot.png", sep='') imgHierClustModule=paste(outputDir, fname, "_imgHierModules.png", sep='') imgCoeffKInmodule = paste(outputDir, fname, "_imgCoeffKInMod.png", sep='') imgModulSig =paste(outputDir, fname, "_moduleSignif.png", sep='') imgModulCC =paste(outputDir, fname, "_moduleCC.png", sep='') imgModConsSigBox =paste(outputDir, fname, "_moduleConsSignifBox.png", sep='') imgModConsSigBar =paste(outputDir, fname, "_moduleConsSignifBar.png", sep='') imgDistComp =paste(outputDir, fname, "_imgDistCompare.png", sep='') imgLinkComp =paste(outputDir, fname, "_imgLinkCompare.png", sep='') imgCCComp =paste(outputDir, fname, "_imgCCoeffCompare.png", sep='') imgTomComp =paste(outputDir, fname, "_imgTomCompare.png", sep='') imgNetVars =paste(outputDir, fname, "_imgNetVars", sep='') imgMDSplot =paste(outputDir, fname, "_imgMDS.png", sep='') pajekFname =paste(outputDir, fname, "_pajek.xls", sep='') #Files for the EASE results and the k.in information as well as the stat outputs for the kvsEss and KvsConservation plots EASE_Fname=paste(outputDir, fname, "_EASE.xls", sep='') K_Corr_Fname=paste(outputDir, fname, "_K_Corr.xls", sep='') #lets jus use one filename and append all our plot related tests to it. (k and to plots) tests_Fname=paste(outputDir, fname, "_Plot_tests_and_line_fits.xls", sep='') yfile <- file(logFname, "w+") close(yfile) #*------------------------------------------------------------------------------------- #* STEP 0: read in gene information, expression data #* #* #*------------------------------------------------------------------------------------- allMatrix <- read.delim(inputfname,sep="\t", header=T) attach(allMatrix) dim(allMatrix) genesInfor <- allMatrix[,c(1:headCol)] rowTitles=names(allMatrix) #These are the expression values datExpr <-t(allMatrix[,-c(1:headCol)]) dim(datExpr) #Normally this should be INACTIVE! #lets see what happens when we just grab the 1st half of the dataset #datExpr <- datExpr[c(78:155),] #dim(datExpr) # Bins method Keep this inactive too #Here I need to adjust the datExpr to only take the odd (or even) numbered rows #numCols = dim(datExpr)[1] #grab just the 1st element returned by dim() #mno= as.integer(numCols/2+1) #mno just returns 1/2 the value of cols #oindex= rep(c(TRUE, FALSE), mno) #repeat true and false an mno number of times #o2index = oindex[c(1:numCols)] #now define the matrix of odd cols #e2index = !o2index #and then the matrix of even cols #datExprOdd<-datExpr[o2index, ] #need to apply filter #dim(datExprOdd) #datExprEven<-datExpr[e2index, ] #need to apply filter #dim(datExprEven) # NORMALLY, THIS CODE SHOULD BE INACTIVE! # My method: #numCols = dim(datExpr)[1] #grab just the 1st element returned by dim() #oddFilter = seq(from=1, to=numCols, by=2) #oddFilter #evenFilter = seq(from=0, to=numCols, by=2) #evenFilter # Then Filter datExpr based on oddFilter or evenFilter #datExpr<-datExpr[evenFilter,] #need to apply filter #dim(datExpr) #*------------------------------------------------------------------------------------- #* STEP 1: compute correlation coefficient #* #* #* #*------------------------------------------------------------------------------------- corhelp <- cor(datExpr, use = "pairwise.complete.obs") no.genes <- dim(corhelp)[[2]] #to exclude anticorrelated relationships enable this corhelp <- (corhelp+1)/2 cc.done = FALSE #*------------------------------------------------------------------------------------- #* STEP 2: dichotomize correlation matrix based on an estimated correlation cutoff #* criteria: the resulted network should be scale-free #* #* #* notice: sometimes the default RsquaredCut(0.8) is too high, we need low down it #* #*------------------------------------------------------------------------------------- if (whichTOM == 0){ paraname ="tau=" #corrlcutoff = dichotcut(corrlMatrix=corhelp, noOFsamples=dim(datExpr)[1], mlogFname=cutoffFname, samplingSize=4000, RsquaredCut=0.8) collect_garbage() corrlcutoff =0.65 #preparation for computing Cluster Coefficients and TOM dichotCor <- abs(corhelp)>corrlcutoff diag(dichotCor)<- FALSE #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ #calculate Cluster Coefficient nolinks <- apply(dichotCor,2,sum, na.rm=TRUE) total.edge <- c(rep(-666,no.genes)) for (i in 1:no.genes){ #indexhelp= dichotCor[,i]==1 indexhelp= dichotCor[,i] > 0 total.edge [i] <- sum(dichotCor[indexhelp,indexhelp],na.rm=TRUE)/2 } cluster.coef = ifelse(nolinks>1,total.edge*2/nolinks/(nolinks-1),0) #vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv # Now we plot the cluster coefficient versus the number of links par(mfrow=c(1,1)) plot(nolinks,cluster.coef) # -------->>---------- output log file ----------- parainfor = paste(paraname, as.character(corrlcutoff), sep='') appendStringToFile(logFname, parainfor) mean(nolinks) cc.done = TRUE } if (whichTOM == 1 ) { paraname ="beta=" corrlcutoff = 0 #corrlpower = powercut(corrlMatrix=corhelp, noOFsamples=dim(datExpr)[1], mlogFname=cutoffFname, samplingSize=4000, RsquaredCut=0.8) collect_garbage() #In case someone wants to, they can redefine the beta value here for the scale free plots to see if its a better fit. #corrlpower = 6 corrlcutoff = corrlpower dichotCor <- abs(corhelp)^corrlpower diag(dichotCor)<- 0 nolinks <- apply(dichotCor,2,sum, na.rm=TRUE) # -------->>---------- output log file ----------- parainfor = paste(paraname, as.character(corrlcutoff), sep='') appendStringToFile(logFname, parainfor) if( 1==2 ){ #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ #the following denotes the number of connections (links) for each gene cluster.coef = computeClusterCoefficient(dichotCor) cc.done = TRUE #vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv } } if (whichTOM == 2 ) { paraname ="alpha=" corrlcutoff = 0 #mytau0=0.8 #corrlpower = sigmoidcut(corrlMatrix=corhelp, noOFsamples=dim(datExpr)[1], mlogFname=cutoffFname, samplingSize=4000, RsquaredCut=0.9, tau=mytau0) #search 2-d space (tau, alpha) #taualpha=sigmoidcutTauAlpha(mcorrlMatrix=corhelp, mnoOFsamples=dim(datExpr)[1], # mmlogFname=cutoffFname, msamplingSize=4000, mRsquaredCut=0.9) #mytau0 = as.numeric(taualpha[1]) #corrlpower = as.numeric(taualpha[2]) mytau0 = 0.8 corrlpower = 12 collect_garbage() corrlcutoff = corrlpower dichotCor <- sigmoidMatrix(abs(corhelp), corrlpower, tau=mytau0) #dichotCor <- apply(abs(corhelp), c(1,2), sigmoid, alpha=corrlpower, tau=mytau0) diag(dichotCor)<- 0 nolinks <- apply(dichotCor,2,sum, na.rm=TRUE) # -------->>---------- output log file ----------- parainfor = paste(paraname, as.character(corrlcutoff), ", tau0=", as.character(mytau0),sep='') appendStringToFile(logFname, parainfor) if( 1==1 ){ #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ #the following denotes the number of connections (links) for each gene #nolinksNeighbors <- c(rep(-666,no.genes)) #total.edge <- c(rep(-666,no.genes)) #nolinksNeighbors <- apply(dichotCor, 1, computeLinksInNeighbors, imatrix=dichotCor) #plainsum <- apply(dichotCor, 1, sum) #squaresum <- apply(dichotCor^2, 1, sum) #total.edge = plainsum^2 - squaresum #cluster.coef = nolinksNeighbors/total.edge cluster.coef = computeClusterCoefficient(dichotCor) cc.done = TRUE #vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv } } # let's check whether there is a scale free topology str_cutoff=paste(paraname,as.character(signif(corrlcutoff,3) )) suminfor=ScaleFreePlot(nolinks,no.breaks=15, mtitle=str_cutoff,truncated1=TRUE, tofile="") ScaleFreePlot(nolinks,no.breaks=15, mtitle=str_cutoff,truncated1=TRUE, tofile=imgScaleFree) # -------->>---------- output log file ----------- appendStringToFile(logFname, suminfor) #*------------------------------------------------------------------------------------- #* STEP 3: compute the topological overlap matrix (TOM), #* representing the interconnectedness of all pairs of genes #* #* #* notice: to have singificant modules, we will restrict the analysis to genes #* with a certain minimum number of links #* #*------------------------------------------------------------------------------------- mk=mean(nolinks) mk meaninfor=paste("mean(k)=", as.character(round(mk,2) ) ) appendStringToFile(logFname, meaninfor) #Set this to be the number of msot connected genes to be used in the network rest1= (rank(-nolinks) <= 3000) rtable=table(rest1) rtable # -------->>---------- output log file ----------- appendTableToFile(logFname, rtable) dichotCor.reduced = dichotCor[rest1,rest1] no.genes.reduced = dim(dichotCor.reduced)[1] nolinks.reduced = nolinks[rest1] genesInfor.reduced=genesInfor[rest1,] #Let’s calculate topological overlap matrix numTOM= dichotCor.reduced %*% dichotCor.reduced + dichotCor.reduced dist1 = matrix(NA, no.genes.reduced, no.genes.reduced) diag(dist1) <- 0 for (i in 1:(no.genes.reduced-1) ){ for (j in (i+1):no.genes.reduced){ denomTOMij = min(nolinks.reduced[i], nolinks.reduced[j]) + 1 - dichotCor.reduced[i,j] dist1[i,j] = 1- numTOM[i,j]/denomTOMij dist1[j,i] = dist1[i,j] } } #*------------------------------------------------------------------------------------- #* STEP 4: perform hierarchical clustering over TOM #* #* hierarchical clustering with average linkage #* #* #*------------------------------------------------------------------------------------- h1row <- hclust(as.dist(dist1),method="average") collect_garbage() # -------->>---------- output Hierarchical Clustering image ----------- openImgDev(imgHierClust) par(mfrow=c(1,1)) plot(h1row, labels=F, xlab="",ylab="",main="",sub="") dev.off() #options with "plot" for hclust, use help(hclust) plot(h1row, labels=F, xlab="",ylab="",main="",sub="") #*------------------------------------------------------------------------------------- #* STEP 5: detect and label modules in TOM based on #* the hierarchical clustering dendrogram #* #* #* notice: you may need manually adjust "heightcutoff", it is hard to #* estimate this cutoff #* #*------------------------------------------------------------------------------------- #these are the height cutoffs for when corhelp is excluding the anticorrelated values by shifting the entire set of network vals to the positive such that neg vales are 0 and uncorrelateds are .5) # good cutoffs for the DNA Dmg dataset #myheightcutoff = 0.97 #mydeepSplit = FALSE #myminModuleSize = 25 # good cutoffs for the CELL Cycle dataset #myheightcutoff = .98 #mydeepSplit = FALSE #myminModuleSize = 25 #for the Cell Cycle dataset, we need to merge the lightgreen and yellow modules #NOTE: This step has to be done BELOW just look for the line that follows this one... #colcode.reduced = merge2Clusters(colcode.reduced, mainclusterColor="yellow", minorclusterColor="greenyellow") # good cutoffs for the Env Stress dataset myheightcutoff = 1 mydeepSplit = TRUE myminModuleSize = 25 # old way to do tree cutoff #colcode.reduced = moduleDetectLabel(hiercluster=h1row, myheightcutoff, minsize1=myminModuleSize) #new way to do the cutoff colcode.reduced=cutreeDynamic(hierclust=h1row, deepSplit=mydeepSplit, maxTreeHeight =myheightcutoff, minModuleSize =myminModuleSize) table(colcode.reduced) #*-------------------------------Here is where you can start merging clusters if appropriate------------------------- # merge a minor cluster to a major cluster #colcode.reduced = merge2Clusters(colcode.reduced, mainclusterColor="pink", minorclusterColor="salmon") # for the Cell Cycle dataset, we need to merge the lightgreen and yellow modules # NOTE: This step has to be done HERE #colcode.reduced = merge2Clusters(colcode.reduced, mainclusterColor="yellow", minorclusterColor="greenyellow") #openImgDev(imgHierClustModule) par(mfrow=c(2,1), mar=c(0,0,0,0) ) plot(h1row, labels=F, xlab="",ylab="",main="",sub="",axes = F) barplot(height=rep(1, length(colcode.reduced)), col= as.character(colcode.reduced[h1row$order]), space=0, border=F,main="", axes = F, axisnames = F) par(mfrow=c(1,1), mar=c(5, 4, 4, 2) + 0.1) #dev.off() fdispcolor = getDisplayColorSequence(as.character(colcode.reduced[h1row$order])) par(mfrow=c(2,1), mar=c(6, 0, 0, 0), cex.lab=0.5) plot(h1row, labels=F, xlab="",ylab="",main="",sub="",axes = T) barplot(height=rep(1, length(colcode.reduced)), names.arg=fdispcolor, col= as.character(colcode.reduced[h1row$order]), space=0, las=2, border=F,main="", axes = T, axisnames = T, cex.lab=0.5) par(mfrow=c(1,1), mar=c(5, 4, 4, 2) + 0.1) #OPT: draw the hier cluster so that we can SEE the cutoff Easily #plot(h1row, labels=F, xlab="",ylab="",main="",sub="",axes = T) # Here we extend the color definition to all genes. # Recall that genes outside of modules are colored in grey color1=rep("grey",no.genes,) color1[rest1]=as.character(colcode.reduced) color1=factor(color1,levels=levels(colcode.reduced)) ctable=table(color1) ctable #dist1=1- abs( corhelp[rest1,rest1])^100 #MDS, pam for estimating # of clusters #cmd1=cmdscale( as.dist(dist1), k=2) #plot(cmd1,col=as.character(colcode.reduced)) #openImgDev(imgMDSplot) #plot(cmd1,col=as.character(colcode.reduced)) #dev.off() #cmd1=cmdscale( as.dist(dist1), k=3) #par(mfrow=c(2,2)) #plot(cmd1[,c(1,2)],col=color1) #plot(cmd1[,c(1,3)],col=color1) #plot(cmd1[,c(2,3)],col=color1) #pam1= pam( dist(cmd1), k=4,diss=T) #table(colcode.reduced,pam1$clustering) #plot(cmd1,type="n") #text(cmd1, col=as.character(colcode.reduced),label=pam1$clustering ) # -------->>---------- output log file ----------- hiercutinfor = paste("Tree Hieght Cutoff=", as.character(myheightcutoff), ", min module size=", as.character(myminModuleSize), ", deepSplit=", as.character(mydeepSplit), sep="") appendStringToFile(logFname, hiercutinfor) appendTableToFile(logFname, ctable) #*------------------------------------------------------------------------------------- #* STEP 5.0: EASE ANALYSIS #* #* #* #* #*------------------------------------------------------------------------------------- easeFile <- file(EASE_Fname, "w+") close(easeFile) tabPval=fisherPvector(color1, genesInfor$BiologicalProcess, outformat="pvalue") tabProp=fisherPvector(color1, genesInfor$BiologicalProcess, outformat="ratio") tabBoth=fisherPvector(color1, genesInfor$BiologicalProcess, outformat="both") bool.signif = (apply(tabPval,1,min)<0.01) & (apply(tabProp,1,max)>1) #tab2=tabPval[bool.signif, ] tab2=tabBoth[bool.signif, ] tab2 appendTableToFile(EASE_Fname, tab2) tabPval=fisherPvector(color1, genesInfor$MolecularFunction, outformat="pvalue") tabProp=fisherPvector(color1, genesInfor$MolecularFunction, outformat="ratio") tabBoth=fisherPvector(color1, genesInfor$MolecularFunction, outformat="both") bool.signif = (apply(tabPval,1,min)<0.01) & (apply(tabProp,1,max)>1) #tab2=tabPval[bool.signif, ] tab2=tabBoth[bool.signif, ] tab2 appendTableToFile(EASE_Fname, tab2) tabPval=fisherPvector(color1, genesInfor$CellularComponent, outformat="pvalue") tabProp=fisherPvector(color1, genesInfor$CellularComponent, outformat="ratio") tabBoth=fisherPvector(color1, genesInfor$CellularComponent, outformat="both") bool.signif = (apply(tabPval,1,min)<0.01) & (apply(tabProp,1,max)>1) #tab2=tabPval[bool.signif, ] tab2=tabBoth[bool.signif, ] tab2 appendTableToFile(EASE_Fname, tab2) #*------------------------------------------------------------------------------------- #* STEP 5.1: within module significance #* #* #* #* #*------------------------------------------------------------------------------------- significance = genesInfor$essentiality cc.in = computeModuleCC(dichotCor, color1) k.in = computeModuleLinks(dichotCor, color1) k.out = nolinks - k.in k.diff= k.in - k.out k.all = nolinks #sum of topological overlaps within modules to.in = rep(0, no.genes) to.Irest = computeModuleLinks(dist1, colcode.reduced) to.in[rest1] = no.genes.reduced - to.Irest #sum of topological overlaps to.all = rep(0, no.genes) to.allR = apply(dist1, 2, sum, na.rm=TRUE) to.all[rest1] = no.genes.reduced - to.allR # The following data frame contains all network properties: if(cc.done){ cc.all= cluster.coef datNetwork=data.frame(to.in, to.all, k.all, k.in, k.out, k.diff, cc.in, cc.all) }else { datNetwork=data.frame(to.in, to.all, k.all, k.in, k.out, k.diff, cc.in) } corrFile <- file(K_Corr_Fname, "w+") close(corrFile) #find genes sitting in modules usefulclustergenes = findClustersSignificant(significance, modulecolor=color1) corrTab1 = by( data.frame(significance, datNetwork), color1, cor1) appendMultiTablesToFile(K_Corr_Fname,corrTab1) #corrTab2 = by( data.frame(significance, datNetwork), color1, krusktest) corrTab2 = by( data.frame(significance, datNetwork)[usefulclustergenes, ], color1[usefulclustergenes], krusktest) appendMultiTablesToFile(K_Corr_Fname,corrTab2) openImgDev(imgCoeffKInmodule) par(mfrow=c(1,1)) plot(k.in, cc.in, col=as.character(color1), xlab="no of connections", ylab="cluster coefficient") dev.off() plot(k.in, cc.in, col=as.character(color1), xlab="no of connections", ylab="cluster coefficient") #plot(to.in, to.in, col=as.character(color1), xlab="no of connections", ylab="cluster coefficient") #plot relationships of various measures across/within modules VariableRelationshipPlot(data.frame(significance, datNetwork), fname=imgNetVars, colorcode=color1) #*------------------------------------------------------------------------------------- #* STEP 5.2: detect module significance #* #* #* #* #*------------------------------------------------------------------------------------- dispcolors=names(ctable) meansG=as.vector(tapply(significance, color1, mean)) seG =as.vector(tapply(significance,color1,stderror)) pval1=signif(kruskal.test(significance,color1 )$p.value,2) boxplot(split(significance,color1),col=dispcolors,notch=T,varwidth=T,ylab="essentiality", main=paste("p-value=",pval1)) #legend(0, 8, dispcolors, fill = dispcolors) boxinfor = paste("pvalue of moduel-gene-significance box plot =", as.character(pval1), sep="") appendStringToFile(logFname, boxinfor) openImgDev(imgModulSig) #boxplot(split(significance,color1),col=dispcolors,notch=T,varwidth=T,ylab="essentiality",main=paste("p-value=",pval1) ) barplot(meansG, names.arg=dispcolors,col= dispcolors, ylab="proportion essentiality",main=paste("p-value=",pval1) ) err.bp(as.vector(meansG), as.vector(seG), two.side=T) dev.off() #*------------ cluster coefficient Significance in modules -------------------------------- if(cc.done){ openImgDev(imgModulCC) pval2=signif(kruskal.test(cluster.coef,color1 )$p.value,2) boxplot(split(cluster.coef,color1),col=dispcolors,notch=T,varwidth=T,ylab="cluster coefficient", main=paste("p-value=",pval2)) dev.off() } #within module test #if(1==2){ # restG=color1=="turquoise" # pval2=cor.test(nolinks[restG], significance[restG] )$p.value # plot(nolinks[restG], significance[restG], # xlab="connectivity k",ylab="essentiality",main=paste("correlation p-value=",signif(pval2,2))) # pval2=cor.test(cluster.coef[restG], significance[restG] )$p.value # plot(cluster.coef[restG], significance)[restG], # xlab="cluster coefficient",ylab="essentiality",main=paste("correlation p-value=",signif(pval2,2))) #} #*------------------------------------------------------------------------------------- #* STEP 5.3: heatmap & dendrogram & expression bands .... #* #* #* #* #*------------------------------------------------------------------------------------- # this is required when dealing with more than 1000 genes labeltree = as.character(colcode.reduced) labelcol = labeltree labelrow = labeltree labelrow[ h1row$order[length(labeltree):1] ]=labeltree[h1row$order] # Here we create a scatterplot of cluster.coef vs nolinks # for all genes # -------->>---------- output image ----------- if (cc.done) { openImgDev(imgCoeffK) par(mfrow=c(1,1)) plot(nolinks,cluster.coef,col=as.character(color1), xlab="no of connections", ylab="cluster coefficient") dev.off() plot(nolinks,cluster.coef,col=as.character(color1), xlab="no of connections", ylab="cluster coefficient") } #Now we create color plots for each module #For each module there should be a clear band structure, #i.e. genes #corresponding to a given array (column) should have the same color. # -------->>---------- output Expression Bands image ----------->>---------- openImgDev(imgExprBands) iExprMatrix = t(datExpr[, rest1]) iExprMatrix = iExprMatrix[h1row$order,] heatmap(as.matrix( iExprMatrix ), Rowv=NA, RowSideColors=as.character(labelcol[h1row$order]), Colv=NA,scale="row", revC=F, xlab="", col = rgcolors.func(50)) dev.off() #write out the modules sd2=apply(datExpr,2, sd, na.rm=T) if (cc.done){ outMatrix=cbind(allMatrix, k.all, k.in, k.out, k.diff, cc.all, cc.in, to.all, to.in, sd2, color1) names(outMatrix) <- c(names(allMatrix), "k.all", "k.in", "k.out","k.diff", "cc.all", "cc.in", "to.all", "to.in", "sd", "module") } if (!cc.done){ outMatrix=cbind(allMatrix, k.all, k.in, k.out, k.diff, cc.in, to.all, to.in, sd2, color1) names(outMatrix) <- c(names(allMatrix), "k.all", "k.in", "k.out","k.diff", "cc.in", "to.all", "to.in", "sd", "module") } write.table(outMatrix, moduleFname, sep="\t",quote=FALSE, col.names=TRUE, row.names=FALSE) options(expressions = 10000) # -------->>---------- output image of dist heatmap ----------->>-------- #TEMPORARILY disabled to save time... #if(1==2){ openImgDev(imgHeatMap) par(mfrow=c(1,1)) #heatmap(as.matrix(dist1),Rowv=as.dendrogram(h1row),Colv="Rowv", scale="none",revC=T,ColSideColors=as.character(labelcol),RowSideColors=as.character(labelrow)) #we need reverse the distmatrix and row labels revDist1=apply(dist1[h1row$order,h1row$order], 2, rev) heatmap(as.matrix(revDist1^4),Rowv=NA,Colv=NA, scale="none",revC=F,ColSideColors=as.character(labelcol[h1row$order]),RowSideColors=as.character(rev(labelcol[h1row$order])) ) dev.off() #heatmap( 1-sqrt(1-as.matrix(revDist1)),Rowv=NA,Colv=NA, scale="none",revC=F,ColSideColors=as.character(labelcol[h1row$order]),RowSideColors=as.character(rev(labelcol[h1row$order])) ) #this loop produces the pajek output Wahoo! #rm(corhelp) #rm(allMatrix) #collect_garbage() colorpajek = pajekColorcode(as.character(color1)) restP = (as.character(color1) != "grey") restPr= (as.character(colcode.reduced) != "grey") gnames = as.character(genesInfor[restP, 1]) #This is correct. Proper usage, is to use a cutoff that will PROBABLY be very small (cutoff = 0 for example) #Then apply a nice big power function so that the network looks ok (values are spread out enought to be visible) #This is true EVEN THOUGH, the dichotCor has already been transformed with the power function. #values for the dichotCor witll range from 0 to 1. # all values will be positive # to reproduce the old cutoffs you can calculate what the (say .8 for this example) cutoffs used to be by doing 0.8^power_cutoff_val where #power_cutoff_val = the beta value, in the scale-free-ness calculation. So .8^power_cutoff_val would = be the transformed new cutoff for .8 old cutoff. pajekMatrix = cbind(gnames, round(cc.in[restP],3), colorpajek[restP], round(dichotCor.reduced[restPr,restPr],3)) colnames(pajekMatrix) <- c("pajek", "cc.in", "colorCode", gnames) #this might be a problem. (may need the other table export functions) write.table(pajekMatrix, pajekFname, sep="\t",quote=FALSE, col.names=TRUE, row.names=FALSE) #} # -------->>------------------>>------------------>>------------------>>------------------>>------------------>>-------------- # -------->>------------------>>------------------>>------------------>>------------------>>------------------>>-------------- # -------->>---------- output plots of module based property vs the essentiality/conservation as images ----------- # -------->>------------------>>------------------>>------------------>>------------------>>------------------>>-------------- # -------->>------------------>>------------------>>------------------>>------------------>>------------------>>-------------- modulecolors = names(ctable) #keyproperties = c("k.all", "k.in", # "cc.in", "cc.all", # "to.all","to.in") propertynames = colnames(datNetwork) no.properties = dim(datNetwork)[2] #make a file to append all the stat results to testsFname <- file(tests_Fname, "w+") close(testsFname) #this assumes that you want to append to tests_Fname and that you want 20 points unless you tell it otherwise #plotTester(essentiality, k.all, plotTitle, imgKvsEss_ALL) for (modulep in modulecolors) { #modulep = "turquoise" if(modulep=="grey"){ next } color.module = color1==as.character(modulep) #define no.nas for within-module genes no.nas.essential = ! is.na(essentiality[color.module]) essent.nonas = (essentiality[color.module])[no.nas.essential] no.nas.meaneval = ! is.na(mean_e_val[color.module]) logmeaneval.nonas = conservationVectorCleaner(mean_e_val[color.module], no.nas.meaneval) for (ip in c(1:no.properties) ){ #ip=1 property.name = propertynames[ip] filedname=replaceChars(property.name, ".", "_") property.module = (datNetwork[, ip])[color.module] property.module2essential = property.module[no.nas.essential] property.module2meanevl = property.module[no.nas.meaneval] plotTitle = paste(modulep, ": ", sep="") #---------------------------------------------------------------------------------- # Plot: property vs Essentiality in each module #---------------------------------------------------------------------------------- myimgname = paste(outputDir, fname, "_", modulep, "_essentVS", property.name, ".png", sep="") #myimgname = paste(outputDir, fname, "_", modulep, "_essentVS", property.name, ".ps", sep="") #this assumes that you want to append to tests_Fname and that you want 20 points unless you tell it otherwise scatterCorrelTester(essent.nonas, property.module2essential, binaryvar=TRUE, PlotsTitle=plotTitle, myxlab=property.name, myylab="essentiality", outputImage=myimgname, outputFile=testsFname) #---------------------------------------------------------------------------------- # Plot: property vs conservation in each module #---------------------------------------------------------------------------------- myimgname = paste(outputDir, fname, "_", modulep, "_conservVS", property.name, ".png", sep="") #myimgname = paste(outputDir, fname, "_", modulep, "_conservVS", property.name, ".ps", sep="") #this assumes that you want to append to tests_Fname and that you want 20 points unless you tell it otherwise scatterCorrelTester(logmeaneval.nonas, property.module2meanevl, binaryvar=FALSE, PlotsTitle=plotTitle, myxlab=property.name, myylab="conservation", outputImage=myimgname, outputFile=testsFname) } } #*------------------- "Conservation Significance" in modules -------------------------------- # ie. how does the average conservation look for each module? # For this we want to calculate the values again (for the entire thing) and filter # the way we used to in the older code so that I can get just split based on the color and present # a simple average of the evals for each module with error bars all.no.nas.meaneval = ! is.na(mean_e_val) all.logmeaneval.nonas = conservationVectorCleaner(mean_e_val, all.no.nas.meaneval) meansE = as.vector(tapply(-1*all.logmeaneval.nonas, color1[all.no.nas.meaneval], mean)) seE = as.vector(tapply(-1*all.logmeaneval.nonas, color1[all.no.nas.meaneval],stderror)) filtColors = names(table(color1[all.no.nas.meaneval])) openImgDev(imgModConsSigBox) #boxplot(split(significance,color1),col=dispcolors,notch=T,varwidth=T,ylab="Conservation" ) boxplot(split(all.logmeaneval.nonas,color1[all.no.nas.meaneval]),col=names(table(color1[all.no.nas.meaneval])),notch=T,varwidth=T,ylab="Conservation" ) #err.bp(as.vector(meansE), as.vector(seE), two.side=T) dev.off() openImgDev(imgModConsSigBar) #boxplot(split(significance,color1),col=dispcolors,notch=T,varwidth=T,ylab="Conservation" ) barplot(meansE, col=filtColors, ylab="Conservation" ) err.bp(as.vector(meansE), as.vector(seE), two.side=T) dev.off() #*------------------------Now lets just replot the image of the Hierclust so that we can preserve it---------------------- #this hierClust plot is for publication par(mfrow=c(2,1), mar=c(0,0,0,0) ) plot(h1row, labels=F, xlab="",ylab="",main="",sub="",axes = F) barplot(height=rep(1, length(colcode.reduced)), col= as.character(colcode.reduced[h1row$order]), space=0, border=F,main="", axes = F, axisnames = F) par(mfrow=c(1,1), mar=c(5, 4, 4, 2) + 0.1) #this hierClust plot is the same as above EXCEPT that it also has color names (can be trimmed for publication) fdispcolor = getDisplayColorSequence(as.character(colcode.reduced[h1row$order])) par(mfrow=c(2,1), mar=c(6, 0, 0, 0), cex.lab=0.5) plot(h1row, labels=F, xlab="",ylab="",main="",sub="",axes = T) barplot(height=rep(1, length(colcode.reduced)), names.arg=fdispcolor, col= as.character(colcode.reduced[h1row$order]), space=0, las=2, border=F,main="", axes = T, axisnames = T, cex.lab=0.5) par(mfrow=c(1,1), mar=c(5, 4, 4, 2) + 0.1) ## ------------------------------------- END ------------------------------------------------