# neo.txt: Network Edge Orienting. Learn a Bayesian network # for gene expression traits and clinical phenotype trait # data, using SNPs as causal anchors and compute # structural confidence scores for each directed # edge between traits. # # Uses SNPs or genetic variation to orient edges in # an (mRNA trait,clinical trait) correlation network. # # Methods implemented (but not necessary active by default) # include ZEO (Z-score Edge Orienting), LEO (Local-structure # Edge Orienting), RVV (Recursive V-structures with Verification), # and refinements of Cohen et al's FTC filtering approach for # learning SEMs. # # As currently configured, NEO is set up to automatically # identify SNPs (columns starting with 'SNP', snd a single # Trait column in the data x from the column names, and # process these data accordingly; searching for a causal # directed but not-necessarily-acyclic graph within the # SNP-gene-trait variation data. # # Copyright (c) 2007, 2008, Jason E. Aten # Intial implementation: 14 January 2007 # # Licensing: # Licensed non-exclusively under the GNU GPL version >= 3. # http://www.gnu.org/licenses/gpl-3.0.txt # # This file is provided AS IS with NO WARRANTY OF ANY KIND, # INCLUDING THE WARRANTY OF DESIGN, MERCHANTABILITY AND # FITNESS FOR A PARTICULAR PURPOSE. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY # KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR # PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS # OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT # OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # # # neover(): Automatically updated CVS source code control information; # the R user can call neover() in R to get the version of the # NEO code base they are running. # # CVS Version = Id: neo.txt,v 1.339 2008/02/12 08:26:41 jaten Exp # Became Subversion = Id: neo.txt 23 2008-02-12 08:38:20Z jaten # # Currently: $Id: neo.txt 78 2008-03-14 00:48:54Z jaten $ if(exists("neover") ) rm(neover); neover=function(quiet=FALSE) {a=paste(sep="","neo version: ","$Id: neo.txt 78 2008-03-14 00:48:54Z jaten $"); if (!quiet) { cat(a); cat("\n") }; invisible(a)} # # # neo(): Network Edge Orienting, top level wrapper function # # Author: Jason E. Aten # Jan-June 2007 # # Arguments: # # datCombined = combined data frame containing SNPs and traits. SNPs # are also called exogenous variables in Structural # Equation Modeling. This means they have no parents in the graph. # # In contrast to SNPs, traits (either clinical traits, mRNA # levels, cluster centers, or Princple components from a PCA # or cluster analysis) are endogenous variables and so are # expected to have parents in the graph, namely one or more # of the SNPs. # # Every column of datCombined that is specified in snpcols # (or has a name that begins with "SNP" or "snp"), # is considered an exogenous SNP column. For instance, it # may be useful to specify sex (male/female) as an # exogenous variable in snpcols to see when sex is # playing an important role in gene expression causality. # # Every other column (not in snpcols) is considered an # endogenous trait column. # # Typically there are two kinds of traits used in this analysis: # the mRNA or gene expression traits, and the clinical traits. # # The clinical traits are typically the focus of the # analysis, and we usually want to find if genes (mRNA traits) # are causal or reactive to the (clinical) traits. Typically # a single trait at a time would be considered (set traitcols= to # just the single column containing the clinical trait of interest), # but multiple trait analysis is allowed and performed if more # that one traitcols is specified. # # The only distinction that setting traitcols offers over # all the given non-snp columns, is that NEO will always try # to oreint edges into these special variables, even if they are # in a supernode and so edge orienting scores show # higher variation and deserve a more cautious interpretation. # # snpcols = columns of datCombined that contain SNPs. The default is to # call noe.guess.snp(datCombined) which guesses that the snpcols # are those that start with "SNP" (upper or lowercase) # # traitcols = Traits are special in that we always attempt to orient # edges impinging on a trait, even if the trait is in a # supernode with other variables it is highly correlated with. # # skip.LEO = if TRUE, we don't even bother to do the LEO computations at all. # Default is false. The LEO methods are much more sensitive, but # are slower. For a quick look at the ZEO results, set # skip.LEO=TRUE. # # quiet = if TRUE, no log output is displayed on the R console. A # logfile is still generated, given a current timestamp and # pm$run.title, and written to disk. If quiet is FALSE, then # everything to the screen is also sink()-ed to a log file # whose name ends with '.txt' # # pm = a vector of parameters that control the computation. To # modify these, it is probably easiest to get some good # defaults by calling neo.get.param(), which is also the default # action, and then modify just the members of the named list # that you wish to change. See the neo.get.param() function # just below neo() for documentation of the parameters. # # use.ranks = if TRUE, replace the data in the datCombined frame by their # ranks before doing any other processing, so as to get a # non-parametric computation (e.g. Spearman correlations and # partials are then automatically computed rather than Pearson # correlations and partials). FALSE by default. # # repos.nodes = if the final graph summary picture is drawn and repos.nodes # is TRUE, then the program will pause to allow the user to # click on the picture to rearrange the nodes in the drawn graph. # The node nearest to the click is moved to the point of the # click. # # pm$ignorable.edge.list = a list of edge pairs (the indices of columns of # datCombined) that will be ignored. Useful for speeding up # processing when you only care about relationships between # classes of traits, such as between genes and clinical traits, # rather than relationships between the clinical traits # themselves. # # pm$run.title = string giving name of experiment, appended to log directory # name (should have no spaces) to help track different experiments. # # pm$no.log = defaults to FALSE. If TRUE, don't produce the (somewhat # extensive) logging of results. Useful for doing lots of # simulation calls to neo() where you just what the programatic # output in the returned value. # if(exists("neo") ) rm(neo); neo=function(datCombined, snpcols=neo.guess.snps(datCombined,pm), traitcols=neo.guess.traits(datCombined,pm), pm=neo.get.param(),quiet=FALSE, repos.nodes=FALSE, skip.LEO=FALSE, use.ranks=FALSE) { # we crash if starting directory is too long. if (nchar(getwd()) > 150) { stop("Must run NEO from a directory whose path is short: nchar(getwd()) < 150. Otherwise the file system calls can fail when logging. Use Sysinternals' 'junction' if necessary to creation symlinks."); } pm.print(pm,paste("All starting...",date(), "with version",neover())) sanity.column.names.check = sapply(colnames(datCombined),starts.with.zero.thru.nine) if(any(sanity.column.names.check)) { stop(paste("column names of datCombined cannot start with 0-9 as digits. Aborting. See column", colnames(datCombined)[sanity.column.names.check])) } # screen out factors in snpcols, because we want alleles numeric and ordered for F2 crosses # and factors as default read in may not be ordered for (i in snpcols) { if (is.factor(datCombined[,i])) stop("The specified snpcols must be numeric rather than factors so that correlations are computed correctly.") } # sanity check the supplied data frame -- it should not have any NA or non-varying columns check.rather.than.impute=TRUE if (pm$rough.and.ready.NA.imputation) { check.rather.than.impute=FALSE; } # get names of any forced SNP assignments before anything changes as far as datCombined # columns goes. if (!is.null(pm$forced.MA.colnum)) { pm$forced.MA.cn = colnames(datCombined)[pm$forced.MA.colnum] # cn stands for names } else { pm$forced.MA.cn = NULL } if (!is.null(pm$forced.MB.colnum)) { pm$forced.MB.cn = colnames(datCombined)[pm$forced.MB.colnum] } else { pm$forced.MB.cn = NULL } datCombined=check.or.impute.missing.data(datCombined=datCombined, snpcols=snpcols,pm=pm,check=check.rather.than.impute) # co.lin=check.multi.collinearity(datCombined,pm=pm) # if (co.lin$has.redundant) { # stop(co.lin$msg) # } if (skip.LEO) pm$use.LEO = FALSE; # force use of just ZEO if we aren't # computing LEO if (use.ranks) { for (j in 1:ncol(datCombined)) { datCombined[,j]=rank(datCombined[,j]) } } # setup for graphing at the end. zGRAPH=list() # need to define: zGRAPH$numsnps, zGRAPH$nc, zGRAPH$num.non.snps # defaults: zGRAPH$numsnps = length(snpcols) zGRAPH$num.non.snps = length(setdiff(1:ncol(datCombined),snpcols)) zGRAPH$nc = zGRAPH$numsnps + zGRAPH$num.non.snps # set defaults for permutation block, if needed. if (is.null(pm$block.to.permute) | length(pm$block.to.permute) ==0) { pm$block.to.permute=snpcols } if (any(!(pm$block.to.permute %in% (1:ncol(datCombined))))) { stop("Bad specification of pm$block.to.permute: out of range") } # preserve names in case columns get re-arranged. pm$block.to.permute.cn = colnames(datCombined)[pm$block.to.permute] if (!is.null(pm$A) & !is.null(pm$B)) { pm$just.AB = TRUE # we have A and B and so we need only consider these columns in # addition to the snpcols if (length(intersect(pm$A,pm$B)) > 0) { stop("pm$A and pm$B sets must be disjoint. Halting.") } if (any(!(pm$A %in% 1:ncol(datCombined)))) stop ("pm$A values out of bounds.") if (any(!(pm$B %in% 1:ncol(datCombined)))) stop ("pm$B values out of bounds.") if (any((pm$A %in% snpcols))) stop ("pm$A values out of bounds: cannot be in snpcols.") if (any((pm$B %in% snpcols))) stop ("pm$B values out of bounds: cannot be in snpcols.") pm$A.cn = colnames(datCombined)[pm$A] # so we can re-match them easily # later, even after shrinking the matrix after SNP selection. pm$B.cn = colnames(datCombined)[pm$B] keepers=sort(union(c(snpcols,traitcols,pm$A),pm$B)) snpcols=match(snpcols,keepers) # if rearranged, get back proper snpcols traitcols = match(union(pm$A,pm$B),keepers) pm$A =match(pm$A, keepers) # ditto pm$B =match(pm$B, keepers) # ditto datCombined=datCombined[,keepers] # adjust graph drawing stuff. zGRAPH$numsnps = length(snpcols) zGRAPH$num.non.snps = length(traitcols) zGRAPH$nc = length(keepers) # enumerate the edges to be checked. Save in pm$enumerated.A.cn, pm$enumerated.B.cn pm=get.enumab.cn.set(pm) } else { # pm$A/B not called for, but still use the pm$enumerated.A.cn, pm$enumerated.B.cn # lists to organize execution into batch execution. # Can't use get.enumab.cn.set when A and B lists overlap, so do it manually: cn=colnames(datCombined) non.snp.cols=setdiff(1:ncol(datCombined),snpcols) lenT = length(non.snp.cols) n.rows = lenT * (lenT-1) /2 pm$enumerated.A.cn = rep(NA,n.rows) # initialize pm$enumerated.B.cn = rep(NA,n.rows) k=1 for (i in 1:(lenT-1)) { for (j in (i+1):lenT) { pm$enumerated.A.cn[k] = cn[non.snp.cols[i]] pm$enumerated.B.cn[k] = cn[non.snp.cols[j]] k=k+1 } } } if (quiet) pm$quiet=quiet; if (pm$no.log) { neo.log.file="no.neo.log" } else { # setup logging - put all log files in a new, timestamped directory neo.log.file = neo.mkdir.logfile(pm$run.title); pm$neo.log.file=neo.log.file pm$orig.neo.log.file=neo.log.file sink(file = paste(sep="",neo.log.file,".txt"), append=TRUE, type="output",split = !pm$quiet); on.exit(while (sink.number()) sink()); # turn off log in case of crash, and upon normal exit. pm.print(pm,paste("Logfile:",neo.log.file)); } # do we open excel at the end? pm$open.excel.at.end = FALSE # default if (!pm$no.log & !pm$quiet) pm$open.excel.at.end = TRUE pm$open.excel.now = FALSE # set to true after last permutation # setup control structure--parameters in pm, for the run if(is.null(pm)) { pm = neo.get.param(logpath=neo.log.file); } # not necessary, these are the defaults # pm$use.leo=TRUE; # LEO if true, ZEO if false. # pm$eo.type="forward.step.selection"; # additional options: "all.snp.permutations.equal.votes","max.vs.max" #necessary pm$no.obs.Z = nrow(datCombined); # If you want, you can over-ride no.obs for Z-score computations here/and below. # INVAR: we have pm, set true no.obs rows pm$no.obs = nrow(datCombined); if (nrow(datCombined) < 5) { stop("datCombined must have 5 or more rows (individuals) for neo() to work.") } # before any columns get shuffled, translate the pm$only.score.edges.into into column names # so we can re-match them later during filtering. if (!is.null(pm$only.score.edges.into)) { pm$only.score.edges.into = colnames(datCombined)[pm$only.score.edges.into] } # auto-farm out permutations to SGE if requested if (pm$run.perms.on.sge & pm$number.BLOCK.permutations > 0) { # save the current environment # after changing pm$run.perms.on.sge to avoid infinite loop pm$run.perms.on.sge = FALSE # and give sge jobs their own directory, inside the main log directory pm$sge.run.dir = create.logpath.enclosed.sge.directory(pm) pm.print(pm,paste("Created directory for Sun Grid Engine job output: ",pm$sge.run.dir)) logfile.to.restore = pm$neo.log.file pm$neo.log.file = "sge_neo." # temporarily change directory into sge run directory cwd=getwd() setwd(pm$sge.run.dir) save(list=ls(), file="master.sge.image.rdat") # now that sge jobs have their own logfile directory, get back ours! pm$neo.log.file = logfile.to.restore # function neo.sge.perm loads the saved file "master.sge.image.rdat" and runs one permutation (could be more). #n.perm.job=ceiling(pm$number.BLOCK.permutations/10) n.perm.job=pm$number.BLOCK.permutations #job.list=rep("neo.sge.perm",n.perm.job) job.list=rep("neo.sge.perm",n.perm.job) pm.print(pm,paste("Submitting ",n.perm.job," permutation jobs to Sun Grid Engine.")) submit.to.sge(job.list=job.list, neo.install.path=pm$sge.needs.neo.install.path, r.program.install.path=pm$sge.needs.r.program.install.path) setwd(cwd) # restore directory, now that jobs are submitted. pm$number.BLOCK.permutations = 0 } pm$doing.permutation = FALSE; # first time through # Implement permutations loop # handle permutations in a loop pm$perm.count = 1; while(1) { # permutation loop will end with a break; # at end of this while loop, put: if (pm$perm.count > pm$number.BLOCK.permutations) { break; # we are done} else { # permute data and repeat} # skip initial data analysis on unpermuted data if only doing a permutation if (pm$perm.count == 1 & pm$just.do.permutation.no.unpermuted) { # then skip the first unpermuted data run, and go straight down to permuting the data } else { # begin skip.first.time.test # Implement batch processing for all processing... pm$neo.batch.run = TRUE pm$first.batch.pass = TRUE pm$last.batch.pass = FALSE # IMPORTANT: Inside the batch loop, we modify pm$A and pm$B, as well # as pm$A.cn and pm$B.cn, but we don't change # the fully enumerated lists, pm$enumerated.A.cn and pm$enumerated.B.cn batches = length(pm$enumerated.A.cn) for (batch.num in 1:batches) { k=batch.num pm.print(pm,paste(sep="","*** Edge ",batch.num," of ",batches, ", running: ",pm$enumerated.A.cn[k]," - ",pm$enumerated.B.cn[k]," ...",date())); if (batch.num > 1) pm$first.batch.pass = FALSE if (batch.num == batches) pm$last.batch.pass = TRUE traitA.cn = pm$enumerated.A.cn[k] traitB.cn = pm$enumerated.B.cn[k] traitA = match(traitA.cn, colnames(datCombined)) traitB = match(traitB.cn, colnames(datCombined)) dc = datCombined[,c(snpcols,traitA,traitB)] my.snpcols = 1:length(snpcols) pm$A = trA.loc = ncol(dc)-1 pm$B = trB.loc = ncol(dc) pm$A.cn = colnames(dc)[pm$A] # keep consistency pm$B.cn = colnames(dc)[pm$B] # keep consistency # main four calls # # 1) initial SNP primary is not the final SNP election # # modified for batch, was: r1 = neo.snp.primary(datCombined, snpcols=snpcols, traitcols=traitcols, r1 = neo.snp.primary(datCombined=dc, snpcols=my.snpcols, traitcols=c(trA.loc,trB.loc), pm=pm, skip.snp.select=pm$skip.snp.select, neo.log.file=neo.log.file); if (!pm$no.log && pm$save.intermediate.files) save(r1,file=paste(sep="",neo.log.file,".batch_",batch.num,".r1.rdat")); # if requested by setting pm$number.BLOCK.permutations > 0, we evaluate # the choice of SNPs by shuffling (generating a permutation of) the # assignment of SNPs to cases (individuals, mice), and then # re-selecting SNPs. # # For each SNP chosen, we can then assign a permtation frequency: how often # the k-th ranked SNP exceeded the value observed in the k-th ranked SNP in # the actual data. # # 2) variables go through two more processes...supernoding and filtering # r2 = neo.create.super.nodes(r1); if (!pm$no.log && pm$save.intermediate.files) save(r2,file=paste(sep="",neo.log.file,"r2.rdat")); # # 3) the final "election" happens after applying various filters # r3 = neo.edge.filters(r2,ignorable.edge.list); if (!pm$no.log && pm$save.intermediate.files) save(r3,file=paste(sep="",neo.log.file,"r3.rdat")); # # 4) edge orient. Walk two steps down from each SNP, and integrate # SNPs signals by the various edge orienting strategies. # r4 = neo.edge.orient(r3, fit.full.model=FALSE,skip.LEO=skip.LEO) if (!pm$no.log && pm$save.intermediate.files) save(r4,file=paste(sep="",neo.log.file,"r4.rdat")); # Excel sheet should be opened automatically # at end of summary, if not, set pm$excel.path correctly. z = summary(r4); } # end batch.num loop } # end skip.first.time.test, implemented in case we are only doing a permutation... # process permutations, if any. if (pm$perm.count > pm$number.BLOCK.permutations) { if (pm$number.BLOCK.permutations) {print(paste("done with all ",pm$number.BLOCK.permutations," permutations, at ",date()))} break; # we are done (thus the default of perm.count=1 when pm$number.BLOCK.permutations=0 } else { # permute data and repeat pm$perm.count = pm$perm.count+1 pm$no.log = TRUE pm$quiet = TRUE pm$save.intermediate.files=FALSE pm$doing.permutation = TRUE print(paste("Starting on ",pm$perm.count-1," out of ",pm$number.BLOCK.permutations," permutations, at ",date())) # save permutations to their own file pm$perm.file = paste(sep="",pm$neo.log.file,".perm.csv") # re-set the permute colnums in case of re-arrangement of datCombined pm$block.to.permute = match(pm$block.to.permute.cn,colnames(datCombined)) datCombined=permute.snps.traits(pm, datCombined, pm$block.to.permute) print(paste(".............permutation of SNP:gene data done....starting NEO evalutaion",date())) if (pm$perm.count > pm$number.BLOCK.permutations) { pm$open.excel.now = TRUE } } # conclusion/goto top of while(1) loop. } # end permutation loop # # possible halting point: if just doing permutations, skip the graphing and the messing with the excel file output. # if (pm$just.do.permutation.no.unpermuted) { pm.print(pm,paste("Done with permutation and pm$just.do.permutation.no.unpermuted=TRUE. Halting at ",date())) return(invisible(c())) } # read in permutations and summarize in .csv file for Excel. (Assuming we also computed the observed stats) if (pm$doing.permutation & !pm$just.do.permutation.no.unpermuted) create.permutation.report(pm,colnames(datCombined),snpcols) # read back in the final matrix to get the whole thing # for graph drawing. # only go past here if we have a valid excel file... excel.file=get.excel.file(pm) if(!file.exists(excel.file)) { pm.print(pm,paste(sep="","Could not load excel file: '",excel.file,"'. (Probably no pm$neo.log.file set. Inspect pm$neo.log.file='",pm$neo.log.file,"') Finishing before graphing or (trying to) open spreadsheet.")) return(invisible(z)) } # set a default so we know if we got it by checking !is.null(fullcsv), even if we don't do graph drawing... fullcsv=NULL if (1) { # to comment in/out graph drawing. # don't insit on rownames=1 b/c snp edges may get duplicated fullcsv = read.csv(file=excel.file,header=T,sep=",",quote="\"",dec=".",fill=T,comment.char="") w.have.leo=which(!is.na(fullcsv$LEO.NB.OCA)) zGRAPH$pm = pm non.snp.cols=setdiff(1:ncol(datCombined),snpcols) # not sure why, but this variable not in scope now. So re-define. zGRAPH$non.snp.cols=non.snp.cols zGRAPH$coor = do.graph.placement(snpcols,zGRAPH$numsnps, zGRAPH$nc, zGRAPH$num.non.snps, non.snp.cols) traits=unique(c(pm$enumerated.A.cn,pm$enumerated.B.cn)) # the matrix to graph; get this from the final (accumulated after batches) excel file, now in fullcsv zGRAPH$M = matrix(nrow=zGRAPH$nc, ncol=zGRAPH$nc,data=0); colnames(zGRAPH$M)=rownames(zGRAPH$M) = colnames(datCombined) zGRAPH$final.eo = zGRAPH$M # also make a score matrix, for the labelling of the graph zGRAPH$final.eo[] = NA # start unlablled though. non.snp.cols=setdiff(1:ncol(datCombined),snpcols) rn=as.character(fullcsv[,1]) # rownames now in first column splt=strsplit(rn,split=" -> ",fixed=TRUE) FROM.A=sapply(splt,function(x) x[1]) TO.B =sapply(splt,function(x) x[2]) # create the graph for (i in w.have.leo) { if (fullcsv$LEO.NB.OCA[i] > 0) { zGRAPH$final.eo[FROM.A[i],TO.B[i]] = fullcsv$LEO.NB.OCA[i] zGRAPH$M[FROM.A[i],TO.B[i]] = 1 eval(parse(text=as.character(fullcsv$Final.SNPs.LEO.NB.OCA[i]))) # sets fsnp.cpa,fsnp.oca if (!is.null(fsnp.cpa)) zGRAPH$M[fsnp.cpa,FROM.A[i]]=1 if (!is.null(fsnp.oca)) zGRAPH$M[fsnp.oca,TO.B[i]]=1 } } zGRAPH$my.title=pm$run.title # draw the graph; use adjust=TRUE to click with mouse and # reposition the nearest node. # Skip drawing on unix b/c x11 frequently doesn't have the fonts. # if (!pm$quiet & !is.unix()) { z$final.coor = neo.graph(zGRAPH,adjust=repos.nodes);} } # end comment in/out graph drawing # Since z now just has the stats for last edge evaluated, and some of the earlier # scripts (before we implemented batch processing for scalability) # expect there to be multiple edges evaluated in z$stats, we'll try # to keep back-compatibility by reading in the full set of edges # that were evaluated, and assigning them to z$stats # if (!is.null(fullcsv)) { excel.file=get.excel.file(pm) # try to sort first, if possible. Use check.names=FALSE to prevent ___ -> X___ # excel.file.to.sort = read.csv(file=excel.file,header=T,sep=",",quote="\"",dec=".",fill=T,comment.char="",check.names=FALSE) # old: sorted.df=excel.file.to.sort[order(excel.file.to.sort$LEO.NB.OCA,decreasing=TRUE),] # new: sorted.df=excel.file.to.sort[order(excel.file.to.sort$P.weighted.LEO.NB.OCA,decreasing=TRUE),] colnames(sorted.df)[1]="edge" z$stats = sorted.df write.csv(sorted.df,file=excel.file,na ="") if (pm$open.excel.at.end) { open.excel.file(excel.file,pm) } } # re-save, with the coordinates, and the fullcvs spreadsheet in z$stats if (!pm$no.log && pm$save.intermediate.files) save(z,file=z$post.save.fname); invisible(z) } # end neo() top level wrapper function # PARAMETER CHOICES DOCUMENTATION FOLLOWS! # # neo.get.param(): Parameter defaults for neo functions, neo.get.param() # # give the user sane default parameters and let them avoid having to type all # the defaults out each time # # logpath gets pre-generated and passed in each time neo() calls. # if(exists("neo.get.param") ) rm(neo.get.param); neo.get.param=function(logpath="neo.logfile") { list( graph.arrow.edge.th=.3, # an edge score must surpass this threshold to be graphed # by neo.graph, # (and kept if using ZEO to orient edges) # a pair with correlation below this are considered independent. # This parameter is only used for LCD, RVV, and the max.max special # case handling; not used in OCA and CPA computations. cor.ind.th=0.2, # a pair with correlation above this are considered dependent. # The absolute correlation between a marker and a trait has # to surpass this threshold to be considered at all (including in OCA and CPA). cor.dep.th=0.1, # if the partial correlation falls below this upon conditioning, # the edge is eliminated by a filter. #pcor.th=0.05, # Not relevant for OCA and CPA, and in particular is turned off # by pm$skip.pcor.filters.currently, which is TRUE by default. pcor.th=0, # turn off this filter. # threshold for Cohen's omega filter. We keep small omega edges # (below omega.th), because this implies small indirect influences # between a pair of variables. #omega.th = 0.85, omega.th = 1, # 1 will effectively turn off this pcor threshold filters. k.pcor.th=-0.05, # negative value for k.pcor.th will turn off the # VCdep loops-- which take a long time, and are # only used if no SNPs are availabe; the negative # default makes for faster runs. In general, k.pcor.th # is the point at which: if the partial correlation between a # pair when conditioned upon a single variable k falls below # the k.pcor.th value, then the edge between the pair is filtered out. induced.dep.th = .2, # if checking for conditional dependence # (v-structure location), # --that forms between independent parents of a common child, # we require that the induced correlation (upon conditioning # on the prospective child) reach at least this level. snp.below=.3, # if SNPs are correlated at 1-snp.below or higher, # then they are clustered # together and a single representative from the cluster is chosen to # represent the Super(SNP)node. Only relevant if pm$do.snp.pick.hcluster = TRUE, # but by default currently it is FALSE. cor.above.supernode.def=.7, # if PCs are correlated at this value or # higher, then # they are clumped into a supernode. Variables within # a supernode are not allowed to filter each other's edges. force.cutree.count=25, # when picking SNPs from the hcluster-ing of # all SNPs, cut the # SNP tree into this many branches sets. Within each set (cluster) # the SNP with the highest correlation with any non.snp trait or PC # is chosen as a representative. This is an attempt to get some # orthogonal SNPs into the playing field. top.N.snps.per.trait=4, # During greedy SNP selection, pick this # many top SNPs in terms of # the SNPs that are correlated with each pc or trait. snp.redundancy.cor.level=.9, # retract redundant SNPs at this level # of abs correlation skip.greedy.snps.if.redundant = TRUE, # if TRUE, # don't even bother selecting subsequent greedy snps that don't # differ from previous snps by having a correlation below the # pm$snp.redundancy.cor.level set above. # At or above this correlation level, columns in datCombined # are flagged as too multi-collinear to continue. This check # is implemented in check.multi.collinearity(). # (Must be positive; negative values will have abs() applied first.) datCombined.column.redundancy.reject.cor.level = .95, warn.if.zeo.and.leo.disagree = FALSE, # if TRUE, # then issue a warning as we notice that the ZEO and LEO scores # differ in sign. show.snp.selection.log = FALSE, # if TRUE, # show the details of why the Greedy SNPs were chosen the way they were. skip.snp.retraction=TRUE, # if TRUE, we don't retract redundant SNPs # correlated with # previously chosen SNPs at or above snp.redundancy.cor.level turn.NA.scores.to.zero.and.zero.to.666 =FALSE, # when SNPs aren't # correlated with traits, we ususally just # report NA for the edge scores without enough information to be # scored. However, if this option is TRUE, we report 0 instead. # And formerly zero scores are turned to 666e-8. Implemented per # Steve's request. no.obs=NULL, # The true value of nrow(datCombined), for when it is # needed. no.obs.Z=NULL, # if set, over-rides nrow(datCombined) in the # Z-score calculations # The user should supply data with NA already imputed. # We hacked in some solutions, but probably these are not optimal. # They are only used if this is not FALSE. rough.and.ready.NA.imputation = FALSE, # when fitting multiple marker models in local.sem.four.var.m1m2(), include # covariance terms within the M_A set and within the M_B set? # FALSE is traditional; TRUE may give better or more accurate model fits # because we aren't penalized for some of the markers being correlated # with others in their group. Now confirmed, we default to TRUE now. add.MA.MB.covar.in.local.sem.four.var.m1m2 = TRUE, use.leo=TRUE, # Use the LEO (Local SEM Edge Orienting) method if true, otherwise # use the ZEO to draw the final graph and set the "keep" flag in # the output spreadsheet/dataframe. Note that LEO is still calculated, # even if it won't be used (for comparison). If you wish to circumvent # the LEO computations, use instead the flag skip.LEO=TRUE # when called neo() or neo.edge.orient(). # Edge Orienting method to use (to draw graph from) # # Use the following for eo.type to specify the edge orienting used # in drawing the final graph in neo.graph from z$M. # Options: "all.snp.permutations.equal.votes", "forward.step.selection" or # "max.vs.max" eo.type = "forward.step.selection", # "all.snp.permutations.equal.votes", # default # Where Excel is installed. excel.path="C:\\Program Files\\Microsoft Office\\Office10\\EXCEL.EXE", # parameters for using the fisher transform of the correlation # coefficient to (semi)normality. fisher.dof.cor=3, # 3 usually fisher.dof.pcor1=4, # 4 usually # the fit function sem (from R package sem) crashes less often with # correlation matrices instead of covariance matrices. sem.fit.correlation.instead.of.covariance = TRUE, # Control how deep the forward step-wise regression to select SNPs goes. # This is the number of steps beyond the first greedy step. For instance, # a value of 1 here means that 2 SNPs will be chosen (assuming they are # available) into A, and 2 SNPs chosen into B, for each A-B edge oriented. # A value of 0 is allowed; in which case if do.snp.pick.fstep==TRUE then # the same SNP as the first greedy SNP will be selected. orthog.search.depth = 5, # Over-ride orthog.search.depth with a smaller number if this is not NULL. # Only applied in OCA model building at present (losem.walk.two.steps.weighted.mean.mlreg). # Also, this gets applied BEFORE the final forward stepwise regression, # so this will likely be the strongest marginal set...not necessarily # what you always want, but good for emulating manual selection... # Applied after marker assignment consistency is obtained. max.snps.in.OCA.CPA.sets = 3, #logfile dir/path.prefix neo.log.file=logpath, # may wish (with multiple runs under SGE especially) to re-use existing # log directory....in which case neo.log.file is kept, and is assumed # to be a file path prefix that points into an already existing directory. # Otherwise (if TRUE) we generate such a path prefix by combining pm$run.title # and a time stamp. keep.existing.neo.log.file=FALSE, # for SunGridEngine runs, these need to be specified... sge.needs.neo.install.path="/home/jaten/dev/peculiar/neo.txt", sge.needs.r.program.install.path="/usr/local/bin/R", # if quiet is TRUE, we endeavor not to draw graphs and open excel # spreadsheets and display progress. quiet=FALSE, # number.BLOCK.permutations is > 0, then we re-run the analysis (much # slower) the given number of times, each time shuffling the # relationship between cases and this block of columns (defaults to SNPs--see next parameter). number.BLOCK.permutations = 0, # Specify the block of column numbers to permute. # Typically this would be the snpcols, or the traitcol or some genecol, # depending on which hypothesis of independence you wish to evaluate. # DEFAULT: use snpcols if left to NULL. block.to.permute = NULL, # optional additional blocks to permute (must be non-overlapping # with each other and the first block). block2.to.permute = NULL, block3.to.permute = NULL, # options to permutation runs... # TRUE means ignore the original data and just permute data and run neo. just.do.permutation.no.unpermuted = FALSE, # if a run on a sun grid engine based cluster, we # can parcel out the permutation jobs to different cluster nodes # for faster execution, then collect at the end. run.perms.on.sge = FALSE, # impute.nn() will use as nearby SNPs to impute, only those SNPs that are correlated # at this minimum level or higher impute.nn.minimum.correlation.th = .8, # Do final collider computation at each v-structure inferred in the # network. Or not. do.final.colliders = FALSE, # skip the SNP selection and just use all provided SNPs? skip.snp.select = FALSE, # should we save after each stage for diagnostics/recovery during # a long run? save.intermediate.files = FALSE, # make batch processing go faster #save.intermediate.files = TRUE, # edges to ignore; ex: pm$ignorable.edge.list=list(c(1,2),c(2,3)) # to ignore edges between columns 1 & 2 (and 2 & 1) # and also ignore any edges between columns 2 and 3. ignorable.edge.list = NULL, # if specified, only score edges into this column or columns of # datCombined only.score.edges.into = NULL, minimum.minor.allele.freq = NULL, run.title = "", # name of the run/experiment no.log = FALSE, # whether to log or not # set this to TRUE if you want to mix single marker scores with # double marker model # scores, even though they are on very different scales and not # really comparable. work.hard.at.EO.contingencies.if.not.both.M.B.and.M.A.markers = FALSE, # shall we do Minority Reporting? If so, we compare max,for,all scores # and issue a warning if they differ in sign. do.minority.report.max.for.all = FALSE, # m1m2.average: This is a very, very slow score, the average over all the m1m2 # models, using each of the M.A markers paired with each of the M.B # markers at a time. Typically off (FALSE) by default because it's slow. do.m1m2.average = FALSE, # do.m1m2.average = TRUE, do.leo.max = TRUE, # fit and compare MA->A->B<-MB models for best single marker MA and MB do.leo.all = TRUE, # fit and compare the average of MA->A->B models use.traitcols.guess = FALSE, # Turn back on if you want neo # to use trait names that start with T as clinical traits by defaulat zeo.proxy.for.leo.th = NA, # if NA, compute all LEO scores. If not NA, # use this threshold above which # we compute LEO scores (slow), below which the ZEO (fast) score is # saying the LEO is probably not worth computing. Since ZEO is more # than 100 times faster than LEO, using this proxy yields massive # speedups. A 2.0 threshold is more realistic and conservative, but # we want to not miss subtle signals too. # So we would default to a somewhat low proxy score of 0. do.snp.pick.greedy = TRUE, # if TRUE, SNP selection includes the # greedy SNPs do.snp.pick.fstep = TRUE, # if TRUE, pick SNPs by (Gram-Schmidt) # forward step-wise regression do.snp.pick.hcluster = FALSE, # if TRUE, also cluster the SNPs and # include the single SNP from each cluster with highest trait # correlation A = NULL, # if both A and B are non-NULL, we only check edges between # the A set and B set of column numbers. B = NULL, just.AB = FALSE, # if true (which is useful for big runs using A, B specification) # then we don't try to create the big square matrices M, af, # etc, and in this way we can avoid an out-of-memory crash. # just.AB will be set to true automatically by the program # if A and B are non-null. skip.pcor.filters.currently = TRUE, # turn off the rarely used partial correlation # filters, especially because they will crash # on a big batch multi-marker pm$A, pm$B set run. neo.batch.run = TRUE, # flag set to true when running a large job # whose results won't all stay in memory, and so must be written out to # disk as we go. first.batch.pass = TRUE, # used to track batch processing...do we need to write the headers? # Should be TRUE only the first batch of edges processed. last.batch.pass = FALSE, # set to false on the last pass to signal to open the Excel file. # chromsome.wide.strong.marker.consisteny: # # If TRUE, when enforcing marker consistency (all M.A markers have stronger correlation # with A than with B, and vice-versa for M.B), let the strongest trait-marker # correlation within each chromosome win that entire chromosome for the trait (A or B), # and subsequently don't let the other trait (B, if for instance A wins) have any markers # on that whole chromosome. # # This prevents semi-overlapping peak eQTL from having M.A and M.B both # end up with markers nearby from the same chromosome. This happened in # the BxH.Apoe-null data for females with Insig1 and B4301..Riken on # chr8:100Mb or so. It allows automatic SNP selection to model what # would likely happen under manual SNP curation more closely. # # If TRUE, the marker labels must include the chromsome number after '.chr' as # in gene.mmt23434.chr1.Morgrans0.01, or geneB.mmt23223.chrX.Morgrans0.232. # chromsome.wide.strong.marker.consistency = TRUE, # Forcing SNP choices for M.A or for M.B. Specify the column numbers of the snps within datCombined. # If set, then automatic SNP selection will be ignored. forced.MA.colnum = NULL, forced.MB.colnum = NULL ) } # # install required libraries, if not already... repos=getOption("repos"); repos["CRAN"] = "http://cran.stat.ucla.edu"; options(repos=repos); # to choose a different mirror: # chooseCRANmirror() # to auto-install libraries, if missing, when first sourced. if (!require(GeneTS,quietly=TRUE)) install.packages("GeneTS"); if (!require(ggm,quietly=TRUE)) install.packages("ggm"); if (!require(e1071,quietly=TRUE)) install.packages("e1071"); if (!require(sem,quietly=TRUE)) install.packages("sem"); if (!require(MASS,quietly=TRUE)) install.packages("MASS"); # try to pre-empt Excel opening bug... if (.Platform$OS.type == "windows") { if (!require(rcom,quietly=TRUE)) { install.packages("rcom"); # try to pro-actively prevent Excel-VBA error in R-2.5 and R(D)COM-2.5 library(rcom) comRegisterRegistry() comRegisterServer() } } #if (!require(Matrix,quietly=TRUE)) install.packages("Matrix"); # for sparse Matrix #install.packages(c("GeneTS","ggm","e1071","sem")); # Could also: use ignoreWarnings() to ignore warnings if we already have the packages. #library(GeneCycle) #for pcor.shrink to compute conditioning on all remaining vars #library(GeneNet) #for pcor.shrink to compute conditioning on all remaining vars #library(ggm) # for pcor to compute conditioning on individual variables k #library(e1071) # for impute: to handle NA in the data by filling in the median #library(sem) # for RAM model format 'mod' and Maximum likelihood indices of fit. #library(MASS) # for stepAIC #library(Matrix) library(rpart) # CART models, comes with R. # function to run a bunch of (multimaker/neo) permutations under sun.grid.engine # # some other function should have save datCombined,snpcols,traitcols,pm # with something like: save(list=ls(name=parent.frame()), file="master.sge.image.rdat") # before calling this one. # if(exists("neo.sge.perm") ) rm(neo.sge.perm); neo.sge.perm=function() { # load our workspace load("master.sge.image.rdat") # We do use our own dedicated directory -- but this should be set before qsub-ing. # replace old variable with new value sge.task = get.sge.task.or.zero() # only permutations, no observed stats pm$just.do.permutation.no.unpermuted = TRUE pm$number.BLOCK.permutations = 10 # do 10 permutations per sge task (can be more if desired) # pm$number.BLOCK.permutations = 1 pm$quiet=TRUE pm$no.log=TRUE # just in case we do turn logging on; don't trample the original. if (sge.task > 0) { pm$neo.log.file = paste(sep="",pm$neo.log.file,".sgetask.",sge.task) pm$keep.existing.neo.log.file = TRUE } print(paste("Starting neo inside sge run...pm$number.BLOCK.permutations=",pm$number.BLOCK.permutations," at ",date())) z=neo(datCombined=datCombined,snpcols=snpcols,traitcols=traitcols,pm=pm) invisible(z) } # sma.neo.sge.perm: function to run a bunch of single.marker.analysis permutations under sun.grid.engine # # some other function should have save datCombined,snpcols,traitcols,pm # with something like: save(list=ls(name=parent.frame()), file="permute.sma.master.sge.image.rdat")) # before calling this one. # # This function can't take any arguments. All data is loaded from "permute.sma.master.sge.image.rdat" # if(exists("sma.neo.sge.perm") ) rm(sma.neo.sge.perm); sma.neo.sge.perm=function() { # unique timestamp for produced files. one.date=paste(sep="",gsub(":",".",gsub(" ","_",paste(sep="",date())))) # load our workspace; including pm load("permute.sma.master.sge.image.rdat") # We do use our own dedicated directory, but this should be set before qsub-ing # replace old variable with new value sge.task = get.sge.task.or.zero() # only permutations, no observed stats pm$just.do.permutation.no.unpermuted = TRUE # just in-case, to avoid infinite loop pm$run.perms.on.sge = FALSE # run just one permutation per SGE job pm$number.BLOCK.permutations = 1 num.perm = pm$number.BLOCK.permutations # reuse code below pm$quiet=TRUE pm$no.log=TRUE # just in case we do turn logging on; don't trample the original. if (sge.task > 0) { pm$neo.log.file = paste(sep="",pm$neo.log.file,".sgetask.",sge.task) pm$keep.existing.neo.log.file = TRUE sma.perm.file = paste(sep="","sgetask.",sge.task,".",sma.perm.file) } print(paste("sma.neo.sge.perm(): Starting single.marker.analysis inside sge run...pm$number.BLOCK.permutations=",pm$number.BLOCK.permutations," at ",one.date,"; output to dir: ",getwd()," file: ",sma.perm.file)) # Now start the permutations. # execute SMA for (i in 1:num.perm) { print(paste("single.marker.analysis permutations, doing : ",i," out of ",num.perm," at ",date())) datCombined=permute.snps.traits(pm=pm, datCombined, block.to.permute, save.perm.to.file=FALSE) sma.pre = single.marker.analysis(datCombined=datCombined,snpcols=snpcols,genecols=genecols,traitcols=traitcols,leo.i.th=leo.i.th, leo.o.th=leo.o.th, leo.nb.th=leo.nb.th,pm=pm,use.ranks=use.ranks,build.multi.marker.to.gene.model=build.multi.marker.to.gene.model, impute.na=impute.na) # write out just statistics of interest...actually keep most everything now, adding :ncol sma = sma.pre[,c(1,1,3,5,6,7,8,9,11,12:ncol(sma.pre))] colnames(sma)[1]="model" colnames(sma)[2]="M" colnames(sma)[3]="A" colnames(sma)[4]="B" sma[,2] = as.character(sma[,2]) sma[,3] = as.character(sma[,3]) sma[,4] = as.character(sma[,4]) # create unique row identifier sma[,1]=paste(sep="->",sma[,2],sma[,3],sma[,4]) if (i==1) { write.table(sma, file=sma.perm.file, col.names = NA, sep=",",dec=".",qmethod="double") } else { write.table(sma, file=sma.perm.file, append=TRUE, col.names = FALSE, sep=",",dec=".",qmethod="double") } } # end i loop invisible(c()) } # seed.root.leaf.neo: function to go # over the whole processing of seeding and then 2 or 3 marker (pm$max.snps.in.OCA.CPA.sets # parameter controls this) rooting, # followed by leafing and neo multimarker analysis. # # Basically an automation of the process described in # Methods_Insig1.Supplement.11Sept2007.doc # so that we can also do permutation testing on # this proceedure. # # INVAR: known.downstream and known.upstream are single genes # (might relax this later). # if(exists("seed.root.leaf.neo") ) rm(seed.root.leaf.neo); seed.root.leaf.neo=function(pm, datCombined, snpcols, known.upstream, known.downstream, genecols, require.sma.model.prob.over=.89, require.sma.leo.nb.over=.75, num.seed.snps=2, require.cor.with.known.up.over=.5) { cn=colnames(datCombined) w.up = known.upstream # match(known.upstream,cn) w.dn = known.downstream # match(known.downstream,cn) if (w.up > ncol(datCombined) | any(is.na(w.up))) stop("bad known.upstream specification") if (w.dn > ncol(datCombined) | any(is.na(w.dn))) stop("bad known.downstream specification") # seed: find top loci on different chromosomes that look like # causal drives of w.up -> w.dn # sma.seed = single.marker.analysis(datCombined=datCombined,snpcols=snpcols,genecols=w.up,traitcols=w.dn,pm=pm) ord.seed = sma.seed[order(sma.seed$leo.nb.AtoB ,decreasing=TRUE),] # take the top distinct markers...into uniq.mark markers = as.character(ord.seed[,1]) uniq.mark = unique(markers) # take out the M: annotation in front...so "M:snp" becomes "snp" uniq.mark.cn = sapply(strsplit(uniq.mark,"M:",fixed=TRUE),function(x) x[2]) # find the chromosomes, so we can pick markers on distinct chromosomes... chrm1=sapply(strsplit(uniq.mark,".chr",fixed=TRUE),function(x) x[2]) chrm=sapply(strsplit(chrm1,".",fixed=TRUE),function(x) x[1]) if (all(!is.na(chrm)) & all(chrm != "")) { got.chrom = TRUE; } # get the set of unique chromosomes, in order. uniq.chrm = unique(chrm) chosen.seed.snp = rep(NA,num.seed.snps) chosen.seed.snp[1]=uniq.mark.cn[1] if (num.seed.snps > 1) { for (i in 2:num.seed.snps) { chosen.seed.snp[i]=uniq.mark.cn[min(which(chrm == uniq.chrm[i]))] } } uniq.mark.w = match(chosen.seed.snp ,cn) uniq.mark.cn = cn[uniq.mark.w] if (any(is.na(uniq.mark.w))) { stop(paste("problem locating uniq.mark.cn genes in datCombined, these being:",uniq.mark.cn))} # pick out genes highly correlated with known.upstream to begin with use.genecols = setdiff(genecols, w.up) cx2=cor(datCombined[,w.up],datCombined[,use.genecols],use="p") known.up.cor.genes=colnames(cx2)[which(abs(cx2)[1,]>= require.cor.with.known.up.over)] w.known.up.cor.genes = match(known.up.cor.genes, cn) # now, any good SMA results for these highly correlated genes using the chosen markers... require.mlogp.below = -log10(require.sma.model.prob.over) #sma.res=list() high.leo.and.model.prob=c() for (i in 1:length(uniq.mark.w)) { sma=single.marker.analysis(datCombined=datCombined,snpcols=uniq.mark.w[i],genecols=w.up,traitcols=w.known.up.cor.genes,pm=pm) ord.sma=sma[order(sma$leo.nb.AtoB,decreasing=TRUE),] # sma.res[[i]]=ord.sma high.leo = which(ord.sma$leo.nb.AtoB >= require.sma.leo.nb.over) high.model.prob = which(ord.sma$mlogp.M.AtoB <= require.mlogp.below) both.criteria = intersect(high.leo, high.model.prob) pass.criteria.genes = sapply(strsplit(as.character(ord.sma[both.criteria,5]),"B:",fixed=TRUE),function(x) x[2]) w.pass.criteria.genes = match(pass.criteria.genes, cn) # only have to look good in response to one of the markers. high.leo.and.model.prob = union(high.leo.and.model.prob, w.pass.criteria.genes) if (!pm$no.log) { save(ord.sma, pass.criteria.genes, file=gsub(" ","_",paste(sep="",pm$neo.log.file,".seed.sma.for.",uniq.mark.cn[i],".to.",cn[known.upstream],".rdat"))) } } high.leo.and.model.prob.cn = cn[high.leo.and.model.prob] if (!pm$no.log) { save(list = c("ord.seed","high.leo.and.model.prob.cn"), file=gsub(" ","_",paste(sep="",pm$neo.log.file,"post.sma.ord.seed.for.known.upstream.",cn[known.upstream],".rdat"))) my.file = gsub(" ","_",paste(sep="",pm$neo.log.file,".total.sma.leafings_num.pass.genes.is_",length(high.leo.and.model.prob),"_.for.known.",cn[known.upstream],".pass.criteria.genes.rdat")) save(high.leo.and.model.prob.cn, file=my.file) pm.print(pm,paste("Saved results to ",my.file)) } pm.print(pm,paste("Number of genes passing criteria: ",length(high.leo.and.model.prob))) # and run a neo analysis on these high.leo.and.model.prob = setdiff(high.leo.and.model.prob,w.up) # just in case, so we don't overlap pm$A=w.up pm$B=high.leo.and.model.prob z=neo(datCombined=datCombined,snpcols=snpcols,pm=pm) invisible(z) } # # reproduce the Insig1 analysis (previously done by hand) # with an automated version # if (exists("neo.seed.insig1.test")) { rm(neo.seed.insig1.test) } neo.seed.insig1.test=function() { a=load("/home/jaten/dev/peculiar/bxh/liver.1146snps.23388mrna.21clinical.bxh.apoe.null.rdat") liver.bxh.male.female=liver.bxh.male.female.cm x=liver.bxh.male.female.cm female=liver.bxh.male.female$sex==2 male=liver.bxh.male.female$sex==1 datCombined=liver.bxh.male.female[female,-str.me$sex] # datCombined=liver.bxh.male.female[male,-str.me$sex] pm=neo.get.param() which.Insig1=pmatch("Insig1",colnames(liver.bxh.male.female)) which.Fdft1=pmatch("Fdft1.MMT00082285",colnames(liver.bxh.male.female)) which.Dhcr7=pmatch("Dhcr7",colnames(liver.bxh.male.female)) which.B4301=pmatch("B430110G05Rik",colnames(liver.bxh.male.female)) pm$run.title="known.up.Insig1.known.dn.Fdft1" one.date=date() my.new.dir=gsub(":",".",gsub(" ","_",paste(sep="","seed.insig1.",one.date))) dir.create(my.new.dir) setwd(my.new.dir) seed.root.leaf.neo(pm, datCombined, snpcols=str.me$snpcols, known.upstream=which.Insig1, known.downstream=which.Fdft1, genecols=str.me$genecols, require.sma.model.prob.over=.5, require.sma.leo.nb.over=.75, num.seed.snps=2, require.cor.with.known.up.over=.5) } # MALE VERSION if (exists("neo.seed.insig1.test.MALE")) { rm(neo.seed.insig1.test.MALE) } neo.seed.insig1.test.MALE=function() { a=load("/home/jaten/dev/peculiar/bxh/liver.1146snps.23388mrna.21clinical.bxh.apoe.null.rdat") liver.bxh.male.female=liver.bxh.male.female.cm x=liver.bxh.male.female.cm female=liver.bxh.male.female$sex==2 male=liver.bxh.male.female$sex==1 # datCombined=liver.bxh.male.female[female,-str.me$sex] datCombined=liver.bxh.male.female[male,-str.me$sex] pm=neo.get.param() which.Insig1=pmatch("Insig1",colnames(liver.bxh.male.female)) which.Fdft1=pmatch("Fdft1.MMT00082285",colnames(liver.bxh.male.female)) which.Dhcr7=pmatch("Dhcr7",colnames(liver.bxh.male.female)) which.B4301=pmatch("B430110G05Rik",colnames(liver.bxh.male.female)) pm$run.title="known.up.Insig1.known.dn.Fdft1" one.date=date() my.new.dir=gsub(":",".",gsub(" ","_",paste(sep="","seed.insig1.",one.date))) dir.create(my.new.dir) setwd(my.new.dir) seed.root.leaf.neo(pm, datCombined, snpcols=str.me$snpcols, known.upstream=which.Insig1, known.downstream=which.Fdft1, genecols=str.me$genecols, require.sma.model.prob.over=.5, require.sma.leo.nb.over=.75, num.seed.snps=2, require.cor.with.known.up.over=.5) } # # permutation version of the above, for FEMALE data. See next for MALE. # # started by doing, on calypso (the SGE cluster headnode): # ### job.list=rep("neo.seed.insig1.perm",20) ### getwd() ### #[1] "/home/jaten/dev/peculiar/insig1.perms" ### source("../neo.txt") ### submit.to.sge(job.list) ### #your job-array 361256.1-20:1 ("neo.sge.2.qsub.me.sh") has been submitted if (exists("neo.seed.insig1.perm")) { rm(neo.seed.insig1.perm) } neo.seed.insig1.perm=function() { a=load("/home/jaten/dev/peculiar/bxh/liver.1146snps.23388mrna.21clinical.bxh.apoe.null.rdat") liver.bxh.male.female=liver.bxh.male.female.cm x=liver.bxh.male.female.cm female=liver.bxh.male.female$sex==2 datCombined=liver.bxh.male.female[female,-str.me$sex] pm=neo.get.param() which.Insig1=pmatch("Insig1",colnames(liver.bxh.male.female)) which.Fdft1=pmatch("Fdft1.MMT00082285",colnames(liver.bxh.male.female)) which.Dhcr7=pmatch("Dhcr7",colnames(liver.bxh.male.female)) which.B4301=pmatch("B430110G05Rik",colnames(liver.bxh.male.female)) pm$run.title="known.up.Insig1.known.dn.Fdft1" one.date=date() wd=getwd() sge.task = get.sge.task.or.zero() # do 10 permutations at once for (i in 1:10) { my.new.dir=gsub(":",".",gsub(" ","_",paste(sep="","seed.insig1.perm.",one.date,".task.",sge.task,".perm.",i))) # start from where we started setwd(wd) dir.create(my.new.dir) setwd(my.new.dir) dc=permute.snps.traits(pm, datCombined, block.of.cols.to.permute.labels.on=str.me$snpcols, save.perm.to.file=FALSE) seed.root.leaf.neo(pm, datCombined=dc, snpcols=str.me$snpcols, known.upstream=which.Insig1, known.downstream=which.Fdft1, genecols=str.me$genecols, require.sma.model.prob.over=.5, require.sma.leo.nb.over=.75, num.seed.snps=2, require.cor.with.known.up.over=.5) } } # end permutation version/ neo.seed.insig1.perm() if (exists("analyze.female.insig1.perms")) { rm(analyze.female.insig1.perms) } analyze.female.insig1.perms=function() { my.dir="/home/jaten/dev/peculiar/insig1.FEMALE3.perms" # setwd("/home/jaten/dev/peculiar/insig1.perms") setwd(my.dir) system(paste(sep="","find . -name 'neo.logs.*.csv' -print > ",my.dir,"/dirs.with.logs.txt")) a=read.table(paste(sep="",my.dir,"/dirs.with.logs.txt")) a=as.character(a$V1) oca.over.th = rep(NA,length(a)) oca.over.th.weighted = rep(NA,length(a)) cpa = rep(NA,length(a)) cpa.weighted = rep(NA,length(a)) th = 0.05 for (i in 1:length(a)) { my=read.csv(a[i]) oca.over.th[i]=sum(my$LEO.NB.OCA > th,na.rm=T) oca.over.th.weighted[i]=sum(my$P.weighted.LEO.NB.OCA > th,na.rm=T) cpa[i]=sum(my$LEO.NB.CPA > th,na.rm=T) cpa.weighted[i]=sum(my$LEO.NB.CPA * my$Model.P.value.AtoB > th,na.rm=T) } summary(oca.over.th) summary(oca.over.th.weighted) summary(cpa) summary(cpa.weighted) # how does this compare to the actual observed female insig1 run? # obs.female.insig1 = read.csv("/home/jaten/dev/peculiar/insig1.FEMALE/seed.insig1.Mon_Feb_18_15.56.29_2008/neo.logs.Mon_Feb_18_16.28.21_2008.known.up.Insig1.known.dn.Fdft1/neo.logs.Mon_Feb_18_16.28.21_2008.known.up.Insig1.known.dn.Fdft1.csv") obs.female.insig1 = read.csv("/home/jaten/dev/peculiar/insig1.FEMALE3/seed.insig1.Wed_Feb_20_00.59.24_2008/neo.logs.Wed_Feb_20_01.24.38_2008.known.up.Insig1.known.dn.Fdft1/neo.logs.Wed_Feb_20_01.24.38_2008.known.up.Insig1.known.dn.Fdft1.csv") obs.oca.over.th=sum(obs.female.insig1$LEO.NB.OCA > th,na.rm=T) obs.oca.over.th.weighted=sum(obs.female.insig1$P.weighted.LEO.NB.OCA > th,na.rm=T) obs.cpa=sum(obs.female.insig1$LEO.NB.CPA > th,na.rm=T) obs.cpa.weighted=sum(obs.female.insig1$LEO.NB.CPA * obs.female.insig1$Model.P.value.AtoB > th,na.rm=T) fdr.oca = mean(oca.over.th) / obs.oca.over.th fdr.weighted.oca = mean(oca.over.th.weighted) / obs.oca.over.th.weighted fdr.cpa = mean(cpa) / obs.cpa fdr.weighted.cpa = mean(cpa.weighted) /obs.cpa.weighted print(paste("At th=",th," we see FDR for OCA of ",signif(fdr.oca,3)," (",signif(mean(oca.over.th),3),"/",obs.oca.over.th,")" )) print(paste("At th=",th," we see FDR for weighted OCA of ",signif(fdr.weighted.oca,3)," (",signif(mean(oca.over.th.weighted),3),"/",obs.oca.over.th.weighted,")" )) print(paste("At th=",th," we see FDR of CPA of ",signif(fdr.cpa,3)," (",signif(mean(cpa),3),"/",obs.cpa,")" )) print(paste("At th=",th," we see FDR of weighted CPA of ",signif(fdr.weighted.cpa,3)," (",signif(mean(cpa.weighted),3),"/",obs.cpa.weighted,")" )) # at th=.2, for females, 100 permutations done. ## > fdr.oca ## [1] 0.4174603 ## > fdr.weighted.oca ## [1] 0.2948649 ## > fdr.cpa ## [1] 0.5705217 ## > fdr.weighted.cpa ## [1] 0.4858537 ## > } ### job.list=rep("MALE.neo.seed.insig1.perm",20) ### getwd() ### #[1] "/home/jaten/dev/peculiar/insig1.MALE.perms" ### source("../neo.txt") ### submit.to.sge(job.list) ### #your job-array 361253.1-20:1 ("neo.sge.2.qsub.me.sh") has been submitted if (exists("MALE.neo.seed.insig1.perm")) { rm(MALE.neo.seed.insig1.perm) } MALE.neo.seed.insig1.perm=function() { a=load("/home/jaten/dev/peculiar/bxh/liver.1146snps.23388mrna.21clinical.bxh.apoe.null.rdat") liver.bxh.male.female=liver.bxh.male.female.cm x=liver.bxh.male.female.cm female=liver.bxh.male.female$sex==2 male=liver.bxh.male.female$sex==1 # datCombined=liver.bxh.male.female[female,-str.me$sex] datCombined=liver.bxh.male.female[male,-str.me$sex] pm=neo.get.param() which.Insig1=pmatch("Insig1",colnames(liver.bxh.male.female)) which.Fdft1=pmatch("Fdft1.MMT00082285",colnames(liver.bxh.male.female)) which.Dhcr7=pmatch("Dhcr7",colnames(liver.bxh.male.female)) which.B4301=pmatch("B430110G05Rik",colnames(liver.bxh.male.female)) pm$run.title="known.up.Insig1.known.dn.Fdft1" one.date=date() wd=getwd() sge.task = get.sge.task.or.zero() # do 10 permutations at once for (i in 1:10) { my.new.dir=gsub(":",".",gsub(" ","_",paste(sep="","seed.insig1.perm.",one.date,".task.",sge.task,".perm.",i))) # start from where we started setwd(wd) dir.create(my.new.dir) setwd(my.new.dir) dc=permute.snps.traits(pm, datCombined, block.of.cols.to.permute.labels.on=str.me$snpcols, save.perm.to.file=FALSE) seed.root.leaf.neo(pm, datCombined=dc, snpcols=str.me$snpcols, known.upstream=which.Insig1, known.downstream=which.Fdft1, genecols=str.me$genecols, require.sma.model.prob.over=.5, require.sma.leo.nb.over=.75, num.seed.snps=2, require.cor.with.known.up.over=.5) } } # end permutation version/ MALE.neo.seed.insig1.perm() if (exists("confirm.insig1.in.male")) { rm(confirm.insig1.in.male) } confirm.insig1.in.male=function() { # .3 weighted OCA or higher my.female.genes =c("Fasn.MMT00018029.chr11.bp120359985","Cyp2r1.MMT00058486.chr7.bp12048523","Cyp51.MMT00071535.chr5.bp4095740","Sc4mol.MMT00059097.chr8.bp64574528","Sc5d.MMT00032020.chr9.bp43970434","Paox.MMT00036605.chr7.bp132018721","Khk.MMT00011470.chr5.bp29258979","Slc25a1.MMT00054225.chr16.bp17530063","Wdr5.MMT00035187.chr2.bp27759428","Pfkfb1.MMT00078546.chrX.bp138292585","MGC18837.MMT00022933.chr9.bp22526985","Slc23a1.MMT00022094.chr18.bp35778092","Zdhhc6.MMT00047560.chr19.bp54943601","Acac.MMT00048501.chr11.bp83907075","Gale.MMT00077659.chr4.bp134743172","Scd1.MMT00022878.chr19.bp44497177","MMT00012966.MMT00012966.chr5.bp85212161","Qdpr.MMT00063965.chr5.bp44662614","x5730427C23Rik.MMT00054020.chr7.bp133270810","Thrsp.MMT00053283.chr7.bp89494761","AV047578.MMT00024954.chr15.bp32068789") # my.female.genes = c("Supt3h.MMT00063933.chr17.bp44317383","Pmvk.MMT00030469.chr3.bp91702875","Acly.MMT00022101.chr11.bp100059972","Sqle.MMT00000743.chr15.bp60651347","Fdft1.MMT00080352.chr14.bp54296665","x0610007P14Rik.MMT00054614.chr12.bp81807416","Slc25a1.MMT00054225.chr16.bp17530063","Leng1.MMT00012778.chr7.bp3727812","BC036563.MMT00012566.chr4.bp124076539","x6030440G05Rik.MMT00031386.chr6.bp98942613") a=load("/home/jaten/dev/peculiar/bxh/liver.1146snps.23388mrna.21clinical.bxh.apoe.null.rdat") liver.bxh.male.female=liver.bxh.male.female.cm x=liver.bxh.male.female.cm female=liver.bxh.male.female$sex==2 male=liver.bxh.male.female$sex==1 # datCombined=liver.bxh.male.female[female,-str.me$sex] datCombined=liver.bxh.male.female[male,-str.me$sex] pm=neo.get.param() which.Insig1=pmatch("Insig1",colnames(liver.bxh.male.female)) which.Fdft1=pmatch("Fdft1.MMT00082285",colnames(liver.bxh.male.female)) which.Dhcr7=pmatch("Dhcr7",colnames(liver.bxh.male.female)) which.B4301=pmatch("B430110G05Rik",colnames(liver.bxh.male.female)) pm$run.title="confirm.in.male.Insig1.top10" cn=colnames(datCombined) pm$A=which.Insig1 pm$B=match(my.female.genes,cn) neo(datCombined,snpcols=str.me$snpcols,pm=pm) } # run the above neo.seed.insig1.perm on SGE if (exists("is.na.or.nan")) { rm(is.na.or.nan) } is.na.or.nan=function(x) { is.na(x) | is.nan(x)} # helper function to determine if a named list has a particular variable if (exists("has.name")) { rm(has.name) } has.name=function(name,my.list) !is.na(match(name,names(my.list))) if(exists("is.unix")) rm(is.unix) is.unix = function() { if (.Platform$OS.type == "windows") return(FALSE); if (has.name("pkgType",.Platform) & .Platform$pkgType == "mac.binary") return(FALSE); return(TRUE); } # open a window for plotting, on windows, mac, or unix if (exists("cross.platform.windows")) { rm(cross.platform.windows) } cross.platform.windows=function() { if (.Platform$OS.type == "windows") { return(windows()); } if (has.name("pkgType",.Platform)) { if (.Platform$pkgType == "mac.binary") { return(quartz()); } } return(X11()); } # check.or.impute.missing.data() # # if CHECK == TRUE, then stop if any missing data. # otherwise, impute NA values the best we can. Since this # is hackish, we really want to insist that the user take # care of missing data on their own first. # if(exists("check.or.impute.missing.data")) rm(check.or.impute.missing.data); check.or.impute.missing.data=function(datCombined,snpcols,pm,check=TRUE) { if (check) { # check and insist upon good data in datCombined if (any(is.na(datCombined))) { stop("Missing data found in the supplied datCombined data frame. Please impute the NA and missing values first. The e1071::impute() function and the NEO::impute.nn() function may be of help here.") } # check for non-varying columns s=sd(datCombined,na.rm=TRUE) if (any(s==0)) { stop(paste(sep="","Some datCombined columns had no variation. Please remove these columns from datCombined: ",colnames(datCombined)[s==0])) } # data looks okay, use it return(datCombined) } # not checking, do rough and ready imputation instead. datCombined.snpfilled=datCombined # impute missing data by predicting using the other columns...in the non-ignored columns. if (any(is.na(datCombined[,snpcols]))) { pm.print(pm,paste("imputing missing values in SNPs...",date())); datCombined.snpfilled=impute.nn(datCombined,snpcols,pm); pm.print(pm,paste("done with imputation of NA for SNPs...",date())); } # impute any remainder traits using their median non.snp.cols=setdiff(1:ncol(datCombined),snpcols) x = as.data.frame(impute(datCombined.snpfilled[,non.snp.cols],what="median")) datCombined.snpfilled[,non.snp.cols]=x datCombined.snpfilled } # Implementation of Steve's request to have NA turned to 0, # and 0 turned to small Devil's number in some spreadsheet output. # Is vectorized. if (exists("na.to.zero.and.zero.to.small.devil")) rm(na.to.zero.and.zero.to.small.devil); na.to.zero.and.zero.to.small.devil=function(x) { w0 = (x==0) w.na = is.na(x) x[w.na] = 0 x[w0] = 666e-8 x } #### create a permuted data set, by shuffling labels on the mice between #### snps and traits #### uses supplied parameter block.col.cols.to.permute.labels.on, and #### uses pm$block.to.permute2, and #### pm$block.to.permute3 if either is non-NULL #### to enable the user to generate additional independences between #### different blocks of data at once, if so desired. #### The various blocks must be disjoint. #### #### if (exists("permute.snps.traits")) rm(permute.snps.traits); permute.snps.traits=function(pm, datCombined, block.of.cols.to.permute.labels.on, save.perm.to.file=FALSE) { block = block.of.cols.to.permute.labels.on nr=nrow(datCombined) if (nr <2) stop("bad call with 1 row of data to permute.snps.traits()") # implement additional blocks to do additional simultaneous permutation too # block2.to.permute # block3.to.permute # sanity checks-- blocks should not overlap, or else within block structure # will not be preserved. # if (length(intersect(block.of.cols.to.permute.labels.on,pm$block.to.permute2)) != 0) { stop("block.to.permute and block.to.permute2 cannot overlap!") } if (length(intersect(block.of.cols.to.permute.labels.on,pm$block.to.permute3)) != 0) { stop("block.to.permute and block.to.permute3 cannot overlap!") } if (length(intersect(pm$block.to.permute2,pm$block.to.permute3)) != 0) { stop("block.to.permute2 and block.to.permute3 cannot overlap!") } # sanity check range if (any(!(block.of.cols.to.permute.labels.on %in% (1:ncol(datCombined))))) { stop("Bad specification of block.of.cols.to.permute.labels.on: out of range") } if (any(!(pm$block.to.permute2 %in% (1:ncol(datCombined))))) { stop("Bad specification of pm$block.to.permute2: out of range") } if (any(!(pm$block.to.permute3 %in% (1:ncol(datCombined))))) { stop("Bad specification of pm$block.to.permute3: out of range") } perm = sample(nr) if (save.perm.to.file) { write.csv(file=paste(sep="",pm$neo.log.file,".permute.rows.",pm$perm.count-1,".txt"),perm,row.names=FALSE) } if (!is.null(pm$block.to.perm2)) { perm2 = sample(nr) datCombined[,pm$block.to.perm2]=datCombined[perm2,pm$block.to.perm2] } if (!is.null(pm$block.to.perm3)) { perm3 = sample(nr) datCombined[,pm$block.to.perm3]=datCombined[perm3,pm$block.to.perm3] } permutedSNPs=datCombined[perm,block] datCombined[,block]=permutedSNPs datCombined } # apply above function to multiple columns if (exists("zero.dev.on.cols")) rm(zero.dev.on.cols); zero.dev.on.cols=function(x,which.columns) { for (i in which.columns) { x[,i] = na.to.zero.and.zero.to.small.devil(x[,i]) } x } # multiplier to convert natural log to log10 log.to.log10 = log10(exp(1)); # make it easy to reload this file after any changes. if (exists("reload")) rm(reload); if (exists("reneo")) rm(reneo); reload=reneo=function() { source("neo.txt"); } # A frequent user error is to input columns that are # completely redundant with each other. This multicollinearity # means it is impossible to distinguish between # two different redundant or even highly correlated variables. # check.multi.collinearity(): checks for redundancy and # stops with a complaint to fix it first before continuing. # We try to do so in O(n log n) time for n columns to check, # rather than in O(n^2) time, by comparing correlations # with the first column and only investigating those columns with # very high correlation with that first column, and those # that share a very similar correlation with the first column. # # if (exists("check.multi.collinearity")) { rm(check.multi.collinearity) } check.multi.collinearity=function(x,pm=neo.get.param()) { r=list() # results r$has.redundant = FALSE # default r$redundant.cols = c() # default if (ncol(x) < 2) return(r); corx = cor(x[,1],x[,-1],use="p") # check #1: any highly correlated with first column? w = which(abs(corx) >= pm$datCombined.column.redundancy.reject.cor.level) if (any(w)) { r$has.redundant = TRUE r$redundant.cols = c(1,w+1) return(r) } # if only 2 columns then we are done if (ncol(x) < 3) return(r); # IVAR: we have 3 or more columns, so length(corx) >= 2 if (length(corx) < 2) stop("logic error in check.multi.collinearity") # check #2: investigate more closely any columns who # share the same correlation or close correlation with # that first column. Designed to avoid the all-by-all # time consuming O(n^2) correlation matrix computation. # remember that our indices start with the 2nd column of x being the 1st correlation in w sx = sort(corx, decreasing=T, index.return=T) scorx = sx$x scorx.index.ordered = sx$ix # This is a heuristic fast approximation to detect multicolinearity, not a catch all. # # Analysis: # # We ignore pairs of columns if their sign of correlation is different. # # Assuming that the sign of the correlation is the same, # by the triangle inequality we know that |cor(a,b) - cor(a,c)| <= |1-cor(b,c)|. # # If we assume that cor(b,c) is positive, then # cor(b,c) <= 1 - |cor(a,b) - cor(a,c)| # # Since we want cor(b,c) < pm$datCombined.column.redundancy.reject.cor.level, # we only compute cor(b,c) if we first find that 1 - |cor(a,b) - cor(a,c)| >= pm$datCombined.column.redundancy.reject.cor.level. # or in other words, 1 - pm$datCombined.column.redundancy.reject.cor.level >= |cor(a,b) - cor(a,c)|. # # flag.if.less.than = 1 - abs(pm$datCombined.column.redundancy.reject.cor.level) for (i in 1:(length(scorx)-1)) { j = i+1 ab = scorx[i] ac = scorx[j] if (ab*ac <= 0) next; # skip if different signs or if one cor is zero. # with luck, few columns will pass this check and need explicit checking... # But also the sort may not have brought the closest pairs adjecent to each other, which # is why this is a hueristic function. # if (abs(ab-ac) <= flag.if.less.than) { # compute actual correlation to check it bc.cor = cor(x[,i+1],x[,j+1],use="p") # add one to i and j b/c we never compute the correlation of the first column with itself. if (abs(bc.cor) >= pm$datCombined.column.redundancy.reject.cor.level) { r$has.redundant = TRUE r$redundant.cols = c(i+1,j+1) msg = paste("datCombined input problem detected. Columns numbered (", paste(co.lin$redundant.cols,sep=",",collapse=","),") were redundant (correlated at |",signif(bc.cor,3),"| >= pm$datCombined.column.redundancy.reject.cor.level(",signif(pm$datCombined.column.redundancy.reject.cor.level,3),")) in datCombined. These were the columns labelled: ", paste(collapse=",",sep=",",colnames(datCombined)[co.lin$redundant.cols])) #msg2 = paste("Here is their correlation matrix: ", # stringify(cor(datCombined[,co.lin$redundant.cols],use="p"))) r$msg = paste(msg,collapse="\n",sep="\n") return(r) } } } # end for i r } # # Since Excel can be installed in many places, try hard to # find it if not specified correctly by pm$excel.path. # # Then we can automatically open the spreadsheet at the # end of the run. # if (exists("locate.excel")) { rm(locate.excel) } locate.excel=function(pm=neo.get.param()) { if(!is.null(pm$excel.path)) { if (file.exists(pm$excel.path)) { return(pm$excel.path) }} # INVAR: pm$excel.path did not work, now try to guess... #helper function for pasting compactly pp=function(...) paste(sep="",...) # try to figure the system drive dr=Sys.getenv("SystemDrive") if (dr=="") { dr="C:" } # default excel.path.try = c(pp(dr,"\\Program Files\\Microsoft Office\\Office\\EXCEL.EXE"),pp(dr,"\\Program Files\\Microsoft Office\\Office10\\EXCEL.EXE"),pp(dr,"\\Program Files\\Microsoft Office\\Office11\\EXCEL.EXE"),pp(dr,"\\Program Files\\Microsoft Office\\Office12\\EXCEL.EXE"),pp(dr,"\\Program Files\\Microsoft Office\\Office13\\EXCEL.EXE"),pp(dr,"\\Program Files\\Microsoft Office\\Office14\\EXCEL.EXE")) for (i in 1:length(excel.path.try)) { if (file.exists(excel.path.try[i])) { return(excel.path.try[i]) } } # default excel.path.try[1] } # end locate.excel # # open.excel.file(): only works on Windows. # if (exists("open.excel.file")) { rm(open.excel.file) } open.excel.file=function(fname,pm) { wd=getwd() path=gsub("/","\\\\",wd) file.path=paste(sep=""," \"",path,"\\",fname,"\"") file.path=path.os.convert(file.path); # customize to OS; for windows change / to \\ excel.path=locate.excel(pm) dome=paste(sep="","\"",excel.path,"\" ",file.path) if (.Platform$OS.type == "windows") { # debug print(paste("Trying:",dome)) system(command=dome,wait=FALSE,invisible=FALSE,show.output.on.console = FALSE) } } # helper function to convert "" (factor) entries to NA if (exists("make.dataframe.numeric")) { rm(make.dataframe.numeric) } make.dataframe.numeric=function(df,do.cols=1:ncol(df)) { newdf = df for (j in do.cols) { newdf[,j] = as.numeric(as.character(df[,j])) } newdf } if (exists("new.open.excel.file")) { rm(new.open.excel.file) } new.open.excel.file=function(fname,pm) { wd=getwd() path=gsub("/","\\\\",wd) file.path=paste(sep=""," \"",path,"\\",fname,"\"") file.path=path.os.convert(file.path); # customize to OS # try hard to get a good excel path excel.path = pm$excel.path if(is.null(excel.path)) { excel.path=path.os.convert("C:\\Program Files\\Microsoft Office\\Office\\EXCEL.EXE"); } len.xl.path = nchar(excel.path) if (len.xl.path < 9) excel.path = "C:\\Program Files\\Microsoft Office\\Office11\\EXCEL.EXE" excel.path.no.quotes = excel.path if (substring(excel.path,1,1) == "\"") { excel.path.no.quotes = substring(excel.path,2,nchar(excel.path)-1) } if (file.exists(excel.path)) { } dome=paste(sep="",excel.path,file.path) if (.Platform $OS.type == "windows") { system(dome,wait=FALSE) } } if (exists("write.html.log")) { rm(write.html.log) } write.html.log=function(pm,fname="default.log.html", title.1="My Title.1",title.2="(more detail)", title.3="title3", my.text="My test content.", html.log.dir="html.logfiles",parent.dir=getwd()) { dir.create(html.log.dir,showWarnings=FALSE,recursive=TRUE) wd=getwd() path=gsub("/","\\\\",wd) file.path=paste(sep="",path,"\\",html.log.dir,"\\",fname) file.path=path.os.convert(file.path); # customize to OS zz <- file(file.path, "wt") a1= "\n\n"; a2= "\n\n
\n\n";
  a3= "\n\n
\n\n"; title.string=paste(title.2,"...",title.1," ",title.3); writeChar(a1, con=zz,eos=NULL) writeChar(title.string, con=zz,eos=NULL) writeChar(a2, con=zz,eos=NULL) writeChar(my.text, con=zz,eos=NULL) writeChar(a3, con=zz,eos=NULL) close(zz); # Use basename() to trim the html directory name so that it is relative to the output directory--so # we can copy/email this directory and still have the hyperlink work! relocatable.file.path = paste(sep="",basename(html.log.dir),"\\",basename(file.path)); my.link=paste(sep="","=HYPERLINK(\"",relocatable.file.path,"\",\"",title.1,"\")") path.os.convert(my.link); # customize to OS } # take the entries in a vector and convert the NAs to blanks: makes a character vector if(exists("convert.na.to.blank") ) rm(convert.na.to.blank); convert.na.to.blank=function(pm,v) { if (pm$turn.NA.scores.to.zero.and.zero.to.666) {v=na.to.zero.and.zero.to.small.devil(v) } v[which(is.na(as.character(v)))]="" v } # linab(): map from a pair indices (a,b) drawn from pm$A and pm$B (restricted choices # of network edges to be analyzed rather than analyzing the complete graph by default) # into a single LINEAR row number for a result set data frame. # See function enumab() below for the reverse mapping. # if(exists("linab") ) rm(linab); linab = function(a,b,pm) { lenA = length(pm$A) lenB = length(pm$B) if (lenA < 1 | lenB < 1) stop("bad call to linab(): lenA or lenB < 1.") addone = 0; # increment to 1 if a comes from pm$B and b comes from pm$A. wa = match(a,pm$A) wb = match(b,pm$B) if (is.na(wa)) { if (!is.na(wb)) stop("bad call to linab(): ") wa = match(b,pm$A) wb = match(a,pm$B) addone = 1 } if (is.na(wa) | is.na(wb) ) { stop(paste(sep="","Bad call to linab(): wa[",wa,"] or wb[",wb,"] NOT FOUND in pm$A[",pm$A,"] or pm$B[",pm$B,"]")) } index = 1 + ((wb + (wa-1)*lenB)-1)*2 + addone index } # enumab(): opposite mapping of linab. Here we # map from a single row number i to a pair of indices # (a,b) where "a" is in pm$A and "b" is in pm$B. # if(exists("enumab") ) rm(enumab); enumab = function(k,pm) { lenA = length(pm$A) lenB = length(pm$B) if (lenA < 1 | lenB < 1) stop("bad call to enumab(): lenA or lenB < 1.") if (k > 2* lenA * lenB) stop("bad call to enumab(): k > 2 * lenA * lenB.") i = (k+1) %/% 2 # irrespective of order, which pm$A,pm$B do we want? i.by.lb= ((i-1) %/% lenB) +1 i.mod.lb = ((i-1) %% lenB) + 1 if (k %% 2 == 0) { # even k ret = c(pm$B[i.mod.lb], pm$A[i.by.lb]) } else { # odd ret = c(pm$A[i.by.lb], pm$B[i.mod.lb]) } ret } # enumab.cn(): Same function as above, but return column # names instead of numbers to make matching more robust. # if(exists("enumab.cn") ) rm(enumab.cn); enumab.cn = function(k,pm) { lenA = length(pm$A.cn) lenB = length(pm$B.cn) if (lenA < 1 | lenB < 1) stop("bad call to enumab.cn(): length(pm$A.cn) or length(pm$B.cn) < 1.") i = (k+1) %/% 2 # irrespective of order, which pm$A.cn,pm$B.cn do we want? i.by.lb= ((i-1) %/% lenB) +1 i.mod.lb = ((i-1) %% lenB) + 1 if (k %% 2 == 0) { # even k ret = c(pm$B.cn[i.mod.lb], pm$A.cn[i.by.lb]) } else { # odd ret = c(pm$A.cn[i.by.lb], pm$B.cn[i.mod.lb]) } ret } # # Get 2 vectors of edges to scan from pm$A and pm$B; store in pm; # so pm=get.enumab.set(pm) is the call. # # Use: for (k9 in 1:length(pm$enumerated.A)) { # use pm$enumerate.A[k9], pm$enumerated.B[k9] } # if(exists("get.enumab.set") ) rm(get.enumab.set); get.enumab.set=function(pm) { lenA = length(pm$A) lenB = length(pm$B) if (lenA < 1 | lenB < 1) stop("bad call to get.enumab.set(): lenA or lenB < 1") count = lenA*lenB enumerated.A = rep(NA,count) enumerated.B = rep(NA,count) for (k in 1:count) { e = enumab(1+(k-1)*2,pm) enumerated.A[k] = e[1] enumerated.B[k] = e[2] } pm$enumerated.A = enumerated.A pm$enumerated.B = enumerated.B pm } # same as above, but use actual column names instead # of indices. # # Get 2 vectors of edges to scan from pm$A and pm$B; store in pm; # so pm=get.enumab.set(pm) is the call. # # Use: for (k9 in 1:length(pm$enumerated.A)) { # use pm$enumerate.A[k9], pm$enumerated.B[k9] } # if(exists("get.enumab.cn.set") ) rm(get.enumab.cn.set); get.enumab.cn.set=function(pm) { lenA = length(pm$A.cn) lenB = length(pm$B.cn) if (lenA < 1 | lenB < 1) stop("bad call to get.enumab.cn.set(): lenA or lenB < 1") count = lenA*lenB enumerated.A.cn = rep(NA,count) enumerated.B.cn = rep(NA,count) for (k in 1:count) { e = enumab.cn(1+(k-1)*2,pm) enumerated.A.cn[k] = e[1] enumerated.B.cn[k] = e[2] } pm$enumerated.A.cn = enumerated.A.cn pm$enumerated.B.cn = enumerated.B.cn pm } # convert from covariance matrix to correlation matrix # reduced from ggm::correlations() if(exists("covx2cor") ) rm(covx2cor); covx2cor=function(covx) { Dg <- 1/sqrt(diag(covx)) r <- covx * outer(Dg, Dg) r } # convert from covariance matrix to partial correlation matrix # use ggm library function if(exists("covx2pcor") ) rm(covx2pcor); covx2pcor=function(covx) { parcor(covx) } if(exists("try.failed") ) rm(try.failed); try.failed=function(x) { inherits(x, "try-error") } # # since the sem() calls sometimes fail, log them, with the data, to reproduce later if we wish. # Also log warning model did not converge calls. # if(exists("autolog.sem") ) rm(autolog.sem); autolog.sem=function(pm,ram,covx,N,...) { last.warning=NULL; # IMPORTANT CONVERSION to CORRELATION MATRIX from COVARIANCE MATRIX # We were getting singular Hessians with some covariance matrices fed to sem(), # so we will try just feeding the correlation matrix and see if that # improves matters. if (pm$sem.fit.correlation.instead.of.covariance) { covx=covx2cor(covx); } r=try(sem(ram,covx,N,...)) if(inherits(r, "try-error") || !is.null(last.warning)) { if(inherits(r, "try-error")) { r$had.error=geterrmessage(); if (!exists(".Global.SEM.fail.count")) { .Global.SEM.fail.count <<- 1; } else { .Global.SEM.fail.count <<- .Global.SEM.fail.count + 1; } logfile = paste(sep="",pm$neo.log.file,".badsem.",.Global.SEM.fail.count,".rdat"); print(paste("sem() call failed...saving call arguments to file: ",logfile)); } else { r$had.warning=last.warning; if (!exists(".Global.SEM.warn.count")) { .Global.SEM.warn.count <<- 1; } else { .Global.SEM.warn.count <<- .Global.SEM.warn.count + 1; } logfile = paste(sep="",pm$neo.log.file,".warnsem.",.Global.SEM.warn.count,".rdat"); print(paste("sem() call generated warning...saving call arguments to file: ",logfile)); } badsem=list(); badsem$pm=pm; badsem$ram=ram; badsem$covx=covx; # get human readable correlation and partial correlation matrices. badsem$cor=covx2cor(covx); badsem$pcor=covx2pcor(covx); badsem$N=N; badsem$full.call=match.call(expand.dots = FALSE); # document any ... passed arguments. badsem$reproduce="q=sem(ram=badsem$ram,S=badsem$covx,N=badsem$N)"; if (!pm$no.log) save(badsem,file=logfile); } # end if try-error r } if(exists("try.sem") ) rm(try.sem); try.sem=function(pm,ram,covx,N,ana.grad=TRUE) { a=try(autolog.sem(pm,ram,covx,N,analytic.gradient=ana.grad)); if (try.failed(a)) { print(paste("try.sem: sem() call failed with ananlytic.gradient=",ana.grad)); print("trying the other way (reversing analytical.gradient flag). Model(ram):"); print(ram); a=try(autolog.sem(pm,ram,covx,N,analytic.gradient=!ana.grad)); if (try.failed(a)) { print("sem() call failed both ways...Setting edge orienting scores to NA for ram:"); cat(ram); a=NA; class(a)="FailedSEM"; } else { cat("\n"); print(paste("try.sem() suceeded with analytical.gradient=",!ana.grad)); } } a } # # Set up the mechanism by which we pass back NA for our sem() failed calls, from try.sem(); # if(exists("summary.FailedSEM") ) rm(summary.FailedSEM); summary.FailedSEM=function(a) { z=list() z$chisq=NA z } # fit an arbitrary sem model, given a M[from,to] matrix # where which.x.cols specifies the order of the columns # and rows of the matrix M and picks out the columns # of the data frame x. # if(exists("fit.sem.from.matrix") ) rm(fit.sem.from.matrix); fit.sem.from.matrix=function(which.x.cols,M,x,pm=neo.get.param(),use.ranks=FALSE) { cn=colnames(M) if (length(cn) > 0) { if (!all(cn==colnames(x)[which.x.cols])) { stop("fit.sem.from.matrix(): colnames(x)[which.x.cols] must match the column labels on the matrix M") } } else { stop("fit.sem.from.matrix(): must have column labels on supplied matrix M to report SEM model"); } no.obs=nrow(x); # allow no.obs to override the nrow(x), if specified. if (is.null(pm$no.obs.Z)) {pm$no.obs.Z = no.obs} # so we get correct zeo2() scores subx=data.frame(x[,which.x.cols]) if (use.ranks) { for (g in 1:ncol(subx)) { subx[,g]=rank(subx[,g]) }} covx=cov(subx); sem.M = make.ram(M) fit.M = try.sem(pm,sem.M$the.ram,covx,N=no.obs) fit.M.summary = summary(fit.M) z=list() z$sem = fit.M z$sem.summary = fit.M.summary z } # # compare.local.sems: return the SEM fitting indices for # a causal, reactive, and confounded model # # is safe if M.col is a vector too... # # either supply dataframe x directly....or give both covx and no.obs # if(exists("compare.local.sems") ) rm(compare.local.sems); compare.local.sems=function(M.col,A.col,B.col,x=NULL,covx=NULL,no.obs=NULL,pm) { z=list() # return value if(is.null(x) && is.null(covx)) { stop("compare.local.sems() called without supply either data matrix x or covariance matrix cov. Aborting.") } if (is.null(x) && is.null(no.obs)) { stop("compare.local.sems() called with covariance matrix but without giving no.obs. Aborting.") } # necessary for AhiddenB code below to always work... if(length(A.col) !=1 || length(B.col) != 1) { stop("compare.local.sems called with illegal input: A or B had more or less than 1 variable specified.") } if (!is.null(x)) { if (is.null(no.obs)) { no.obs=nrow(x); # allow no.obs to override the nrow(x), if specified. } cn=colnames(x); subx=data.frame(x[,c(M.col,A.col,B.col)]) covx=cov(subx); } else { cn=colnames(covx); } if (is.null(pm$no.obs.Z)) {pm$no.obs.Z = no.obs} # so we get correct zeo2() scores dn=cn[c(M.col,A.col,B.col)]; nr=length(dn); M.empty=matrix(rep(0,nr*nr),nrow=nr,dimnames=list(dn,dn)); # adjust indices to point into new matrix M.col=match(cn[M.col],dn) A.col =match(cn[A.col],dn) B.col =match(cn[B.col],dn) # model 1 M.AtoB=M.empty; M.AtoB[M.col,A.col]=1; M.AtoB[A.col,B.col]=1; # attr(M.AtoB,"model")=1; # model 2 M.BtoA=M.empty; M.BtoA[M.col,B.col]=1; M.BtoA[B.col,A.col]=1; # attr(M.BtoA,"model")=2; # model 3 M.conf=M.empty; M.conf[M.col,A.col]=1; M.conf[M.col,B.col]=1; # attr(M.conf,"model")=3; # model 4 M.AcollideB=M.empty; M.AcollideB[M.col,A.col]=1; M.AcollideB[B.col,A.col]=1; # attr(M.AcollideB,"model")=4; # model 5 M.BcollideA=M.empty; M.BcollideA[M.col,B.col]=1; M.BcollideA[A.col,B.col]=1; # attr(M.BcollideA,"model")=5; # model 6 M.AhiddenB=M.empty; M.AhiddenB[M.col,A.col]=1; # attr(M.AhiddenB,"model")=6; intra.M.list = generate.intra.ma.pairlist(M.col,c(),pm) sem.M.conf = make.ram(M.conf, intra.M.list) sem.M.AtoB = make.ram(M.AtoB, intra.M.list) sem.M.BtoA = make.ram(M.BtoA, intra.M.list) sem.M.BcollideA = make.ram(M.BcollideA, intra.M.list) sem.M.AcollideB = make.ram(M.AcollideB, intra.M.list) intra.M.list[[ length(intra.M.list)+1]] = c(A.col,B.col) sem.M.AhiddenB = make.ram(M.AhiddenB, intra.M.list) fit.M.conf = summary(try.sem(pm,sem.M.conf$the.ram,covx,N=no.obs)) fit.M.AtoB = summary(try.sem(pm,sem.M.AtoB$the.ram,covx,N=no.obs)) fit.M.BtoA = summary(try.sem(pm,sem.M.BtoA$the.ram,covx,N=no.obs)) fit.M.BcollideA = summary(try.sem(pm,sem.M.BcollideA$the.ram,covx,N=no.obs)) fit.M.AcollideB = summary(try.sem(pm,sem.M.AcollideB$the.ram,covx,N=no.obs)) # fit.M.AhiddenB always takes like 37 iterations and returns the same # value as AcollideB anyway, i.e. the models are not identifiable. # fit.M.AhiddenB = summary(try.sem(pm,sem.M.AhiddenB$the.ram,covx,N=no.obs)) z$title=paste(sep="","M(",paste(dn[M.col],collapse=","),"),A(",paste(dn[A.col],collapse=","),"),B(",paste(dn[B.col],collapse=","),") local SEM model comparisons to M->A->B.") if (!is.null(x)) { # we want to know how much of A the markers explain, and how much of B form1=paste(dn[A.col],"~",paste(collapse=" + ",dn[M.col])) summy1=summary(lm(as.formula(form1),data=subx)) z$A.predicted.by.markers.R.squared = summy1$r.squared z$A.predicted.by.markers.Adjusted.R.squared = summy1$adj.r.squared z$A.predicted.by.markers.formula = form1 form2 = paste(dn[B.col],"~",paste(collapse=" + ",dn[M.col])) summy2=summary(lm(as.formula(form2),data=subx)) z$B.predicted.by.markers.R.squared = summy2$r.squared z$B.predicted.by.markers.Adjusted.R.squared = summy2$adj.r.square z$B.predicted.by.markers.formula = form2 form3 = paste(dn[B.col],"~",dn[A.col]) summy3=summary(lm(as.formula(form3),data=subx)) z$B.predicted.by.A.gives.R.squared = summy3$r.squared z$B.predicted.by.A.gives.Adjusted.R.squared = summy3$adj.r.squared z$B.predicted.by.A.formula = form3 } z$M.AtoB=fit.M.AtoB z$M.BtoA=fit.M.BtoA z$M.conf=fit.M.conf z$M.AcollideB=fit.M.AcollideB z$M.BcollideA=fit.M.BcollideA # z$M.AhiddenB=fit.M.AhiddenB # for confounding checking... if (length(M.col) ==1) { # single marker version z$zeo.M.A.given.B=zeo2detail(M.col,A.col,B.col,covx,pm) # $BLV is the final score z$zeo.M.B.given.A=zeo2detail(M.col,B.col,A.col,covx,pm) # $BLV } else { # average the BLVs for each marker z$zeo.M.A.given.B=multimarker.zeo2detail(M.col,A.col,B.col,covx,pm) # $BLV is the final score z$zeo.M.B.given.A=multimarker.zeo2detail(M.col,B.col,A.col,covx,pm) # $BLV } # convert to log10 z$mlogp.M.AtoB=-pchisq(fit.M.AtoB$chisq,1,lower.tail=FALSE,log.p=TRUE)*log.to.log10 z$mlogp.M.BtoA=-pchisq(fit.M.BtoA$chisq,1,lower.tail=FALSE,log.p=TRUE)*log.to.log10 z$mlogp.M.conf=-pchisq(fit.M.conf$chisq,1,lower.tail=FALSE,log.p=TRUE)*log.to.log10 z$mlogp.M.AcollideB=-pchisq(fit.M.AcollideB$chisq,1,lower.tail=FALSE,log.p=TRUE)*log.to.log10 z$mlogp.M.BcollideA=-pchisq(fit.M.BcollideA$chisq,1,lower.tail=FALSE,log.p=TRUE)*log.to.log10 # z$mlogp.M.AhiddenB=-pchisq(fit.M.AhiddenB$chisq,1,lower.tail=FALSE,log.p=TRUE)*log.to.log10 z$mlogp.in.model.order=c(z$mlogp.M.AtoB,z$mlogp.M.BtoA,z$mlogp.M.conf,z$mlogp.M.AcollideB,z$mlogp.M.BcollideA) #,z$mlogp.M.AhiddenB) so=sort(z$mlogp.in.model.order,decreasing=FALSE,index.return=TRUE) z$ranked.models=so$ix z$ranked.models.mlogp=so$x # so far AcollideB and AhiddenB look the same??? # For now, then, we EXCLUDE mlogp.M.AhiddenB from the min() below. ## ## the new statistics ZPathAB, ZPathBA, z$LEO.NB.BtoA, ## BLV, PearsonCor, PearsonCorP ## z$PathAB = fit.M.AtoB$coeff[4,1] z$PathBA = fit.M.BtoA$coeff[3,1] z$SEPathAB = fit.M.AtoB$coeff[4,2] z$SEPathBA = fit.M.BtoA$coeff[3,2] z$ZPathAB = fit.M.AtoB$coeff[4,3] z$ZPathBA = fit.M.BtoA$coeff[3,3] z$PPathAB = fit.M.AtoB$coeff[4,4] z$PPathBA = fit.M.BtoA$coeff[3,4] # Prior to 4 Feb 2008, the right-hand-sides of these two were swapped. # We now switch them to create consistency with SH implementation of BLV. z$BLV.AtoB = z$zeo.M.B.given.A$BLV z$BLV.BtoA = z$zeo.M.A.given.B$BLV # for M->A->B z$alt.model.best.min.mlogp = min(z$mlogp.M.conf,z$mlogp.M.BtoA,z$mlogp.M.BcollideA,z$mlogp.M.AcollideB) # for M->A<-B z$next.best.min.rev.mlogp = min(z$mlogp.M.conf,z$mlogp.M.BtoA,z$mlogp.M.AtoB,z$mlogp.M.BcollideA) # for M->B->A z$next.best.min.M.BtoA.numerator = min(z$mlogp.M.conf, z$mlogp.M.AtoB, z$mlogp.M.AcollideB, z$mlogp.M.BcollideA) z$LEO.NB.AtoB = -(z$mlogp.M.AtoB - z$alt.model.best.min.mlogp) z$LEO.NB.AcollideB = -(z$mlogp.M.AcollideB - z$next.best.min.rev.mlogp) z$LEO.NB.BtoA = -(z$mlogp.M.BtoA - z$next.best.min.M.BtoA.numerator) z$main.model.A.to.B.mlogp=z$mlogp.M.AtoB z$p.conf.over.p.AtoB = pchisq(fit.M.conf$chisq,1,lower.tail=FALSE)/pchisq(fit.M.AtoB$chisq,1,lower.tail=FALSE) z$p.conf.over.p.BtoA = pchisq(fit.M.conf$chisq,1,lower.tail=FALSE)/pchisq(fit.M.BtoA$chisq,1,lower.tail=FALSE) # the -log10 space version of p.conf.over.p.AtoB becomes LEO.I.AtoB (I for independent (confounded) model comparison) z$LEO.I.AtoB = z$mlogp.M.conf - z$mlogp.M.AtoB z$LEO.I.BtoA = z$mlogp.M.conf - z$mlogp.M.BtoA z$LEO.O.AtoB = z$mlogp.M.AcollideB - z$mlogp.M.AtoB z$LEO.O.BtoA = z$mlogp.M.BcollideA - z$mlogp.M.BtoA # for symmetry of A->B vs B->A to work...we should compare to M.AcollideB z$eo.losem.lod = -(z$mlogp.M.AtoB - z$mlogp.M.AcollideB); z } if(exists("compare.local.sems14") ) rm(compare.local.sems14); compare.local.sems14=function(M.col,A.col,B.col,x=NULL,covx=NULL,no.obs=NULL) { if(is.null(x) && is.null(covx)) { stop("compare.local.sems() called without supply either data matrix x or covariance matrix cov. Aborting.") } if (is.null(x) && is.null(no.obs)) { stop("compare.local.sems() called with covariance matrix but without giving no.obs. Aborting.") } # necessary for AhiddenB code below to always work... if(length(A.col) !=1 || length(B.col) != 1) { stop("compare.local.sems called with illegal input: A or B had more or less than 1 variable specified.") } if (!is.null(x)) { if (is.null(no.obs)) { no.obs=nrow(x); } # allow no.obs to override nrow(x), if specified. cn=colnames(x); subx=x[,c(M.col,A.col,B.col)]; covx=cov(subx); } else { cn=colnames(covx); } dn=cn[c(M.col,A.col,B.col)]; nr=length(dn); M.empty=matrix(rep(0,nr*nr),nrow=nr,dimnames=list(dn,dn)); # adjust indices to point into new matrix M.col=match(cn[M.col],dn) A.col =match(cn[A.col],dn) B.col =match(cn[B.col],dn) # model 1 M.AtoB=M.empty; M.AtoB[M.col,A.col]=1; M.AtoB[A.col,B.col]=1; # attr(M.AtoB,"model")=1; # model 4 M.AcollideB=M.empty; M.AcollideB[M.col,A.col]=1; M.AcollideB[B.col,A.col]=1; # attr(M.AcollideB,"model")=4; sem.M.AtoB = make.ram(M.AtoB) sem.M.AcollideB = make.ram(M.AcollideB) fit.M.AtoB = summary(try.sem(pm,sem.M.AtoB$the.ram,covx,N=no.obs)) fit.M.AcollideB = summary(try.sem(pm,sem.M.AcollideB$the.ram,covx,N=no.obs)) z=list() z$title=paste(sep="","M(",paste(dn[M.col],collapse=","),"),A(",paste(dn[A.col],collapse=","),"),B(",paste(dn[B.col],collapse=","),") local SEM model comparisons to M->A->B.") z$M.AtoB=fit.M.AtoB z$M.AcollideB=fit.M.AcollideB z$mlogp.M.AtoB=-pchisq(fit.M.AtoB$chisq,1,lower.tail=FALSE,log.p=TRUE)*log.to.log10 z$mlogp.M.AcollideB=-pchisq(fit.M.AcollideB$chisq,1,lower.tail=FALSE,log.p=TRUE)*log.to.log10 # for symmetry of A->B vs B->A to work...we should compare to M.AcollideB z$eo.losem.lod = -(z$mlogp.M.AtoB - z$mlogp.M.AcollideB); z } # # minimal.sem.compare: return the LOD score for the # A.to.B versus AcollideB models as the edge score. # # This is a version of compare.local.sem that has been # stripped down for speed. # # Function is safe if M.col is a vector too... # # Either supply x directly....or give both covx and no.obs # if(exists("minimal.sem.compare") ) rm(minimal.sem.compare); minimal.sem.compare=function(M.col,A.col,B.col,x=NULL,covx=NULL,no.obs=NULL) { if(is.null(x) && is.null(covx)) { stop("minimal.sem.compare() called without supply either data matrix x or covariance matrix cov. Aborting.") } if (is.null(x) && is.null(no.obs)) { stop("minimal.sem.compare() called with covariance matrix but without giving no.obs. Aborting.") } if (!is.null(x)) { if (is.null(no.obs)) { no.obs=nrow(x); # allow no.obs to override the nrow(x), if specified. } cn=colnames(x); subx=x[,c(M.col,A.col,B.col)]; covx=cov(subx); } else { cn=colnames(covx); } dn=cn[c(M.col,A.col,B.col)]; nr=length(dn); M.empty=matrix(rep(0,nr*nr),nrow=nr,dimnames=list(dn,dn)); # adjust indices to point into new matrix M.col=match(cn[M.col],dn) A.col =match(cn[A.col],dn) B.col =match(cn[B.col],dn) # model 1 M.AtoB=M.empty; M.AtoB[M.col,A.col]=1; M.AtoB[A.col,B.col]=1; # attr(M.AtoB,"model")=1; # model 4 M.AcollideB=M.empty; M.AcollideB[M.col,A.col]=1; M.AcollideB[B.col,A.col]=1; # attr(M.AcollideB,"model")=4; sem.M.AtoB = make.ram(M.AtoB) sem.M.AcollideB = make.ram(M.AcollideB) fit.M.AtoB = summary(try.sem(pm,sem.M.AtoB$the.ram,covx,N=no.obs)) fit.M.AcollideB = summary(try.sem(pm,sem.M.AcollideB$the.ram,covx,N=no.obs)) z=list() z$title=paste(sep="","M(",paste(dn[M.col],collapse=","),"),A(",paste(dn[A.col],collapse=","),"),B(",paste(dn[B.col],collapse=","),") local SEM model comparisons to M->A->B.") z$M.AtoB=fit.M.AtoB z$M.AcollideB=fit.M.AcollideB z$mlogp.M.AtoB=-pchisq(fit.M.AtoB$chisq,1,lower.tail=FALSE,log.p=TRUE)*log.to.log10 z$mlogp.M.AcollideB=-pchisq(fit.M.AcollideB$chisq,1,lower.tail=FALSE,log.p=TRUE)*log.to.log10 # for symmetry of A->B vs B->A to work...we should compare to M.AcollideB z$eo.losem.lod = -(z$mlogp.M.AtoB - z$mlogp.M.AcollideB); z } # end minimal.sem.compare() if(exists("df.to.latex.table") ) rm(df.to.latex.table); df.to.latex.table=function(df,sig.fig=2,use.row.names=TRUE,caption="Caption here.") { nr=nrow(df); nc=ncol(df); rn=rownames(df); if (use.row.names) { rn=paste(rn,"& "); extracol=" & "; c2="c|"; } else { rn[]=""; extracol=""; c2=""; } cn=colnames(df); gap="c|"; x1 = paste(sep="","\\begin{table}\n \\centering\n\\begin{tabular}{|",c2,paste(rep(gap,nc),collapse=""),"}\n\\hline\n"); x2 = paste(sep=""," ",extracol,paste(colnames(df),collapse=" & ")," \\\\\n\\hline\n"); x=vector("list", length(nr)) for (i in 1:nr) { x[[i]]=paste(sep="",rn[i],paste(signif(df[i,],digits=sig.fig),collapse=" & ")," \\\\\n"); } x3=paste(sep="","\\hline\n\\end{tabular}\n \\caption{",caption,"}\n \\label{Ta:df.to.latex.table}\n\\end{table}\n"); y=paste(x1,x2,paste(x,collapse=""),x3,collapse="") y } # verstion that puts x (sd) after each entry if(exists("df.to.latex.table.sd") ) rm(df.to.latex.table.sd); df.to.latex.table.sd=function(df,sig.fig=2,use.row.names=TRUE,sd.table,caption="Caption here.") { df=signif(df,sig.fig); sd.table=signif(sd.table,sig.fig) new.tab=df; nr=nrow(df); nc=ncol(df); for (i in 1:nr) { for (j in 1:nc) { new.tab[i,j] = paste(sep="",as.character(df[i,j])," (",sd.table[i,j],")"); }} rn=rownames(df); if (use.row.names) { rn=paste(rn,"& "); extracol=" & "; c2="c|"; } else { rn[]=""; extracol=""; c2=""; } cn=colnames(df); gap="c|"; x1 = paste(sep="","\\begin{table}\n \\centering\n\\begin{tabular}{|",c2,paste(rep(gap,nc),collapse=""),"}\n\\hline\n"); x2 = paste(sep=""," ",extracol,paste(colnames(df),collapse=" & ")," \\\\\n\\hline\n"); x=vector("list", length(nr)) for (i in 1:nr) { x[[i]]=paste(sep="",rn[i],paste(new.tab[i,],collapse=" & ")," \\\\\n"); } x3=paste(sep="","\\hline\n\\end{tabular}\n \\caption{",caption,"}\n \\label{Ta:df.to.latex.table}\n\\end{table}\n"); y=paste(x1,x2,paste(x,collapse=""),x3,collapse="") y } # version that puts x (sd) after each entry, PLUS a 2 stand dev symbol [*] if(exists("df.to.latex.table.sd.plus.test") ) rm(df.to.latex.table.sd.plus.test); df.to.latex.table.sd.plus.test=function(df,sig.fig=2,use.row.names=TRUE,sd.table,caption="Caption here.",sd.away.zero=2,start.table.number=NA) { if(is.na(start.table.number) && !exists(".Global.table.num.df.to.latex")) { .Global.table.num.df.to.latex <<- 1; } else if(is.na(start.table.number) && exists(".Global.table.num.df.to.latex")) { .Global.table.num.df.to.latex <<- .Global.table.num.df.to.latex + 1; } else { .Global.table.num.df.to.latex <<- start.table.number; } pos.mark="[$\\heartsuit$]"; neg.mark="[$\\spadesuit$]"; df.accur = df; sd.table.accur = sd.table; df=signif(df,sig.fig); sd.table=signif(sd.table,sig.fig) new.tab=df; nr=nrow(df); nc=ncol(df); for (i in 1:nr) { for (j in 1:nc) { if (df.accur[i,j] - sd.away.zero*sd.table.accur[i,j]>0) { mark=pos.mark; } else { mark=neg.mark; } new.tab[i,j] = paste(sep="",as.character(df[i,j])," (",sd.table[i,j],") ",mark); }} rn=rownames(df); if (use.row.names) { rn=paste(rn,"& "); extracol=" & "; c2="r|"; } else { rn[]=""; extracol=""; c2=""; } cn=colnames(df); gap="r|"; # x1 = paste(sep="","\\begin{table}\n \\centering\n\\begin{tabular}{|",c2,paste(rep(gap,nc),collapse=""),"}\n\\hline\n"); x1=paste(sep="","\\begin{table}\n\\centering\n\\begin{small}\n\\begin{tabular}{|c||",paste(rep(gap,nc),collapse=""),"}\n\\hline\n\\small effect size & \\multicolumn{",nc,"}{c|}{ ($1/\\sigma^2_{\\varepsilon}$) The ratio of genetic to environmental variance.} \\\\\n\\cline{2-6}\n($\\gamma$) & "); ## & 0.1 & 0.2 & 0.5 & 1 & 2 \\\\\n\\hline"); # x2 = paste(sep=""," ",extracol,paste(colnames(df),collapse=" & ")," \\\\\n\\hline\n"); x2 = paste(sep="",paste(colnames(df),collapse=" & ")," \\\\\n\\hline\n"); x=vector("list", length(nr)) for (i in 1:nr) { x[[i]]=paste(sep="",rn[i],paste(new.tab[i,],collapse=" & ")," \\\\\n"); } x3=paste(sep="","\\hline\n\\end{tabular}\n\\end{small}\n \\caption{",caption," ",pos.mark," means $> ",sd.away.zero,"$ standard deviations above zero, else ",neg.mark,"}\n \\label{Ta:df.to.latex.table",.Global.table.num.df.to.latex,"}\n\\end{table}\n"); y=paste(x1,x2,paste(x,collapse=""),x3,collapse="") y } # version that puts x (sd) after each entry, PLUS a 2 stand dev symbol [*] if(exists("df.to.latex.typeIerror.print") ) rm(df.to.latex.typeIerror.print); df.to.latex.typeIerror.print=function(df,sig.fig=2,use.row.names=TRUE,sd.table,caption="Caption here.",sd.away.zero=2,start.table.number=NA) { if(is.na(start.table.number) && !exists(".Global.table.num.df.to.latex")) { .Global.table.num.df.to.latex <<- 1; } else if(is.na(start.table.number) && exists(".Global.table.num.df.to.latex")) { .Global.table.num.df.to.latex <<- .Global.table.num.df.to.latex + 1; } else { .Global.table.num.df.to.latex <<- start.table.number; } pos.mark="[$\\heartsuit$]"; neg.mark="[$\\spadesuit$]"; df.accur = df; sd.table.accur = sd.table; df=signif(df,sig.fig); sd.table=signif(sd.table,sig.fig) new.tab=df; nr=nrow(df); nc=ncol(df); row.head1="M-A coef" row.head2="$(\\alpha)$" col.head="M-B effect coefficient $(\\beta)$" for (i in 1:nr) { for (j in 1:nc) { if (df.accur[i,j] + 2*sd.table.accur[i,j] >2) { mark=neg.mark; } else { mark=pos.mark; } new.tab[i,j] = paste(sep="",as.character(df[i,j])," (",sd.table[i,j],") ",mark); }} rn=rownames(df); if (use.row.names) { rn=paste(rn,"& "); extracol=" & "; c2="r|"; } else { rn[]=""; extracol=""; c2=""; } cn=colnames(df); gap="r|"; # x1 = paste(sep="","\\begin{table}\n \\centering\n\\begin{tabular}{|",c2,paste(rep(gap,nc),collapse=""),"}\n\\hline\n"); x1=paste(sep="","\\begin{table}\n\\centering\n\\begin{small}\n\\begin{tabular}{|c||",paste(rep(gap,nc),collapse=""),"}\n\\hline\n\\small ",row.head1," & \\multicolumn{",nc,"}{c|}{ ",col.head,"} \\\\\n\\cline{2-6}\n",row.head2," & "); ## & 0.1 & 0.2 & 0.5 & 1 & 2 \\\\\n\\hline"); # x2 = paste(sep=""," ",extracol,paste(colnames(df),collapse=" & ")," \\\\\n\\hline\n"); x2 = paste(sep="",paste(colnames(df),collapse=" & ")," \\\\\n\\hline\n"); x=vector("list", length(nr)) for (i in 1:nr) { x[[i]]=paste(sep="",rn[i],paste(new.tab[i,],collapse=" & ")," \\\\\n"); } x3=paste(sep="","\\hline\n\\end{tabular}\n\\end{small}\n \\caption{",caption," ",neg.mark," means score $+ 2\\mbox{sd} > ",sd.away.zero,"$, else ",pos.mark,"}\n \\label{Ta:df.to.latex.table",.Global.table.num.df.to.latex,"}\n\\end{table}\n"); y=paste(x1,x2,paste(x,collapse=""),x3,collapse="") y } # if(exists("power.study") ) rm(power.study); power.study=function() { start=-.8 step=.5 fin =.8 print(paste("starting power study for step",step,date())) alpha = seq(start,fin,step); asize=length(alpha); beta = seq(start-.1,fin,step); bsize=length(beta); delta = seq(start,fin,step); dsize=length(delta); # no.obs = c(20,50,100,200,500,1000,2000); nsize=length(no.obs); N.obs=c(50,100); nsize=length(N.obs) a=array(data = NA, dim = c(nsize,asize,bsize,dsize), dimnames = list(as.character(N.obs),as.character(alpha),as.character(beta),as.character(delta))) z=list(); z$lod=a; z$mlogp.M.conf=a; z$mlogp.M.AtoB=a; z$mlogp.M.BtoA=a; z$mlogp.M.BcollideA=a; z$mlogp.M.AcollideB=a; # z$mlogp.M.AhiddenB=a; z$alt.model.best.min.mlogp=a; for (n in nsize) { for (i in 1:asize) { for (j in 1:bsize) { for (k in 1:dsize) { my.cor=matrix(nrow=3,c(1,alpha[i],beta[j],alpha[i],1,delta[k],beta[j],delta[k],1),dimnames=list(c("M","A","B"),c("M","A","B"))) b=compare.local.sems(pm=pm,1,2,3,x=NULL,covx=my.cor,no.obs=N.obs[n]); z$mlogp.M.conf[n,i,j,k]=b$mlogp.M.conf; z$mlogp.M.AtoB[n,i,j,k]=b$mlogp.M.AtoB; z$mlogp.M.BtoA[n,i,j,k]=b$mlogp.M.BtoA; z$mlogp.M.BcollideA[n,i,j,k]=b$mlogp.M.BcollideA; z$mlogp.M.AcollideB[n,i,j,k]=b$mlogp.M.AcollideB; # z$mlogp.M.AhiddenB[n,i,j,k]=b$mlogp.M.AhiddenB; z$alt.model.best.min.mlogp[n,i,j,k]=b$alt.model.best.min.mlogp; }}}} print(paste("Finishing power study for step",step,date())); z } #q=power.study() sim.model.num=function(model.num,env.var,no.samples,MAeff,ABeff,MBeff) { s2=sqrt(2) M=s2*sample(c(0,1,2), size=no.samples, prob=c(.25,.5,.25) ,replace=T ); # variance=1 if (model.num < 1 || model.num > 6) stop("model.num out of range in sim.model.num()") if(model.num==1) { A=MAeff*M+rnorm(no.samples,sd=sqrt(env.var)) B=ABeff*A+rnorm(no.samples,sd=sqrt(env.var)) } if(model.num==2) { B=MBeff*M+rnorm(no.samples,sd=sqrt(env.var)) A=ABeff*B+rnorm(no.samples,sd=sqrt(env.var)) } if(model.num==3) { B=MBeff*M+rnorm(no.samples,sd=sqrt(env.var)) A=MAeff*M+rnorm(no.samples,sd=sqrt(env.var)) } if(model.num==4) { B=rnorm(no.samples,sd=sqrt(env.var)) A=MAeff*M+ABeff*B+rnorm(no.samples,sd=sqrt(env.var)) } if(model.num==5) { A=rnorm(no.samples,sd=sqrt(env.var)) B=MBeff*M+ABeff*A+rnorm(no.samples,sd=sqrt(env.var)) } if(model.num==6) { H=rnorm(no.samples,sd=sqrt(env.var)) A=MAeff*M+H+rnorm(no.samples,sd=sqrt(env.var)) B=H+rnorm(no.samples,sd=sqrt(env.var)) } data.frame(M,A,B) } if(exists("power.study2") ) rm(power.study2); power.study2=function() { s2=sqrt(2) # simulate no.samples=400 set.seed(1) gen.var=1; env.var=1; d1=date() g.to.e = c(.1,.2,.5,1,2); effect.coef=c(2,1,.5,.25,.1); n.choice=c(50,100,200,500,1000,2000,5000); lec=length(effect.coef) lge=length(g.to.e) lnc=length(n.choice) res=c(); lod.mean=array(data = 0, dim = c(lec,lge,lnc), dimnames = list(as.character(effect.coef),as.character(g.to.e),as.character(n.choice))); lod.sd=lod.mean eo.mean=lod.mean eo.sd =lod.mean # different true models simulated... models.to.do=1 for(i in 1:models.to.do) { for (n in 1:lnc) { no.samples=n.choice[n] for (g in 1:lge) { env.var = 1/g.to.e[g] for (e in 1:lec) { ABeff=effect.coef[e] MAeff=1; #effect.coef[e] #, otherwise get a false reading on geneic:env variance MBeff=1; #effect.coef[e] num.sim=1000 model.num=1 # disc.matrix=matrix(nrow=6,ncol=6,data=0); eo.m = vector(length=num.sim); eo.m[]=0; lod.m=eo.m for (k in 1:num.sim) { mod=sim.model.num(model.num=i,env.var,no.samples,MAeff,ABeff,MBeff); covx=cov(mod) b=compare.local.sems14(1,2,3,x=NULL,covx=covx,no.obs=no.samples); best.mod=b$ranked.models[1] #disc.matrix[i,best.mod] = disc.matrix[i,b$ranked.models[1]] +1 # if (best.mod == i) { alt.mod = b$ranked.models[2] } else { alt.mod = b$ranked.models[1] } # lod= -(b$ranked.models.mlogp[i] - b$ranked.models.mlogp[alt.mod]) # lod.m[k]=lod eo.m[k]=b$eo.losem.lod } # k # lod.mean[e,g,n]=mean(lod.m) # lod.sd[e,g,n]=sd(lod.m) eo.mean[e,g,n]=mean(eo.m) eo.sd[e,g,n]=sd(eo.m) cat(paste(e,g,n,date(),"\n")) } # e } # g } # n d2=date() z=list() # z$lod.mean=lod.mean # z$lod.sd=lod.sd z$eo.mean=eo.mean z$eo.sd=eo.sd z$d1=d1 z$d2=d2 # z$lod.m=lod.m z$eo.m=eo.m save(z,file=paste(sep="","power.study2.i.is.",i,".reps.",num.sim,".rdat")) res=c(z,res); } # end i over 6 models res } # do the ZEO stuff to compare if(exists("power.study3") ) rm(power.study3); power.study3=function() { s2=sqrt(2) # simulate no.samples=400 set.seed(1) gen.var=1; env.var=1; d1=date() g.to.e = c(.1,.2,.5,1,2); effect.coef=c(2,1,.5,.25,.1); n.choice=c(50,100,200,500,1000,2000,5000); lec=length(effect.coef); lge=length(g.to.e); lnc=length(n.choice); res=c(); lod.mean=array(data = 0, dim = c(lec,lge,lnc), dimnames = list(as.character(effect.coef),as.character(g.to.e),as.character(n.choice))); lod.sd=lod.mean eo.mean=lod.mean eo.sd =lod.mean # different true models simulated... models.to.do=1 for(i in 1:models.to.do) { for (n in 1:lnc) { no.samples=n.choice[n] for (g in 1:lge) { env.var = 1/g.to.e[g] for (e in 1:lec) { ABeff=effect.coef[e] MAeff=1; #effect.coef[e] #, otherwise get a false reading on geneic:env variance MBeff=1; #effect.coef[e] num.sim=10000 model.num=1 # disc.matrix=matrix(nrow=6,ncol=6,data=0); eo.m = vector(length=num.sim); eo.m[]=0; no.obs.Z=no.samples for (k in 1:num.sim) { mod=sim.model.num(model.num=i,env.var,no.samples,MAeff,ABeff,MBeff); covx=cov(mod) # twofer.eo and n.twofer.eo hold our results... z=zeo2(1,3,2,covx,pm); # z$eo #cor1=pcor(c(1,3,c()),covx) #Zm1B= sqrt(no.obs.Z-pm$fisher.dof.cor)* fisher1(abs(cor1)) #cor2=pcor(c(1,3,2),covx) #Zm1BgivenA= sqrt(no.obs.Z-pm$fisher.dof.pcor1)* fisher1(abs(cor2)) #s=1; #eo2 = Zm1B - s*Zm1BgivenA; eo2 = z$eo; eo.m[k]=eo2; } # k eo.mean[e,g,n]=mean(eo.m) eo.sd[e,g,n]=sd(eo.m) cat(paste(e,g,n,date(),"\n")) } # e } # g } # n d2=date() z=list() z$eo.mean=eo.mean z$eo.sd=eo.sd z$d1=d1 z$d2=d2 z$eo.m=eo.m save(z,file=paste(sep="","power.study3zeo.i.is.",i,".reps.",num.sim,".rdat")) res=c(z,res); } # end i over 6 models res } # SimCR(): function used in false.positive.study1 below, to simulate # causal and reactive PC variables around the trait T # # A-> y -> B where y is the trait. # Which method is better at both detecting the causal arrow and the reactive arrow? # if(exists("SimCR") ) rm(SimCR); SimCR=function(no.samples=100,use.traitsnp=T, nAsnp=3,nBsnp=3, nTsnp=3, env.var=.5,randomize.snps=FALSE){ # for each SNP we assume Mendel's laws set.seed(1) s2=sqrt(2); SNPA=SNPB=SNPT=c(); for (i in 1:nAsnp) { SNPA=cbind(SNPA,s2*sample(c(0,1,2), size=no.samples, prob=c(.25,.5,.25) ,replace=T )); colnames(SNPA)[i]=paste(sep="","SNP.A",i);} for (i in 1:nBsnp) { SNPB=cbind(SNPB,s2*sample(c(0,1,2), size=no.samples, prob=c(.25,.5,.25) ,replace=T )); colnames(SNPB)[i]=paste(sep="","SNP.B",i);} for (i in 1:nTsnp) { SNPT=cbind(SNPT,s2*sample(c(0,1,2), size=no.samples, prob=c(.25,.5,.25) ,replace=T )); colnames(SNPT)[i]=paste(sep="","SNP.T",i);} rSNPT=SNPT; if (use.traitsnp==FALSE) { rSNPT[,]=0;} A=rnorm(no.samples,sd=env.var) for (i in 1:nAsnp) { A = A + SNPA[,i] } A=scale(A); Trait=A+rnorm(no.samples,sd=env.var) for (i in 1:nTsnp) { Trait=Trait+rSNPT[,i]; } Trait=scale(Trait); B=Trait+rnorm(no.samples,sd=env.var) for (i in 1:nBsnp) { B=B+SNPB[,i] } B=scale(B); #DANGEROUS CODE - scramble the relationship between SNPs and everything if (randomize.snps) { SNPA=sample(SNPA) SNPB=sample(SNPB) } #Trait=scale(Trait) Model1=data.frame(SNPA,SNPB,A,B,Trait) attr(Model1,"title")="SNPA -> A -> Trait -> B <- SNPB" Model1 } if(exists("false.positive.study1") ) rm(false.positive.study1); false.positive.study1=function() { s2=sqrt(2) # simulate no.samples=400 set.seed(1) gen.var=1; env.var=.5; d1=date() num.sim=3 model.num=1 # disc.matrix=matrix(nrow=6,ncol=6,data=0); eo.m = vector(length=num.sim); eo.m[]=0; lod.m=eo.m for (k in 1:num.sim) { mod=SimCR(no.samples=100,use.traitsnp=T, nAsnp=3,nBsnp=3, nTsnp=3, env.var=.5,randomize.snps=TRUE); q=neo(mod,skip.LEO=TRUE); # eo.m[k]=q$ } # k d2=date() z=list(); z$d1 = d1 z$d2 = d2 z$eo.m = eo.m save(z,file=paste(sep="","false.positive.study1.z.rdat")) } if(exists("test.variance.logic") ) rm(test.variance.logic); test.variance.logic=function() { # todo for steve # # 3 excel files, one with snps # one with details # one with just LEO.I easy stuff. # # simulate 10 snps into A, 10 into B # the noise SNPs -- simulate them as reactive to A # so that they are correlated but not causal. (very clever). # # generate roc curve plots... # for 0 <= omega/rho.ab <= 1 # # power at leo threshold should be very close to 100%, i.e. would like true positive rate to be close to 1. # # q25 = quantile(SNP, prob=.25) # q75 = quantile(SNP, prob=.75) # SNPprop = ifelse( SNPquant <= q25,0,ifelse(SNPquant <= q75,1,2)) # # -- # # try with 10 true snps, 0 false ----> 1 true, 10 false. # using the false snps as simulated as reactive to A/B so they are correlated but not causal. # Thus we simulate LD. # # todo with jake: the srebp response pathway. # # } if(exists("generate.simulation.params.old") ) rm(generate.simulation.params.old); generate.simulation.params.old=function(n.sample.points = 2) { # min.hb = min.ha = .2 # max.hb = max.ha = .8 min.hb = min.ha = .5 max.hb = max.ha = .5 num.h.points= 1 range.ha = max.ha - min.ha range.hb = max.hb - min.hb min.gamma2=0 res=data.frame() ha.choices = seq(min.ha,max.ha,range.ha/num.h.points); for (ha in ha.choices) { #ha=ha.choices[1] hb.choices = seq(min.hb,max.hb,range.hb/num.h.points); for (hb in hb.choices) { #hb=hb.choices[1] min.errB = .05 min.errA = .05 max.gamma2 = round(min(1-hb-min.errB, 1-ha- min.errA, .6),3) gamma2.choices = seq(min.gamma2,max.gamma2,(max.gamma2-min.gamma2)/n.sample.points); for (gamma2 in gamma2.choices) { # gamma2=gamma2.choices[1] gamma=sqrt(gamma2) omega.min = 0; omega.max= round(-gamma2 + sqrt(gamma^4 +1 - gamma^2 - ha - min.errA),3); if (omega.max > .6) omega.max=.6 omega.range = omega.max-omega.min; omega.choices = seq(omega.min, omega.max, omega.range/n.sample.points); for (omega in omega.choices) { #omega = omega.choices[1] eA=1-2*omega*gamma^2 - gamma^2 - omega^2 - ha; eB=1-gamma2 - hb; rho.ab=gamma2+omega q=data.frame(ha,hb,gamma2,omega,eA,eB,rho.ab) res=rbind(res,q) } } } } round(res,3) } if(exists("generate.simulation.params") ) rm(generate.simulation.params); generate.simulation.params=function() { min.hb = min.ha = .2 max.hb = max.ha = .6 # min.hb = min.ha = .5 # max.hb = max.ha = .5 num.h.points= 3 range.ha = max.ha - min.ha range.hb = max.hb - min.hb n.sample.points = 3 omega.n.sample.points = 6 min.gamma2=0 res=data.frame() ha.choices = seq(min.ha,max.ha,range.ha/num.h.points); for (ha in ha.choices) { #ha=ha.choices[1] hb.choices = seq(min.hb,max.hb,range.hb/num.h.points); for (hb in hb.choices) { #hb=hb.choices[1] min.errB = .05 min.errA = .05 omega.top = .7 omega.max = round(min(sqrt(1-ha-min.errA),omega.top),3) omega.min = 0 omega.range = omega.max-omega.min; omega.choices = seq(omega.min, omega.max, omega.range/omega.n.sample.points) for (omega in omega.choices) { percent.max.omega = round(omega/omega.max,3) max.gamma2 = round(min((1-ha-omega^2-min.errA)/(1+2*omega),1-hb-min.errB,.6),3) # max.gamma2 = round(min(1-hb-min.errB, 1-ha- min.errA, .6),3) gamma2.choices = round(seq(min.gamma2,max.gamma2,(max.gamma2-min.gamma2)/n.sample.points),3); for (gamma2 in gamma2.choices) { # gamma2=gamma2.choices[1] gamma=sqrt(gamma2) eA=1-2*omega*gamma^2 - gamma^2 - omega^2 - ha; if (eA < 0) { stop("algebra error.") } eB=1-gamma2 - hb; if (eB < 0) { stop("algebra error.") } rho.ab=gamma2+omega if (rho.ab>1) { stop("algebra error.") } q=data.frame(ha,hb,gamma2,omega,eA,eB,rho.ab,percent.max.omega) res=rbind(res,q) } } } } round(res,3) } # =================================================== # this function computes the standard error if (exists("stderr1")) rm(stderr1) stderr1 <- function(x){ sqrt( var(x,na.rm=T)/sum(!is.na(x)) ) } # =================================================== # The function err.bp is used to create error bars in a barplot # usage: err.bp(as.vector(means), as.vector(stderrs), two.side=F) err.bp<-function(daten,error,two.side=F){ if(!is.numeric(daten)) { stop("All arguments must be numeric")} if(is.vector(daten)){ xval<-(cumsum(c(0.7,rep(1.2,length(daten)-1)))) }else{ if (is.matrix(daten)){ xval<-cumsum(array(c(1,rep(0,dim(daten)[1]-1)),dim=c(1,length(daten))))+0:(length(daten)-1)+.5 }else{ stop("First argument must either be a vector or a matrix") } } MW<-0.25*(max(xval)/length(xval)) ERR1<-daten+error ERR2<-daten-error for(i in 1:length(daten)){ segments(xval[i],daten[i],xval[i],ERR1[i]) segments(xval[i]-MW,ERR1[i],xval[i]+MW,ERR1[i]) if(two.side){ segments(xval[i],daten[i],xval[i],ERR2[i]) segments(xval[i]-MW,ERR2[i],xval[i]+MW,ERR2[i]) } } } if(exists("extract.chisq.right.log10prob") ) rm(extract.chisq.right.log10prob); extract.chisq.right.log10prob=function(sem.obj) { if (!inherits(sem.obj,"summary.sem")) stop("bad object passed to extract.chisq.right.log10prob"); -pchisq(sem.obj$chisq,sem.obj$df,lower.tail=FALSE,log.p=TRUE)*log.to.log10 } if(exists("compare.single.vs.multiple.marker.simulations") ) rm(compare.single.vs.multiple.marker.simulations); compare.single.vs.multiple.marker.simulations=function(param=NULL, no.samples=200, num.reps=50,pm=neo.get.param()) { # param=generate.simulation.params.old(n.sample.points = 2) if (is.null(param)) { param=generate.simulation.params(); } else { if (is.null(colnames(param))) { colnames(param)=c("ha","hb","gamma2","omega","eA","eB","rho.ab","percent.max.omega") param=as.data.frame(param) } } N=no.samples set.seed(1) num.param=nrow(param) total.rows = num.reps*num.param param.keep=matrix(NA,nrow=total.rows,ncol=ncol(param)) colnames(param.keep)=c("ha","hb","gamma2","omega","eA","eB","rho.ab","percent.max.omega") BLV.AtoB=rep(NA,total.rows) BLV.BtoA=rep(NA,total.rows) cor.ma.b=rep(NA,total.rows) pcor.ma.b.given.a=rep(NA,total.rows) cor.mb.a=rep(NA,total.rows) pcor.mb.a.given.b=rep(NA,total.rows) path.abs.z.f1ab.AtoB=rep(NA,total.rows) path.abs.z.f1ab.BtoA=rep(NA,total.rows) path.abs.z.f1ab.AcollideB=rep(NA,total.rows) path.abs.z.f1ab.BcollideA=rep(NA,total.rows) path.abs.z.f1ba.AtoB=rep(NA,total.rows) path.abs.z.f1ba.BtoA=rep(NA,total.rows) path.abs.z.f1ba.AcollideB=rep(NA,total.rows) path.abs.z.f1ba.BcollideA=rep(NA,total.rows) path.abs.z.f2.M.M1M2.AtoB=rep(NA,total.rows) path.abs.z.f2.M.M1M2.BtoA=rep(NA,total.rows) rmsea.f1ab.AtoB=rep(NA,total.rows) rmsea.f1ab.BtoA=rep(NA,total.rows) rmsea.f1ab.conf=rep(NA,total.rows) rmsea.f1ab.AcollideB=rep(NA,total.rows) rmsea.f1ab.BcollideA=rep(NA,total.rows) rmsea.f1ba.AtoB=rep(NA,total.rows) rmsea.f1ba.BtoA=rep(NA,total.rows) rmsea.f1ba.conf=rep(NA,total.rows) rmsea.f1ba.AcollideB=rep(NA,total.rows) rmsea.f1ba.BcollideA=rep(NA,total.rows) rmsea.f2.M.M1M2.AtoB=rep(NA,total.rows) rmsea.f2.M.M1M2.BtoA=rep(NA,total.rows) rmsea.f2.M.M1M2unresolv=rep(NA,total.rows) rmsea.f2.M.M1M2hidden.con=rep(NA,total.rows) rmsea90up.f1ab.AtoB=rep(NA,total.rows) rmsea90up.f1ab.BtoA=rep(NA,total.rows) rmsea90up.f1ab.conf=rep(NA,total.rows) rmsea90up.f1ab.AcollideB=rep(NA,total.rows) rmsea90up.f1ab.BcollideA=rep(NA,total.rows) rmsea90up.f1ba.AtoB=rep(NA,total.rows) rmsea90up.f1ba.BtoA=rep(NA,total.rows) rmsea90up.f1ba.conf=rep(NA,total.rows) rmsea90up.f1ba.AcollideB=rep(NA,total.rows) rmsea90up.f1ba.BcollideA=rep(NA,total.rows) rmsea90up.f2.M.M1M2.AtoB=rep(NA,total.rows) rmsea90up.f2.M.M1M2.BtoA=rep(NA,total.rows) rmsea90up.f2.M.M1M2unresolv=rep(NA,total.rows) rmsea90up.f2.M.M1M2hidden.con=rep(NA,total.rows) chisq.f1ab.AtoB=rep(NA,total.rows) chisq.f1ab.BtoA=rep(NA,total.rows) chisq.f1ab.conf=rep(NA,total.rows) chisq.f1ab.AcollideB=rep(NA,total.rows) chisq.f1ab.BcollideA=rep(NA,total.rows) chisq.f1ba.AtoB=rep(NA,total.rows) chisq.f1ba.BtoA=rep(NA,total.rows) chisq.f1ba.conf=rep(NA,total.rows) chisq.f1ba.AcollideB=rep(NA,total.rows) chisq.f1ba.BcollideA=rep(NA,total.rows) chisq.f2.M.M1M2.AtoB=rep(NA,total.rows) chisq.f2.M.M1M2.BtoA=rep(NA,total.rows) chisq.f2.M.M1M2unresolv=rep(NA,total.rows) chisq.f2.M.M1M2hidden.con=rep(NA,total.rows) chiprob.f1ab.AtoB=rep(NA,total.rows) chiprob.f1ab.BtoA=rep(NA,total.rows) chiprob.f1ab.conf=rep(NA,total.rows) chiprob.f1ab.AcollideB=rep(NA,total.rows) chiprob.f1ab.BcollideA=rep(NA,total.rows) chiprob.f1ba.AtoB=rep(NA,total.rows) chiprob.f1ba.BtoA=rep(NA,total.rows) chiprob.f1ba.conf=rep(NA,total.rows) chiprob.f1ba.AcollideB=rep(NA,total.rows) chiprob.f1ba.BcollideA=rep(NA,total.rows) chiprob.f2.M.M1M2.AtoB=rep(NA,total.rows) chiprob.f2.M.M1M2.BtoA=rep(NA,total.rows) chiprob.f2.M.M1M2unresolv=rep(NA,total.rows) chiprob.f2.M.M1M2hidden.con=rep(NA,total.rows) res.list=vector("list", total.rows) for (i in 1:num.param) { for (j in 1:num.reps) { k=(num.reps*(i-1))+j # where to store # for (i in 1:1) { # for (j in 1:2) { # q=data.frame(ha,hb,gamma2,omega,eA,eB,rho.ab) z =list() z$ha = ha = param$ha[i] z$hb = hb = param$hb[i] z$gamma2 = gamma2 = param$gamma2[i] z$omega = omega = param$omega[i] z$eA = eA = param$eA[i] z$eB = eB = param$eB[i] z$rho.ab = rho.ab = param$rho.ab[i] param.string=paste(sep="","ha=",ha, " hb=",hb, " g2=",gamma2, " eA=",eA," eB=",eB, " rho.ab=",rho.ab, " w=",omega) # varB= gamma2 + eB + hb; # varA = TA + 2*omega*gamma*gamma + gamma*gamma + omega*omega*varB + eA; # if B is standardized to var(B)=1, then.... # std.varA = TA + gamma^2 + omega^2 + 2*gamma*gamma*omega + eA; # set == 1 # covAB = omega*TB+gamma*gamma*(1+omega)+omega*eB # covAC = gamma+ omega*gamma # covBC = gamma # cov.SNPB.A = sqrt(TB)*omega # rho.AB = gamma^2 + omega SNPA=rnorm(no.samples); SNPB=rnorm(no.samples); C=rnorm(no.samples); errA=rnorm(no.samples,sd=sqrt(eA)); errB=rnorm(no.samples,sd=sqrt(eB)); B=errB+sqrt(gamma2)*C+ sqrt(hb)*SNPB; A=errA+omega*B+sqrt(gamma2)*C+sqrt(ha)*SNPA; x=data.frame(SNPA,SNPB,A,B); z$x=x z$fit1mb = compare.local.sems(pm=pm,M.col=2,A.col=3,B.col=4,x=x,no.obs=no.samples) z$fit1ma = compare.local.sems(pm=pm,M.col=1,A.col=3,B.col=4,x=x,no.obs=no.sam