# OpWise: Identifying differentially expressed genes in bacterial microarray # experiments # # http://microbesonline.org/OpWise # Morgan N. Price, Adam P. Arkin, and Eric J. Alm, 2005 # # # SummarizeUarrayVsOperons() takes real data and produce a list with summary statistics (such as the mean # and total deviance of each gene) for input to AnalyzeUarrayVsOperons(), which # produces the p-values. See documentation for AnalyzeUarrayVsOperons() for # an explanation of its results. You can also use the output of that function as # input to OperonAgreementVsConfidencePlot() # # If you do not have replicates, then use OpWiseNoRepP() instead -- see below # # Use MapUarrayGenes() to convert data with arbitrary labels into data with VIMSS # identifiers. This is required if you are using VIMSS (MicrobesOnline) operon # predictions and are using other identifiers in the microarray data file(s). # # SummarizeUarrayVsOperons: # The data should already be transformed to log-ratios (we normally use base 2, but # you can use whatever you want). # # data should be a data frame or matrix with # 1 row per gene, 1 column per replicate, and NA for missing values # # For genomic control experiments, data2 should be the control and should be analogous, # so that muObs will be mean(data1)-mean(data2) # # If sweep is TRUE then each column is swept (the mean is subtracted first, # using genes present in all columns) # # If loess is non-null then a loess normalization is performed on the mean vs. the # sum of the log-levels. (This sum needs to be provided for # direct comparisons but not for genomic control) # If sector is non-null then a sector normalization is performed (after the loess), using # sector as a factor over rows (the median is computed within each sector, and # subtracted from each gene in the sector). # If spots is non-null then the data is over spots not over locusId (so that locusId is non-unique) # locusId should be integers (e.g., microbesonline or VIMSS ids) # # opPred should be operon predictions with columns Gene1, Gene2, and pOp # SummarizeUarrayVsOperons <- function(opPred, locusId, data1, data2=NULL, sweep=FALSE, loess=FALSE, A=NULL, sector=NULL, spots=FALSE, name=NULL) { locusInput <- locusId; if(!is.data.frame(data1)) { stop("SummarizeUarrayVsOperons: data1 is not a data frame\n"); } if(!is.null(data2) && !is.data.frame(data2)) { stop("SummarizeUarrayVsOperons: data2 is not a data frame\n"); } n1 <- apply(data1,1,function(x) countif(is.finite(x))); n2 <- if(!is.null(data2)) apply(data2,1,function(x) countif(is.finite(x))) else NULL; if(max(abs(data1),na.rm=TRUE) > 100) { warning("SummarizeUarrayVsOperons data1 argument contains extreme values!", if(is.null(data2)) " Input should be log-ratios" else " Input should be log-levels"); } if(!is.null(data2) && max(abs(data2),na.rm=TRUE) > 100) { warning("SummarizeUarrayVsOperons data2 argument contains extreme values! Input should be log-levels"); } # sweep before averaging or normalization or normalization if(sweep) { cat("Sweeping out columns\n"); good <- n1==ncol(data1); if(!is.null(data2)) good <- good & n2==ncol(data2); for (i in 1:ncol(data1)) { data1[,i] <- data1[,i] - mean(data1[good,i]); } if(!is.null(data2)) for (i in 1:ncol(data2)) { data2[,i] <- data2[,i] - mean(data2[good,i]); } } muObs <- apply(data1,1,mean,na.rm=TRUE); if(!is.null(data2)) { mu1 <- muObs; mu2 <- apply(data2,1,mean,na.rm=TRUE); cat("Comparing treatment to control\n"); muObs <- mu1 - mu2; } if (countif(!is.na(muObs)) < 10 || max(abs(muObs)) < 0.0001) { stop("No differences apparent in data (all log-ratios are 0) -- OpWise is exiting\n"); } if(loess) { cat("Loess normalization\n"); use <- !is.na(muObs); if(is.null(data2)) use <- use & !is.null(A); if(is.null(data2)) { l <- loess(muObs[use]~A[use])$fitted; } else { l <- loess(muObs[use]~I(mu1[use]+mu2[use]))$fitted; } muObs[use] <- muObs[use] - l; } if(!is.null(sector)) { cat("Performing sector-based normalization\n"); sectorMedians <- sapply(split(muObs,sector),median,na.rm=TRUE); muObs <- muObs - sectorMedians[unclass(sector)]; } ss <- function(x) sum( withoutNA(x - mean(x,na.rm=TRUE))^2 ); if(!spots) { sumsq <- apply(data1,1,ss); if (!is.null(data2)) sumsq <- sumsq + apply(data2,1,ss); } else { # aggregate everything over spots n1 <- aggregate(n1,list(locusId),sum)$x; muObs <- aggregate(muObs,list(locusId),mean,na.rm=TRUE)$x; sumsq <- aggregate(c(data1,recursive=TRUE),list(rep(locusId,ncol(data1))),ss)$x; if(!is.null(n2)) { n2 <- aggregate(n2,list(locusId),sum)$x; sumsq <- sumsq + aggregate(c(data2,recursive=TRUE),list(rep(locusId,ncol(data2))),ss)$x; mu1 <- aggregate(mu1,list(locusId),mean,na.rm=TRUE)$x; mu2 <- aggregate(mu2,list(locusId),mean,na.rm=TRUE)$x; } nSpots = table(locusId); if(countif(nSpots>20) > 1) { cat("More than 20 spots for these loci:",names(nSpots[nSpots>20]),"\n"); } locusId <- unique(sort(locusId)); cat("Aggregated over spots -- data for",length(locusId),"loci\n"); } oppairs <- merge(opPred[opPred$pOp>.5,],data.frame(index1=1:length(locusId),Gene1=locusId)); oppairs <- merge(oppairs,data.frame(index2=1:length(locusId),Gene2=locusId)); cat("Have ",nrow(oppairs),"operon pairs to calibrate reliability from\n"); data <- list(muObs=muObs,sumsq=sumsq,n1=n1,n2=n2,oppairs=oppairs,locusId=locusId, locusInput=locusInput,data1=data1,data2=data2, sweep=sweep,loess=loess,sector=sector,spots=spots,A=A); if(!is.null(n2)) { data$mu1 <- mu1; data$mu2 <- mu2; } if(!is.null(name)) { data$name <- name; } if(is.null(n2)) { cat("SummarizeUarrayVsOperons finished with data for",length(n1),"loci and up to", max(n1),"replicates\n"); } else { cat("SummarizeUarrayVsOperons finished with data for ",length(n1),"loci and up to", max(n1),"treatment replicates and",max(n2),"treatment repliactes\n"); } return(data); } # data must contain locusId, muObs, sumsq, n1, n2, and oppairs # oppairs must contain index1, index2, and either pOp (if useP) or sOp (if !useP) # If not provided, data$fit is estimated with FitUarrayData # # The fold parameter controls the "pFold" values, and the base parameter is what # base was used to convert the data to log-scale. # # In the return values, # data$stats includes the single-gene and operon-wise p-values, and is indexed by # # locusId -- gene identifier # name -- alternate gene identifier (if included as argument to SummarizeUarrayVsOperons()) # # Single-gene analyses: # stats$p0 is P(true log-ratio > 0), which is near 0 for confident down-changers # and near 1 for confident up-changers. # stats$pFold is P(true fold-change > fold) for up-changers and # P(true fold-change < -fold) for down-changers. # Values near 1 indicate confident changers. # stats$muObs is the mean of the observations for the log-ratio # stats$xShrunk is the shrunk estimate of the log-ratio # stats$xLo is the lower end of the 95% confidence interval # stats$xHi is the higher end of the 95% confidence interval # stats$var is the shrunk estimate of the variance # stats$df is the degrees of freedom # (the posterior distribution of the log-ratio for each gene # is a t distribution with mean=x, variance=var, and degrees of freedom=df; # these suffice to compute stats$p0, stats$xLo, stats$pFold, stats$xHi.) # # Operon-wise analyses -- these are analogous to the single-gene analyses # but use data from adjacent genes likely to be in the same operon as # well as for the gene itself: # stats$pow0 (like stats$p0 above) # stats$powFold (like stats$pFold above) # # data$changers describes the agreement with operons -- plots of the # adjusted agreement with operons changersPAgreesAdj, which ranges from 0 to 1, # or its confidence interval, changersPAgreesAdjConf, versus the average # confidence within each confidence group changersCMeans reveal whether # the p-values are reasonable. AnalyzeUarrayVsOperons <- function(data,debug=FALSE,nConf=8, fold=1.5,base=2,minLogRatio=log(fold)/log(base)) { if (max(data$n1) <= 1 || (!is.null(data$n2) && max(data$n2) <= 1)) { stop("Cannot use AnalyzeUarrayVsOperons on data without replicates -- use OpWiseNoRepP instead\n"); } if(is.null(data$fit)) { data$fit <- FitUarrayData(data$oppairs,data$muObs,data$sumsq,data$n1,data$n2,debug=debug); cat("AnalyzeUarrayVsOperons built model of consistency with operons\n"); } data$stats <- TestUarrayVsModelOpWise(data,data$fit); data$stats$muObs <- data$muObs; if(!is.null(data$name)) data$stats$name <- data$name; data$changers <- changersVsOperonPairs(data$locusId,logodds(data$stats$p0),data$oppairs,both=TRUE); data$changers$confidence <- logodds2p(abs(data$changers$statistic)); data$changers$cfactor <- cutN(data$changers$confidence,n=nConf); data$changersCMeans <- sapply(split(data$changers$confidence,data$changers$cfactor),mean); data$changersPAgrees <- sapply(split(data$changers$bAgree,data$changers$cfactor),mean); data$changersPAgreesConf <- t(sapply(split(data$changers$bAgree,data$changers$cfactor), function(x) t.test(x)$conf.int)); # note this uses +-1 not 1,0 data$changersPAgreesAdj <- sapply(split(data$changers$raw/data$changers$pOp,data$changers$cfactor),mean); data$changersPAgreesAdjConf <- t(sapply(split(data$changers$raw/data$changers$pOp,data$changers$cfactor), function(x) t.test(x)$conf.int)); cat("AnalyzeUarrayVsOperons completed successfully\n"); return(data); } # microarrayData should include sysName (or other "from" field) and log-ratios, for future # input to SummarizeUarrayVsOperons(). # mapping should include sysName and locusId (or other "from" and "to" fields) -- other # elements of mapping are ignored. # # Returns: a table like microarrayData, with the same row order, but with a new # field locusId (or other "to" field) MapUarrayGenes <- function(microarrayData, mapping, from="sysName", to="locusId") { nrows <- nrow(microarrayData); microarrayData <- merge(cbind(microarrayData,OWrowNumber=1:nrows), mapping[,c(from,to)], by=from,all.x=TRUE,all.y=FALSE); if(is.null(microarrayData) || nrow(microarrayData) != nrows) { stop("Error in arguments to MapUarrayGenes()"); } microarrayData <- microarrayData[order(microarrayData$OWrowNumber),]; microarrayData[is.na(microarrayData[[to]]),to] <- -microarrayData$OWrowNumber[is.na(microarrayData[[to]])]; microarrayData$OWrowNumber <- NULL; return(microarrayData); } # Generates both single-gene and operon-wise p-values # Reads data$oppair, data$muObs, data$sumsq, data$n1, and data$n2 but not other components # oppair needs index1, index2, and either pOp or sOp (depending on useP) TestUarrayVsModelOpWise <- function(data,fit,useP=TRUE,minLogRatio=log(1.5)/log(2)) { # includes p (===p0), pUp, pDown (stringent p's), mu0, V, df out <- TestUarrayVsModel(data$muObs,data$sumsq,data$n1,data$n2,fit=fit,minLogRatio=minLogRatio); pairs <- data$oppair[if(useP) data$oppair$pOp>.5 else data$oppair$sOp,]; # compute posterior probability for each operon if(useP) { if (is.null(data$n2)) { ntot <- data$n1; nn <- data$n1; } else { # genomic control ntot <- data$n1 + data$n2 - 1; nn <- data$n1*data$n2/(data$n1+data$n2); } pairs$pOpPost <- logodds2p(logodds(pairs$pOp) + log(relLik(data$muObs[pairs$index1], data$sumsq[pairs$index1], data$muObs[pairs$index2], data$sumsq[pairs$index2], a=data$fit$a, bc=data$fit$bc, c=data$fit$c, n1tot=ntot[pairs$index1], nn1=nn[pairs$index1], n2tot=ntot[pairs$index2], nn2=nn[pairs$index2]))); pairs$pOpPost[is.na(pairs$pOpPost)] <- pairs$pOp[is.na(pairs$pOpPost)]; } else { pairs$pOpPost <- rep(1,nrow(pairs)); } pairs$row <- 1:nrow(pairs); ppair <- data.frame(row=1:nrow(pairs), pp0=TestUarrayVsModelPairs(pairs,data$muObs,data$sumsq,data$n1,data$n2,fit=fit), ppUp=TestUarrayVsModelPairs(pairs,data$muObs,data$sumsq,data$n1,data$n2, fit=fit,minLogRatio=minLogRatio,mode=1), ppDown=TestUarrayVsModelPairs(pairs,data$muObs,data$sumsq,data$n1,data$n2, fit=fit,minLogRatio=minLogRatio,mode=-1)); # 1.Gene2=2.Gene1=center # index1.x,index2.x,index2.y is the sequence of indices triples <- merge(pairs,pairs,by.x="Gene2",by.y="Gene1"); if(nrow(triples) > 0) { triples$row <- 1:nrow(triples); ptriples <- data.frame( row=1:nrow(triples), ppp0=apply(triples[,c("index1.x","index2.x","index2.y")], 1, function(j) TestUarrayVsModelN(data$muObs[j],data$sumsq[j], data$n1[j], if(is.null(data$n2)) NULL else data$n2[j], fit=fit) ), pppUp=apply(triples[,c("index1.x","index2.x","index2.y")], 1, function(j) TestUarrayVsModelN(data$muObs[j],data$sumsq[j], data$n1[j], if(is.null(data$n2)) NULL else data$n2[j], fit=fit,minLogRatio=minLogRatio,mode=1) ), pppDown=apply(triples[,c("index1.x","index2.x","index2.y")], 1, function(j) TestUarrayVsModelN(data$muObs[j],data$sumsq[j], data$n1[j], if(is.null(data$n2)) NULL else data$n2[j], fit=fit,minLogRatio=minLogRatio,mode=-1) ) ); } else { ptriples <- data.frame(row=1:nrow(triples), ppp0=rep(0.5,nrow(triples)), pppUp=rep(0.5,nrow(triples)), pppDown=rep(0.5,nrow(triples)) ); } inPair1 <- merge(data.frame(index1=1:length(data$muObs)),pairs,all.x=TRUE,all.y=FALSE); inPair2 <- merge(data.frame(index2=1:length(data$muObs)),pairs,all.x=TRUE,all.y=FALSE); if (nrow(triples)>0) { inTriple <- merge(data.frame(index2.x=1:length(data$muObs)),triples,all.x=TRUE,all.y=FALSE); } else { inTriple <- data.frame(index2.x=1:length(data$muObs),row=rep(NA,length(data$muObs))); } ppair1 <- ppair[inPair1$row,]; pOpPost1 <- pairs$pOpPost[inPair1$row]; ppair2 <- ppair[inPair2$row,]; pOpPost2 <- pairs$pOpPost[inPair2$row]; ptriples <- ptriples[inTriple$row,]; pOpPost1[is.na(pOpPost1)] <- 0; pOpPost2[is.na(pOpPost2)] <- 0; for (i in c("pp0","ppUp","ppDown")) { ppair1[[i]][is.na(ppair1[[i]])] <- 0.5; ppair2[[i]][is.na(ppair2[[i]])] <- 0.5; } for (i in c("ppp0","pppUp","pppDown")) ptriples[[i]][is.na(ptriples[[i]])] <- 0.5; # for genes not in pairs, have no change # for genes in a pair but not in a triple, use P(Op)*ppair + (1-P(Op))*p # for genes in a triple, with P(OpL) and P(OpR), use all three: # Operon-wise p = P(OpL)*P(OpR)*ptriple + (1-P(OpL))*P(OpR)*ppairR + P(OpL)*(1-P(OpR))*ppairL # + (1-P(OpL))*(1-P(OpR))*p # (If useP is false than all P(Op)=1) # associated pOp will be 0 if value is not relevant pTot0 <- (pOpPost1*pOpPost2*ptriples$ppp0 + pOpPost1*(1-pOpPost2)*ppair1$pp0 + (1-pOpPost1)*pOpPost2*ppair2$pp0 + (1-pOpPost1)*(1-pOpPost2)*out$p); pTotUp <- (pOpPost1*pOpPost2*ptriples$pppUp + pOpPost1*(1-pOpPost2)*ppair1$ppUp + (1-pOpPost1)*pOpPost2*ppair2$ppUp + (1-pOpPost1)*(1-pOpPost2)*out$pUp); pTotDown <- (pOpPost1*pOpPost2*ptriples$pppDown + pOpPost1*(1-pOpPost2)*ppair1$ppDown + (1-pOpPost1)*pOpPost2*ppair2$ppDown + (1-pOpPost1)*(1-pOpPost2)*out$pDown); return(data.frame(locusId=data$locusId, p0=out$p, pFold=ifelse(is.na(out$mu0),1,ifelse(out$mu0>0,out$pUp,out$pDown)), xUnshrunk=data$muObs, xShrunk=out$mu0, xLo=out$mu0 + sqrt(out$V)*qt(0.025,df=out$df), xHi=out$mu0 + sqrt(out$V)*qt(0.975,df=out$df), var=out$V, df=out$df, pow0=pTot0, powFold=ifelse(is.na(out$mu0),1,ifelse(out$mu0>0,pTotUp,pTotDown)) )); } TestUarrayVsModelPairs <- function(pair,mu,sumsq,n1,n2=NULL,useP=TRUE,useBias=TRUE, fit=FitUarrayData(pair,mu,sumsq,n1,n2=n2,useP=useP,useBias=useBias), minLogRatio=log(1.5)/log(2), mode=0) { sapply(1:nrow(pair),function(i) { j <- c(pair$index1[i],pair$index2[i]); TestUarrayVsModelN(mu[j],sumsq[j],n1[j], if(is.null(n2)) NULL else n2[j], fit=fit, minLogRatio=minLogRatio, mode=mode); }); } # Tests data for a SINGLE group of genes believed to have matching expression patterns # (i.e., be in an operon) # All arguments except for fit should be vectors # mode should be 0 (for P(mu>0)), 1 (for P(mu>minLogRatio)), or -1 (for P(mu< -minLogRatio)) # TestUarrayVsModelN <- function(m, sumsq, n1, n2=NULL, fit=list(a=0.3, b=0.1555555,c=.28,bc=0.1,v=3.25), mode=0, minLogRatio=log(1.5)/log(2)) { ntot <- if(is.null(n2)) n1 else n1 + n2 - 1; nn <- if(is.null(n2)) n1 else 1/(1/n1 + 1/n2); # = 0 if either n1 or n2 is 0 -> no blow ups good <- (ntot >= 0) & is.finite(m); if(!any(good)) return(0.5); # no data m <- m[good]; sumsq <- sumsq[good]; ntot <- ntot[good]; nn <- nn[good]; nprime <- 1/(1/nn + 1/fit$c); # n1c, n2c, etc. in text V <- (fit$a + sum(sumsq) + sum(nprime*m^2) - (sum(nprime*m))^2/(fit$b+sum(nprime))) / ((fit$v + sum(ntot) + 1) * (fit$b + sum(nprime))); mu0 <-sum(m * nprime)/(sum(nprime) + fit$b); df <- fit$v + sum(ntot) + 1; if (mode==0) return( pt(-mu0/sqrt(V), df=df, lower=FALSE) ); if (mode>0) return( pt((minLogRatio-mu0)/sqrt(V),df=df,lower=FALSE) ); return( pt((-mu0-minLogRatio)/sqrt(V), df=df,lower=TRUE) ); } TestUarrayVsModel <- function(m, sumsq, fit=list(a=0.3, b=0.1,c=Inf,bc=0.1,v=3.25), n1=10, n2=NULL, minLogRatio=log(1.5)/log(2)) { ntot <- if(is.null(n2)) n1 else n1 + n2 - 1; nn <- if(is.null(n2)) n1 else n1 * n2/(n1 + n2); nprime <- 1/(1/nn + 1/fit$c); V <- (fit$a + m^2 * nprime * fit$b/(fit$b + nprime) + sumsq)/((fit$b + nprime)*(fit$v + ntot + 1)); mu0 <- m * nprime/(nprime + fit$b); p <- pt(-mu0/sqrt(V), df=fit$v + ntot + 1, lower=FALSE); pUp <- pt((minLogRatio-mu0)/sqrt(V),df=fit$v+ntot+1,lower=FALSE); pDown <- pt((-mu0-minLogRatio)/sqrt(V), df=fit$v+ntot+1,lower=TRUE); bad <- ntot<1 | andNoNA(nn==0); return(data.frame(p=ifelse(bad,0.5,p), pUp=ifelse(bad,1.0,pUp), pDown=ifelse(bad,1.0,pDown), mu0=mu0, V=V, df=ifelse(bad,NA,fit$v + ntot + 1))); } # This fits a linear model with systematic errors to summarized statistics. # pair described operon pairs, and must include: # pOp, index1, and index2 (or binary sOp instead of pOp if useP is FALSE) # pOp is the probability of being an operon # index1 and index2 are the row numbers of the two genes # # mu, sumsq, n1, and (for genomic control or external reference experiments) n2 are # the mean, the total squared deviance, and the number of replicate measurements for each gene. # # Returns a fit object with a (alpha),b (beta),c (gamma),bc (beta-primed), and # a test of whether bias is present (chisqC and the p-value chisqCP from the maximum # likelihood ratio test) FitUarrayData <- function(pair,mu,sumsq, n1, n2=NULL, useP=TRUE, useBias=TRUE, debug=FALSE) { if (is.null(n2)) { ntot <- n1; nn <- n1; # N in the paper } else { # genomic control ntot <- n1 + n2 - 1; # powers of sqrt(theta) in likelihood function for each gene nn <- n1*n2/(n1+n2); # multiplier of (muObs - mu)^2 in likelihood function for each gene } fit <- fitFDist(sumsq/(ntot-1),ntot-1); # sumsq depends on ntot, not on nn a <- fit$scale*fit$df2; v <- fit$df2-1; good <- andNoNA(is.finite(sumsq/(ntot-1)) & nn >= 1); meanM2 <- mean(mu[good]^2); bc <- 1/( (v-1)/a * meanM2 - mean(1/nn[good])); # starting point if(bc < 1e-6) bc <- 0.01; f <- function(x) { if(x < 1e-6) return(1e20); v <- OpUarrayLLNoBias(mu,sumsq,n1,n2,a,x,v); v <- -v; attr(v,"gradient") <- -attr(v,"gradient"); return(v); }; nlmOut <- nlm(f, bc, check.analyticals=debug); if(debug) cat("nlm bc","min",nlmOut$minimum,"est(bc)",nlmOut$estimate,"grad",nlmOut$gradient, "code",nlmOut$code,"iter",nlmOut$iterations,"\n"); bc <- nlmOut$estimate; fit <- list(a=a,bc=bc,b=bc,c=Inf,v=v); if(!is.null(pair) && bc > 1e-5 && useBias) { p <- if(useP) pair[pair$pOp>.5,] else pair[pair$sOp,]; if(!useP) p$pOp <- rep(1,nrow(p)); p <- p[good[p$index1] & good[p$index2],]; if(debug) cat("Using",nrow(p),"good operon pairs to estimate bias\n") # - sign to maximize, 4*bc a legal and plausible starting point # use 1/c as the parameter because then nlm behaves better # (numerical derivatives issue?) # # For gradient computation: f is the log likelihood if an operon # d(log(1-p+p*exp(f(c))))/dc = p*exp(f(c))*df/dc/(1-p+p*exp(f(c))) # dg/d(1/c) = dg/dc / (d(1/c)/dc) = -c^2 * df/dc = -(1/c)^-2 * df/dc relLikCInv <- function(x) { if(length(x) != 1) stop("More than 1 value"); liks <- relLik(mu[p$index1],sumsq[p$index1], mu[p$index2],sumsq[p$index2], a=a,bc=bc,c=1/x,v=v, n1tot=ntot[p$index1], n2tot=ntot[p$index2], nn1=nn[p$index1], nn2=nn[p$index2]); dllsdc <- relLikLogDeriv(mu[p$index1],sumsq[p$index1], mu[p$index2],sumsq[p$index2], a=a,bc=bc,c=1/x,v=v, n1tot=ntot[p$index1], n2tot=ntot[p$index2], nn1=nn[p$index1], nn2=nn[p$index2]); ratios <- 1-p$pOp + p$pOp*liks; out <- -sum(log(ratios)); attr(out,"gradient") <- x^-2*sum(p$pOp*liks*dllsdc/ratios); return(out); } # c=4*bc (3:1 signal:bias) is a reasonable starting point nlmOut <- nlm(relLikCInv, 1/(4*bc), check.analyticals=debug); if(debug) cat("nlm c","min",nlmOut$minimum,"est(1/c)",nlmOut$estimate,"grad",nlmOut$gradient, "code",nlmOut$code,"iter",nlmOut$iterations,"\n"); fit$c <- 1/nlmOut$estimate; fit$b <- 1/(1/fit$bc-1/fit$c); # for max. likelihood ratio test (df=1) fit$chisqC <- 2*((-nlmOut$minimum) - (-relLikCInv(0))); # c=Inf or 1/c=0 # /2 is because we are optimizing a variable at a boundary, and under the # null hypothesis, half the time the optimal estimate will be 1/c=0. # See Self and Liang 1987 J. Am. Stat. Assoc. 82:605-610 fit$chisqCP <- pchisq(fit$chisqC,df=1,lower=FALSE)/2; } return(fit); } # Assuming genes 1 and 2 are in an operon, the likelihood ratio # for the data given that they are in an operon versus if they were not # # m1 is the observed mean, ss1 is the total squared deviance, n1 is the #data points # or degrees of freedom, nn1 is the second degrees of freedom (N_1 in the text) # and similarly for gene 2 relLik <- function(m1,ss1,m2,ss2,a=0.3,bc=.1,c=0.5,v=3.25, n1tot=10,n2tot=10, nn1=n1tot, nn2=n2tot) { if(c-bc < 1e-6) return( 0 ); b <- 1/(1/bc-1/c); n1c <- 1/(1/nn1+1/c); n2c <- 1/(1/nn2+1/c); return( (a/2)^((v+1)/2-(v+1)) * sqrt(b/(b+n1c+n2c)) * 1/sqrt((1+nn1/c)*(1+nn2/c)) * sqrt((bc+nn1)*(bc+nn2))/bc * gamma((v+n1tot+n2tot+1)/2)*gamma((v+1)/2)/(gamma((v+n1tot+1)/2)*gamma((v+n2tot+1)/2)) * ((a + ss1 + m1^2*nn1*bc/(bc+nn1))/2)^((v+n1tot+1)/2) * ((a + ss2 + m2^2*nn2*bc/(bc+nn2))/2)^((v+n2tot+1)/2) / ((a + ss1 + ss2 + n1c*m1^2 + n2c*m2^2 - (m1*n1c + m2*n2c)^2/(b+n1c+n2c))/2)^((v+n1tot+n2tot+1)/2) ); } # derivative of the logarithm of relLik() with respect to c relLikLogDeriv <- function(m1,ss1,m2,ss2,a=0.3,bc=.1,c=0.5,v=3.25, n1tot=10,n2tot=10, nn1=n1tot, nn2=n2tot) { if(c-bc < 1e-6) return( NA ); b <- 1/(1/bc-1/c); n1c <- 1/(1/nn1+1/c); n2c <- 1/(1/nn2+1/c); # derivatives dbdc <- -(c/bc - 1)^-2; dn1cdc <- (1 + c/nn1)^-2; dn2cdc <- (1 + c/nn2)^-2; dcombdc <- (dn1cdc*m1^2 + dn2cdc*m2^2 - 2*(m1*n1c + m2*n2c)*(m1*dn1cdc + m2*dn2cdc)/(b+n1c+n2c) - -(m1*n1c + m2*n2c)^2/(b+n1c+n2c)^2 * (dbdc+dn1cdc+dn2cdc) )/2; return(0.5*dbdc/b - 0.5*(dbdc+dn1cdc+dn2cdc)/(b+n1c+n2c) - 0.5*(-nn1/c^2)/(1+nn1/c) - 0.5*(-nn2/c^2)/(1+nn2/c) - (v+n1tot+n2tot+1)/2 * dcombdc / ((a + ss1 + ss2 + n1c*m1^2 + n2c*m2^2 - (m1*n1c+m2*n2c)^2/(b+n1c+n2c))/2) ); } OpUarrayLLNoBias <- function(m,sumsq,n1,n2,a,b,v) { ntot <- if(is.null(n2)) n1 else n1 + n2 - 1; nn <- if(is.null(n2)) n1 else 1/(1/n1 + 1/n2); good <- (ntot >= 0) & is.finite(m); if(!any(good)) return(0); # no data m <- m[good]; sumsq <- sumsq[good]; ntot <- ntot[good]; nn <- nn[good]; ll <- sum( log(a)*(v+1)/2 + log(b)/2 - lgamma((v+1)/2) - log(2)*(v+1)/2 - (ntot+1)/2 * log(2*pi) + (log(2*pi) - log(b+nn))/2 + lgamma((v+ntot+1)/2) - ((v+ntot+1)/2) * log((a + sumsq + m^2*nn*b/(b+nn))/2) ); attr(ll,"gradient") <- 0.5*sum(1/b - 1/(b+nn) - (v+ntot+1)*m^2*nn^2/((a+sumsq)*(b+nn)^2 + m^2*nn*b*(b+nn))); return(ll); } ### Taken from ebayes.R in LIMMA, by Gordon Smyth ### Reference: G.K. Smyth (2004) "Linear Models and Empirical Bayes Methods for Assessing ### Differential Expression in Microarray Experiments." Statistical Applications in ### Genetics and Molecular Biology 3(1) Article 3. ### fitFDist <- function(x,df1) { # Moment estimation of the parameters of a scaled F-distribution # The first degrees of freedom is given # Gordon Smyth # 8 Sept 2002. Last revised 12 Apr 2003. # Remove missing or infinite values and zero degrees of freedom o <- is.finite(x) & is.finite(df1) & (x > 0) & (df1 > 0) if(any(!o)) { x <- x[o] df1 <- df1[o] } n <- length(x) if(n==0) return(list(scale=NA,df2=NA)) # Better to work on with log(F) z <- log(x) e <- z-digamma(df1/2)+log(df1/2) emean <- mean(e) evar <- mean(n/(n-1)*(e-emean)^2-trigamma(df1/2)) if(evar > 0) { df2 <- 2*trigammaInverse(evar) s20 <- exp(emean+digamma(df2/2)-log(df2/2)) } else { df2 <- Inf s20 <- exp(emean) } list(scale=s20,df2=df2) } trigammaInverse <- function(x) { # Solve trigamma(y) = x for y # Gordon Smyth # 8 Sept 2002. Last revised 12 March 2004. # Non-numeric or zero length input if(!is.numeric(x)) stop("Non-numeric argument to mathematical function") if(length(x)==0) return(numeric(0)) # Treat out-of-range values as special cases omit <- is.na(x) if(any(omit)) { y <- x if(any(!omit)) y[!omit] <- Recall(x[!omit]) return(y) } omit <- (x < 0) if(any(omit)) { y <- x y[omit] <- NaN warning("NaNs produced") if(any(!omit)) y[!omit] <- Recall(x[!omit]) return(y) } omit <- (x > 1e7) if(any(omit)) { y <- x y[omit] <- 1/sqrt(x[omit]) if(any(!omit)) y[!omit] <- Recall(x[!omit]) return(y) } omit <- (x < 1e-6) if(any(omit)) { y <- x y[omit] <- 1/x[omit] if(any(!omit)) y[!omit] <- Recall(x[!omit]) return(y) } # Newton's method # 1/trigamma(y) is convex, nearly linear and strictly > y-0.5, # so iteration to solve 1/x = 1/trigamma is monotonically convergent Rversion <- as.numeric(version$major)+as.numeric(version$minor)/10 if(Rversion > 1.81) tetraGamma <- function(x) psigamma(x,deriv=2) else tetraGamma <- tetragamma y <- 0.5+1/x iter <- 0 repeat { iter <- iter+1 tri <- trigamma(y) dif <- tri*(1-tri/x)/tetraGamma(y) y <- y+dif if(max(-dif/y) < 1e-8) break if(iter > 50) { warning("Iteration limit exceeded") break } } y } # returns a data frame of nPairs, nGenes, statistic, opAcc (the cumulative accuracy estimate), and other stats # nPairs can be > nGenes because of genes in large operons; # nPairs can be < nGenes b/c of genes not in operons # both Gene1 (the gene in the list) and Gene2 (the gene whose data is being tested -- this may # not be the same order as in the input) are also included. # # input (locusId, statistic) need not be sorted changersVsOperonPairs <- function(locusId, # vector statistic, # must be signed (for up vs. down), e.g., zScore # must include Gene1, Gene2, and pOp; including only pOp>.5 is recommended pairs, flipPairs=TRUE, # need to add reverse of pairs (Gene2,Gene1, same pOp)? window=100, # but first set of window is cumulative average useP=FALSE, # use P(Operon)? up=TRUE, both=FALSE, minAbs=0) { # only for the 2nd gene in the pair, but this can be biasing... if (both) { out1 <- changersVsOperonPairs(locusId,statistic,pairs,flipPairs=flipPairs,useP=useP,up=TRUE,minAbs=minAbs); out2 <- changersVsOperonPairs(locusId,statistic,pairs,flipPairs=flipPairs,useP=useP,up=FALSE,minAbs=minAbs); return(rbind(out1,out2)); } if(flipPairs) { pairs2 <- rbind(pairs[,c("Gene1","Gene2","pOp")], data.frame(Gene1=pairs$Gene2,Gene2=pairs$Gene1,pOp=pairs$pOp)); } else { pairs2 <- pairs[,c("Gene1","Gene2","pOp")]; } # get statistic for both pairs, remove NA and small (unconfident) values, and sort by stat1 pairs2 <- merge(pairs2,data.frame(Gene1=locusId,stat1=statistic),by="Gene1"); pairs2 <- merge(pairs2,data.frame(Gene2=locusId,stat2=statistic),by="Gene2"); pairs2 <- pairs2[!is.na(pairs2$stat1) & !is.na(pairs2$stat2),]; pairs2 <- pairs2[abs(pairs2$stat2) >= minAbs,]; sign <- if(up) 1 else -1; pairs2 <- pairs2[order(-sign*pairs2$stat1,pairs2$Gene1),]; n <- countif(sign*pairs2$stat1>0); pairs2 <- pairs2[1:n,]; out <- data.frame(nPairs=1:n); # to compute nGenes, need to sort all of them, whether or not in pairs genes <- merge(data.frame(Gene1=locusId,stat1=statistic),pairs2[,c("Gene1","Gene2")], by="Gene1", all.x=TRUE,all.y=FALSE); genes <- genes[!is.na(genes$stat1),]; genes <- genes[order(-sign*genes$stat1,genes$Gene1),]; genes <- genes[sign*genes$stat1>0,]; # counting different values nGenes <- c(1,1+cumsum(ifelse(genes$Gene1[1:(nrow(genes)-1)] != genes$Gene1[2:nrow(genes)],1,0))); out$nGenes <- nGenes[!is.na(genes$Gene2)]; # only keep rows that were in pairs2 out$statistic <- pairs2$stat1; out$bAgree <- ifelse(sign*pairs2$stat2>0,1,0); out$pOp <- pairs2$pOp; out$raw <- ifelse(out$bAgree,1,-1)/(if(useP) out$pOp else rep(1,n)); # smooth raw over window size to give opAcc if(n > window) { out$opAcc <- cumsum(out$raw); out$opAcc[(window+1):n] <- out$opAcc[(window+1):n] - out$opAcc[1:(n-window)]; out$opAcc <- out$opAcc/pmin(1:n,window); } else { out$opAcc <- cumsum(out$raw)/(1:n); } out$cumAcc <- cumsum(out$raw)/(1:n); out$Gene1 <- pairs2$Gene1; out$Gene2 <- pairs2$Gene2; return(out); } # number of times x is TRUE countif <- function(x) { return(sum(ifelse(x,1,0))); } # percentage of time that x is TRUE countPerIf <- function(x,condition=NULL) { if(!is.null(condition)) { x <- x[condition]; } return (sum(ifelse(x,1,0))/length(x)); } withoutNA <- function(x) { x[!is.na(x)]; } andNoNA <- function(x) { ifelse(is.na(x),FALSE,x) } logodds <- function(p) { return(log(p/(1-p))); } logodds2p <- function(x) { return( 1/(1+exp(-x)) ); } cutN <- function(x,n=8,...) { return(cut(x,breaks=unique(quantile(x,seq(0,1,1/n))),include.lowest=TRUE,...)); } # Plots OperonAgreementVsConfidencePlot <- function(analyzed,...) { plot(analyzed$changersCMeans,analyzed$changersPAgreesAdj,type="b",xlim=c(0.5,1),ylim=c(0,1.1), xlab="Confidence of Measurement", ylab="Agreement with Operons (Adjusted)",...); arrows(analyzed$changersCMeans,analyzed$changersPAgreesAdjConf[,1], analyzed$changersCMeans,analyzed$changersPAgreesAdjConf[,2], code=3,angle=90,col="grey") } SignificantChangersPlot <- function(analyzed, bar=0.01) { plot(1:nrow(analyzed$stats), sort(1-2*abs(analyzed$stats$pow0-.5)), type="l",log="xy", xlab="# Genes", ylab="Significance of difference from zero (two-tailed)", ylim=c(1e-5,1),xlim=c(1,1000)); lines(1:nrow(analyzed$stats), sort(1-2*abs(analyzed$stats$p0-.5)), lty=2); lines(c(1e-40,1e20),c(bar,bar),col="darkgrey"); legend(1,1,c("Single-gene","Operon-wise"),col=c(1,1),lwd=c(1,1),lty=2:1,pch=c(30,30)); } ### Simulator -- useful for verifying the correct behavior of the method ### or for identifying discrepancies between the model and the data ### ### Usually, do something like ### SimulateUarrayVsOperons(analyzed$locusId, opPred, ### fit=analyzed$fit, ### n = 3, # actual number of replicate experiments ### n2 = NULL, # or actual number of control measurements ### pNA=countPerIf(is.na(c(raw[,2:5],recursive=TRUE)))) ### ### (where 2:6 are the numbers of the data columns in the raw data) ### ### Returns -- the same as AnalyzeUarrayVsOperons() [or as SummarizeUarrayVsOperons() if ### analyze=FALSE] SimulateUarrayVsOperons <- function(locusIds, opPreds, # with Gene1, Gene2, and pOp, or NULL # standard model taken from Shewanella salt data fit=NULL, a = if(is.null(fit)) 0.3 else fit$a, bc = if(is.null(fit)) 0.1 else fit$bc, c = if(is.null(fit)) 0.28 else fit$c, v = if(is.null(fit)) 3.25 else fit$v, b = 1/(1/bc-1/c), n = 10, # number of replicates (must be constant) control=FALSE, # true if have separate control chips n2 = n, # number of replicates for control data (if not on same chip) pNA = 0.05, # proportion of missing measurements # lots of alternatives, defaulting to standard model # sd(mu) = (changer? s : s2) + sqrt(sFactor)*sigma pChange = 1.0, # by default all genes change s = 0.0, # standard deviation of true mu s2 = 0.0, # standard deviation of true mu for non-changers sFactor = 1/b, # variance ratio for changers sFactorNC = sFactor, # variance ratio for non-changers # s.d. of replicate measurements = sigmaFactor/sqrt(rgamma(shape,scale)) sigmaFactor = 1, shape=(v+1)/2, scale=2/a, sigma = sigmaFactor/sqrt(rgamma(length(locusIds),shape=shape,scale=scale)), sdBias = 0, # systematic error (sigma is between replicates) scaledBias = 1/c, # systematic error relative to sigma^2 (like sFactor) matchSigma=TRUE, # force sigma/theta to match across operon sweep=FALSE, # sweep means to be zero debug=FALSE, seed=NULL, analyze=TRUE, # run AnalyzeUarrayVsOperons estimate=TRUE, # if false, analyze with true parameters ... # for AnalyzeUarrayVsOperons ) { if (is.null(seed)) seed <- floor(runif(1)*1e9); set.seed(seed); changer <- runif(length(locusIds)) < pChange; mu <- rnorm(length(locusIds)) * (ifelse(changer, s, s2) + sigma*sqrt(ifelse(changer,sFactor,sFactorNC))); # make mu and perhaps sigma agree for all operon pairs if (!is.null(opPreds)) { oppairs <- merge(opPreds[,c("Gene1","Gene2","pOp")], data.frame(Gene1=locusIds,index1=1:length(locusIds)),by="Gene1"); oppairs <- merge(oppairs,data.frame(Gene2=locusIds,index2=1:length(locusIds)),by="Gene2"); oppairs$sOp <- runif(nrow(oppairs)) < oppairs$pOp; oppairs <- oppairs[order(oppairs$index1,oppairs$index2),]; oppairsT <- oppairs[oppairs$sOp,]; mu[oppairsT$index1] <- mu[oppairsT$index2]; if (matchSigma) sigma[oppairsT$index1] <- sigma[oppairsT$index2]; while (countif(badpair <- (mu[oppairsT$index1] != mu[oppairsT$index2])) > 0) { if(debug) cat("Syncing operon pairs, mismatch left", countif(badpair),"\n"); mu[oppairsT$index2] <- mu[oppairsT$index1]; if(matchSigma) sigma[oppairsT$index2] <- sigma[oppairsT$index1]; } } bias <- rnorm(length(locusIds)) * (sdBias + sigma*sqrt(scaledBias)); ss <- function(x) sum( withoutNA(x - mean(x,na.rm=TRUE))^2 ); if (control) { data1 <- t(sapply(1:length(locusIds), function(x) ifelse(runif(n)0) == (muObs[oppairs$index2]>0); n1 <- apply(data1, 1, function(x) countif(!is.na(x))) n2 <- if(is.null(data2)) NULL else apply(data2, 1, function(x) countif(!is.na(x))); sim <- list(mu=mu,sigma=sigma,bias=bias,muObs=muObs,sumsq=sumsq, n1=n1,n2=n2, changer=changer, oppairs=if(is.null(opPreds)) NULL else oppairs,data1=data1,data2=data2, control=control, locusId=locusIds, # summary statistics medianSS=median(sumsq,na.rm=TRUE), sdMuObs=sd(muObs,na.rm=TRUE), seed=seed, trueFit=list(a=a,bc=bc,b=b,c=c,v=v), pOpAgree=if(is.null(opPreds)) NULL else countPerIf(withoutNA(oppairs$agree[oppairs$pOp>.5]))); if(!estimate) sim$fit <- sim$trueFit; if(analyze) return(AnalyzeUarrayVsOperons(sim,debug=debug,...)); return(sim); } ## No-replicate analyses. The p-values for each gene are named as for ## replicate-based OpWise (p0 and pow0 for single-gene and operon-wise posterior ## probabilities that the true log-ratio is greater than 0; pFold and powFold for the ## single-gene and operon-wise posterior probabilities that the true change is in ## the observed direction and by fold or more. OpWiseNoRepP <- function(muObs,pairs,fit=OpWiseNoRepFit(muObs,pairs), locusId=NULL, fold=1.5,base=2,minLogRatio=log(fold)/log(base)) { sqrtV <- sqrt( (fit$a + muObs^2*fit$b/(1+fit$b)) / ((fit$v+2)*(1+fit$b)) ); mu0 <- muObs/(1+fit$b); p0 <- pt( -mu0/sqrtV, df=fit$v+2, lower=FALSE); pUp <- pt((minLogRatio-mu0)/sqrtV, df=fit$v+2, lower=FALSE); pDown <- pt((-minLogRatio-mu0)/sqrtV, df=fit$v+2, lower=TRUE); out <- data.frame(muObs=muObs,p0=p0,pUp=pUp,pDown=pDown); if(!is.null(locusId)) out$locusId<-locusId; pairs$pOpPost <- logodds2p(logodds(pairs$pOp) + OpWiseNoRepLRatio(muObs,pairs,fit$a,fit$b,fit$v)); pairs$pOpPost[is.na(pairs$pOpPost)] <- pairs$pOp[is.na(pairs$pOpPost)]; pairs$row <- 1:nrow(pairs); pairsP <- as.data.frame(t(as.matrix(sapply(pairs$row, function(i) OpWiseNoRepTestOp(muObs[c(pairs$index1[i],pairs$index2[i])], fit$a, fit$b, fit$v, minLogRatio=minLogRatio))))); names(pairsP) <- c("pp0","ppUp","ppDown"); pairs <- cbind(pairs,pairsP); triples <- merge(pairs,pairs,by.x="Gene2",by.y="Gene1"); triples$row <- 1:nrow(triples); triplesP <- as.data.frame(t(sapply(triples$row, function(i) OpWiseNoRepTestOp(muObs[c(triples$index1.x[i],triples$index2.x[i], triples$index2.y[i])], fit$a, fit$b, fit$v, minLogRatio=minLogRatio)))); names(triplesP) <- c("ppp0","pppUp","pppDown"); triples <- cbind(triples,triplesP); inPair1 <- merge(data.frame(index1=1:length(muObs)),pairs,all.x=TRUE,all.y=FALSE); inPair2 <- merge(data.frame(index2=1:length(muObs)),pairs,all.x=TRUE,all.y=FALSE); # (has to be the middle in the triple) inTriple <- merge(data.frame(index2.x=1:length(muObs)),triples,all.x=TRUE,all.y=FALSE); # ordered by muObs and expanded with NA rows for missing entries ppair1 <- pairs[inPair1$row,]; ppair2 <- pairs[inPair2$row,]; ptriple <- triples[inTriple$row,]; for(i in c("pp0","ppUp","ppDown","pOpPost")) { ppair1[[i]][is.na(ppair1[[i]])] <- 0; ppair2[[i]][is.na(ppair2[[i]])] <- 0; } for(i in c("ppp0","pppUp","pppDown")) { ptriple[[i]][is.na(ptriple[[i]])] <- 0; } out$pow0 <- (ppair1$pOpPost*ppair2$pOpPost*ptriple$ppp0 + ppair1$pOpPost*(1-ppair2$pOpPost)*ppair1$pp0 + (1-ppair1$pOpPost)*ppair2$pOpPost*ppair2$pp0 + (1-ppair1$pOpPost)*(1-ppair2$pOpPost)*out$p0); out$powUp <- (ppair1$pOpPost*ppair2$pOpPost*ptriple$pppUp + ppair1$pOpPost*(1-ppair2$pOpPost)*ppair1$ppUp + (1-ppair1$pOpPost)*ppair2$pOpPost*ppair2$ppUp + (1-ppair1$pOpPost)*(1-ppair2$pOpPost)*out$pUp); out$powDown <- (ppair1$pOpPost*ppair2$pOpPost*ptriple$pppDown + ppair1$pOpPost*(1-ppair2$pOpPost)*ppair1$ppDown + (1-ppair1$pOpPost)*ppair2$pOpPost*ppair2$ppDown + (1-ppair1$pOpPost)*(1-ppair2$pOpPost)*out$pDown); out$pFold <- ifelse(is.na(muObs),0,ifelse(out$muObs>0,out$pUp,out$pDown)); out$powFold <- ifelse(is.na(muObs),0,ifelse(out$muObs>0,out$powUp,out$powDown)); return(out); } # Given a and v, b is implied by a according to # mean(muObs^2) = a/(v-1) * (1+1/b) OpWiseNoRepLL <- function(a, v, muObs, pairs) { if(length(a) != 1 || length(v) != 1 ) stop("More than one value"); if(a < 1e-6) return(-1e20); b <- 1/(mean(muObs^2,na.rm=TRUE)*((v-1)/a) - 1); if(b < 0 || b > 1e6) return(-1e20); m1 <- muObs[pairs$index1]; m2 <- muObs[pairs$index2]; C <- 1/(2*pi) *sqrt(b/(b+2)) * (a/2)^((v+1)/2) / gamma((v+1)/2); M = a+m1^2+m2^2-(m1+m2)^2/(b+2); p12op <- C * gamma((v+3)/2) * (M/2)^(-(v+3)/2); bb <- b/(1+b); p1nop <- dt(m1*sqrt((v+1)*bb/a),df=v+1)*sqrt((v+1)*bb/a); p2nop <- dt(m2*sqrt((v+1)*bb/a),df=v+1)*sqrt((v+1)*bb/a); ppair <- (1-pairs$pOp)*p1nop*p2nop + pairs$pOp*p12op; return(sum(log(ppair))); } OpWiseNoRepLRatio <- function(muObs,pairs,a,b,v) { m1 <- muObs[pairs$index1]; m2 <- muObs[pairs$index2]; C <- 1/(2*pi) *sqrt(b/(b+2)) * (a/2)^((v+1)/2) / gamma((v+1)/2); M = a+m1^2+m2^2-(m1+m2)^2/(b+2); p12op <- C * gamma((v+3)/2) * (M/2)^(-(v+3)/2); bb <- b/(1+b); p1nop <- dt(m1*sqrt((v+1)*bb/a),df=v+1)*sqrt((v+1)*bb/a); p2nop <- dt(m2*sqrt((v+1)*bb/a),df=v+1)*sqrt((v+1)*bb/a); return(log(p12op)-log(p1nop)-log(p2nop)); } # Doesn't try to estimate v -- it must be given OpWiseNoRepFit <- function(muObs,pairs,v0=3,b0=2) { a0 <- mean(muObs^2)*(v0-1)/(1+1/b0); pairs <- pairs[pairs$pOp>=.5 & !is.na(muObs[pairs$index1]) & !is.na(muObs[pairs$index2]),]; param <- nlm(function(param) -OpWiseNoRepLL(param[1],param[2],muObs,pairs),c(a0,v0))$estimate; a <- param[1]; v <- param[2]; b <- 1/(mean(muObs^2,na.rm=TRUE)*((v-1)/a) - 1); return(data.frame(a=a,b=b,v=v,LL=OpWiseNoRepLL(a,v,muObs,pairs))); } # P(operon mu > 0| values for operon members) # also does p0,pUp,pDown # assumes that the operon is correct OpWiseNoRepTestOp <- function(mus,a,b,v,fold=1.5,base=2,minLogRatio=log(fold)/log(base)) { mus <- withoutNA(mus); if(length(mus)<1) return(c(NA,NA,NA)); mm <- mean(mus); N <- length(mus); sV <- sum((mus-mm)^2); mu0 <- mm*N/(N+b); sqrtV <- sqrt( (a + sV + mm^2*b*N/(N+b))/((v+N+1)*(N+b)) ); p0 <- pt(-mu0/sqrtV,df=v+N+1,lower=FALSE); pUp <- pt((minLogRatio-mu0)/sqrtV,df=v+N+1,lower=FALSE); pDown <- pt((-minLogRatio-mu0)/sqrtV,df=v+N+1,lower=TRUE); return(c(p0=p0,pUp=pUp,pDown=pDown)); }