Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/build/vignette.rds and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/build/vignette.rds differ
diff -Nru abn-1.2/debian/changelog abn-1.3/debian/changelog
--- abn-1.2/debian/changelog	2018-11-24 23:21:29.000000000 +0000
+++ abn-1.3/debian/changelog	2018-11-24 23:21:29.000000000 +0000
@@ -1,14 +1,21 @@
-abn (1.2-1cran1ppa0) xenial; urgency=medium
+abn (1.3-1cran1ppa0) xenial; urgency=medium
 
-  * Compilation for Ubuntu 16.04.4 LTS
+  * Compilation for Ubuntu 16.04.5 LTS
+
+ -- Michael Rutter  Sat, 24 Nov 2018 23:18:39 +0000
+
+abn (1.3-1cran1) testing; urgency=low
+
+  * cran2deb svn: 362M with DB version 1.
+
+ -- cran2deb4ubuntu  Sat, 24 Nov 2018 08:43:40 -0500
 
- -- Michael Rutter  Sat, 11 Aug 2018 21:04:21 +0000
 
 abn (1.2-1cran1) testing; urgency=low
 
   * cran2deb svn: 362M with DB version 1.
 
- -- cran2deb4ubuntu  Sat, 11 Aug 2018 09:19:14 -0400
+ -- cran2deb4ubuntu  Sat, 11 Aug 2018 09:19:26 -0400
 
 abn (1.0.2-1cran1) testing; urgency=low
diff -Nru abn-1.2/DESCRIPTION abn-1.3/DESCRIPTION
--- abn-1.2/DESCRIPTION	2018-08-10 12:40:02.000000000 +0000
+++ abn-1.3/DESCRIPTION	2018-11-22 14:30:08.000000000 +0000
@@ -1,6 +1,6 @@
 Package: abn
-Version: 1.2
-Date: 2018-07-13
+Version: 1.3
+Date: 2018-11-20
 Title: Modelling Multivariate Data with Additive Bayesian Networks
 Authors@R: c(person("Gilles", "Kratzer", role = c("aut", "cre"),
            email = "gilles.kratzer@math.uzh.ch",
@@ -21,7 +21,8 @@
 Imports: methods, rjags
 LinkingTo: Rcpp, RcppArmadillo
 Depends: nnet, Cairo, MASS, lme4, R (>= 2.15.1)
-Suggests: INLA, Rgraphviz
+Suggests: INLA, Rgraphviz, R.rsp
+VignetteBuilder: R.rsp
 Additional_repositories: https://inla.r-inla-download.org/R/stable/
 SystemRequirements: Gnu Scientific Library version >= 1.12
 Description: Bayesian network analysis is a form of probabilistic graphical model which derives from empirical data a directed acyclic graph, DAG, describing the dependency structure between random variables. An additive Bayesian network model consists of a form of DAG where each node comprises a generalized linear model, GLM. Additive Bayesian network models are equivalent to Bayesian multivariate regression using graphical modelling; they generalise the usual multivariable regression, GLM, to multiple dependent variables. 'abn' provides routines to help determine optimal Bayesian network models for a given data set, where these models are used to identify statistical dependencies in messy, complex data. The additive formulation of these models is equivalent to multivariate generalised linear modelling (including mixed models with iid random effects). The usual term to describe this model selection process is structure discovery. The core functionality is concerned with model selection - determining the most robust empirical model of data from interdependent variables. Laplace approximations are used to estimate goodness of fit metrics and model parameters, and wrappers are also included to the INLA package which can be obtained from . A comprehensive set of documented case studies, numerical accuracy/quality assurance exercises, and additional documentation are available from the 'abn' website.
@@ -30,6 +31,6 @@ URL: http://www.r-bayesian-networks.org RoxygenNote: 6.0.1 NeedsCompilation: yes -Packaged: 2018-08-10 09:44:48 UTC; gkratz +Packaged: 2018-11-22 13:17:53 UTC; Kratzer Repository: CRAN -Date/Publication: 2018-08-10 12:40:02 UTC +Date/Publication: 2018-11-22 14:30:08 UTC diff -Nru abn-1.2/inst/CITATION abn-1.3/inst/CITATION --- abn-1.2/inst/CITATION 2018-08-02 13:04:41.000000000 +0000 +++ abn-1.3/inst/CITATION 2018-11-20 09:25:04.000000000 +0000 @@ -19,7 +19,7 @@ title = "abn: an R package for modelling multivariate data using additive Bayesian networks", author = personList(as.person("Gilles Kratzer"), as.person("Marta Pittavino"), as.person("Lewis Fraser Ian"), as.person("Lewis, Fraser I")), year = "2017", - note = "R package version 1.1", + note = "R package version 1.3", url = "https://CRAN.R-project.org/package=abn", textVersion = Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/inst/doc/abn_v1.0.2.pdf and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/inst/doc/abn_v1.0.2.pdf differ diff -Nru abn-1.2/inst/doc/abn_v1.0.2.R abn-1.3/inst/doc/abn_v1.0.2.R --- abn-1.2/inst/doc/abn_v1.0.2.R 2018-08-10 09:44:48.000000000 +0000 +++ abn-1.3/inst/doc/abn_v1.0.2.R 1970-01-01 00:00:00.000000000 +0000 @@ -1,833 +0,0 @@ -### R code from vignette source 'abn_v1.0.2.Rnw' - -################################################### -### code chunk number 1: abn_v1.0.2.Rnw:176-181 (eval = FALSE) -################################################### -## library( abn) # Load the library -## -## mydat <- pigs.vienna[,-11] # Get data, drop batch variable -## -## mydat[ complete.cases(mydat),] - - -################################################### -### code chunk number 2: abn_v1.0.2.Rnw:185-186 (eval = FALSE) -################################################### -## mydat[,1] <- as.factor(mydat[,1]) - - -################################################### -### code chunk number 3: abn_v1.0.2.Rnw:189-190 (eval = FALSE) -################################################### -## levels( mydat[,1]) - - -################################################### -### code chunk number 4: abn_v1.0.2.Rnw:201-202 (eval = FALSE) -################################################### -## ban <- matrix( rep(0,dim(mydat)[2]^2),ncol=dim(mydat)[2]) - - -################################################### -### code chunk number 5: abn_v1.0.2.Rnw:205-209 (eval = FALSE) -################################################### -## colnames( ban) <- rownames(ban) <- names(mydat) -## -## retain <- matrix( rep(0,dim(mydat)[2]^2),ncol=dim(mydat)[2]) -## colnames( retain) <- rownames(retain) <- names(mydat) - - -################################################### -### code chunk number 6: abn_v1.0.2.Rnw:212-217 (eval = FALSE) -################################################### -## mydists <- list( PC="binomial", PT="binomial", MS="binomial", -## HS="binomial", TAIL="binomial", -## Abscess="binomial", Pyaemia="binomial", -## EPcat="binomial", PDcat="binomial", -## plbinary="binomial") - - -################################################### -### code chunk number 7: abn_v1.0.2.Rnw:220-223 (eval = FALSE) -################################################### -## mycache <- buildscorecache( data.df=mydat, -## data.dists=mydists, dag.banned=ban, -## dag.retained=retain, max.parents=max.par) - - -################################################### -### code chunk number 8: abn_v1.0.2.Rnw:226-232 (eval = FALSE) -################################################### -## mp.dag <- mostprobable( score.cache=mycache) -## -## fabn <- fitabn( 
dag.m=mp.dag,data.df=mydat, -## data.dists=mydists) -## -## datadir <- tempdir() - - -################################################### -### code chunk number 9: abn_v1.0.2.Rnw:235-237 (eval = FALSE) -################################################### -## save( mycache, mp.dag, fabn, file= -## paste(datadir,"mp",max.par,".RData",sep="")) - - -################################################### -### code chunk number 10: abn_v1.0.2.Rnw:251-258 (eval = FALSE) -################################################### -## load( "RData/Rout1.RData") -## ml1 <- fabn$mlik; -## -## tographviz( dag.m=mp.dag,data.df=mydat,data.dists=mydists, -## outfile="DAG_cycle.dot") -## system( "dot -Tpdf -o DAG_cycle.pdf DAG_cycle.dot") -## system( "evince DAG_cycle.pdf&") - - -################################################### -### code chunk number 11: abn_v1.0.2.Rnw:261-262 (eval = FALSE) -################################################### -## mp.mlik <- c( -44711.62,-44685.53,-44684.64,-44684.64,-44684.64) - - -################################################### -### code chunk number 12: abn_v1.0.2.Rnw:288-290 (eval = FALSE) -################################################### -## marg.f <- fitabn( dag.m=mydag,data.df=mydat,data.dists=mydists, -## compute.fixed=TRUE,n.grid=1000) - - -################################################### -### code chunk number 13: abn_v1.0.2.Rnw:298-312 (eval = FALSE) -################################################### -## library( Cairo) # Available from CRAN -## CairoPDF( "MargPlots_PigsData.pdf") -## for( i in 1:length(marg.f$marginals)){ -## cat( "processing marginals for node:", -## nom1 <- names( marg.f$marginals)[i],"\n") -## cur.node <- marg.f$marginals[i] -## # Get the marginal for current node, this is a matrix [x,f(x)] -## cur.node <- cur.node[[1]] -## # This is always [[1]] for models without random effects -## for( j in 1:length(cur.node)){ -## cat( "processing parameter:",nom2<- names(cur.node)[j],"\n") -## cur.param <- cur.node[[j]] -## plot( cur.param,type="l",main=paste(nom1,":",nom2))}} -## dev.off() - - -################################################### -### code chunk number 14: abn_v1.0.2.Rnw:326-330 (eval = FALSE) -################################################### -## marnew <- marg.f$marginals[[1]] -## -## for( i in 2: length(marg.f$marginals)){ -## marnew <- c(marnew, marg.f$marginals[[i]])} - - -################################################### -### code chunk number 15: abn_v1.0.2.Rnw:334-342 (eval = FALSE) -################################################### -## myarea <- rep( NA,length( marnew)) -## names( myarea) <- names( marnew) -## for( i in 1:length(marnew)){ -## tmp <- spline(marnew[[i]]) -## # Spline just helps make the estimation smoother -## myarea[i] <- sum(diff(tmp$x)*tmp$y[-1])} -## # Just width x height of rectangles -## cbind( myarea) - - -################################################### -### code chunk number 16: abn_v1.0.2.Rnw:345-361 (eval = FALSE) -################################################### -## library( Cairo) -## mycols <- rep("green",length(marnew)) -## mycols[1:2] <- "red"; # PC -## mycols[3] <- "orange"; # PT -## mycols[4:5] <- "yellow"; # MS -## mycols[6:7] <- "blue"; # HS -## mycols[8:9] <- "lightblue"; # TAIL -## mycols[10] <- "mistyrose"; # Abscess -## mycols[11:12] <- "green"; # Pyaemia, lightcyan -## mycols[13:14] <- "lavender"; # EPcat -## mycols[15:18] <- "cornsilk"; # PDcat -## mycols[19:22] <- "brown" ; # plbinary -## CairoPNG( "Area_PigsData.png",pointsize=10,width=720,height=640) -## par( las=2, 
mar=c(8.1,4.1,4.1,2.1)) -## barplot( myarea,ylab="Area under Density",ylim=c(0,2), col=mycols) -## dev.off() - - -################################################### -### code chunk number 17: abn_v1.0.2.Rnw:364-393 (eval = FALSE) -################################################### -## print( names(marnew)) -## # want to bind all the marginals the same nodes into a matrix -## m <- marnew; -## # PT -> PC, 1 parent, 2 params; -## PC.p <- cbind( m[["PC|(Intercept)"]], m[["PC|PT"]]) -## # PC --> NO PARENTS, 1 params; -## PT.p <- cbind( m[["PT|(Intercept)"]]) -## # HS -> MS, 1 parent, 2 params; -## MS.p <- cbind( m[["MS|(Intercept)"]], m[["MS|HS"]]) -## # EPcat -> HS, 1 parent, 2 params; -## HS.p <- cbind( m[["HS|(Intercept)"]], m[["HS|EPcat"]]) -## # PDcat -> TAIL, 1 parent, 2 params; -## TAIL.p <- cbind( m[["TAIL|(Intercept)"]], m[["TAIL|PDcat"]]) -## # Abscess --> NO PARENTS, 1 param; -## Abscess.p <- cbind( m[["Abscess|(Intercept)"]]) -## # TAIL -> Pyaemia, 1 parent, 2 params; -## Pyaemia.p <- cbind(m[["Pyaemia|(Intercept)"]],m[["Pyaemia|TAIL"]]) -## # plbinary -> EPcat, 1 parent, 2 params; -## EPcat.p <- cbind( m[["EPcat|(Intercept)"]], m[["EPcat|plbinary"]]) -## # MS, EPcat, plbinary --> PDcat, 3 parents, 4 params; -## PDcat.p <- cbind( m[["PDcat|(Intercept)"]], m[["PDcat|MS"]], -## m[["PDcat|EPcat"]], m[["PDcat|plbinary"]]) -## # PC, PT, Abscess --> plbinary, 3 parents, 4 params; -## plbinary.p <- cbind(m[["plbinary|(Intercept)"]],m[["plbinary|PC"]], -## m[["plbinary|PT"]], m[["plbinary|Abscess"]]) -## -## dump( c( "PC.p","PT.p","MS.p","HS.p","TAIL.p","Abscess.p", -## "Pyaemia.p","EPcat.p","PDcat.p","plbinary.p"), -## file=paste('Jags/',"pigs_post_params.R",sep="")) - - -################################################### -### code chunk number 18: abn_v1.0.2.Rnw:405-406 (eval = FALSE) -################################################### -## orig.data <- pigs.vienna[,-11] - - -################################################### -### code chunk number 19: abn_v1.0.2.Rnw:410-411 (eval = FALSE) -################################################### -## system("jags jags_pigs_script.R") - - -################################################### -### code chunk number 20: abn_v1.0.2.Rnw:415-422 (eval = FALSE) -################################################### -## library( coda) -## boot.data <- read.coda("out1chain1.txt","out1index.txt") -## -## boot.data <- as.data.frame(boot.data) -## for( j in 1:dim(orig.data)[2]){if(is.factor(orig.data[,j])) -## { boot.data[,j]<- as.factor(boot.data[,j]) -## levels(boot.data[,j])<- levels(orig.data[,j])}} - - -################################################### -### code chunk number 21: abn_v1.0.2.Rnw:425-438 (eval = FALSE) -################################################### -## ban <- matrix( rep(0,dim(orig.data)[2]^2), -## ncol=dim(orig.data)[2]) -## colnames( ban) <- rownames(ban) <- names(orig.data) -## -## retain <- matrix( rep(0,dim(orig.data)[2]^2), -## ncol=dim(orig.data)[2]) -## colnames( retain) <- rownames(retain) <- names(orig.data) -## -## mydists <- list( PC="binomial", PT="binomial", MS="binomial", -## HS="binomial", TAIL="binomial", -## Abscess="binomial", Pyaemia="binomial", -## EPcat="binomial", PDcat="binomial", -## plbinary="binomial") - - -################################################### -### code chunk number 22: abn_v1.0.2.Rnw:441-442 (eval = FALSE) -################################################### -## max.par <- 3; - - -################################################### -### code chunk number 23: abn_v1.0.2.Rnw:445-448 
(eval = FALSE) -################################################### -## boot1.cache <- buildscorecache( data.df=boot.data, -## data.dists=mydists, max.parents=max.par, -## dag.banned=ban, dag.retained=retain) - - -################################################### -### code chunk number 24: abn_v1.0.2.Rnw:451-455 (eval = FALSE) -################################################### -## boot1.mp <- mostprobable( score.cache=boot1.cache) -## -## datadir <- tempdir() -## save( boot1.mp,file=paste( datadir,"boot1run.RData",sep='')) - - -################################################### -### code chunk number 25: abn_v1.0.2.Rnw:518-547 (eval = FALSE) -################################################### -## mydata <- pigs.vienna[,-11] -## -## N <- 10240; -## # Write out manually, clearer than using rep() -## mydag <- matrix(c( -## # PC PT MS HS TAIL Abscess Pyaemia EPcat PDcat plbinary -## 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, -## 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -## 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, -## 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, -## 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, -## 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -## 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, -## 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, -## 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, -## 1, 1, 0, 0, 0, 1, 0, 0, 0, 0), -## byrow=TRUE,ncol=10) -## colnames(mydag) <- rownames(mydag) <- names(mydata) -## sum( mydag) # 12 arcs, as in original model, Figure 2 -## -## mydists <- list( PC="binomial", PT="binomial", MS="binomial", -## HS="binomial", TAIL="binomial", -## Abscess="binomial", Pyaemia="binomial", -## EPcat="binomial", PDcat="binomial", -## plbinary="binomial") -## -## # Use fitabn to check mydag is correct (no typos mlik = -44684.64) -## print( fitabn(dag.m=mydag,data.df=mydata,data.dists=mydists)$mlik) -## bestdag <- mydag - - -################################################### -### code chunk number 26: abn_v1.0.2.Rnw:557-569 (eval = FALSE) -################################################### -## boot.dags <- list() -## these <- grep("mp10Kboot\\d+.RData", dir()) -## num <- 1 -## for( i in dir()[these]){# Load each file -## load(i) # Provides dags, a list -## tmp <- dags[which(unlist(lapply(dags,sum))>0)] -## # Get valid entries in dags but as a list -## for( j in 1:length(tmp)){ -## # For each entry copy into boot.dags, and increment counter -## boot.dags[[num]]<- tmp[[j]]; num <- num+1 } -## rm( dags) -## } - - -################################################### -### code chunk number 27: abn_v1.0.2.Rnw:572-584 (eval = FALSE) -################################################### -## if( FALSE){ -## scores <- rep(0,length(boot.dags)) -## for(i in 1:length(boot.dags)){ -## scores[i] <- fitabn(dag.m=boot.dags[[i]],data.df=mydata, -## data.dists=mydists)$mlik -## } -## scores.b <- scores[-which(scores< -N)] -## orig.score <- fitabn(dag.m=bestdag,data.df=mydata, -## data.dists=mydists)$mlik -## plot(density(scores.b,from=min(scores.b),to=max(scores.b))) -## abline(v=orig.score,lwd=2,col="blue") -## } - - -################################################### -### code chunk number 28: abn_v1.0.2.Rnw:588-612 (eval = FALSE) -################################################### -## boot.dags.trim <- boot.dags -## for( i in 1:length(boot.dags)){ -## boot.dags.trim[[i]] <- boot.dags.trim[[i]]*bestdag } -## -## arc.freq <- lapply(boot.dags.trim,sum) -## arc.freq <- table(unlist(arc.freq)) -## library( Cairo) -## CairoPNG("PigsFreqBootRes.png",pointsize=10,width=720,height=700) -## par(las=1, mar=c(6.1,6.1,4.1,2.1)) -## barplot( arc.freq,ylab="",xlab="",col="skyblue", -## names.arg=names(arc.freq), 
ylim=c(0,2500)) -## par( las=1) -## mtext( "Number of arcs in bootstrap DAG",1,line=3,cex=1.5) -## par( las=3) -## mtext( "Frequency out of 10240",2,line=4,cex=1.5) -## dev.off() -## -## total.dag <- matrix(rep(0,dim(bestdag)[2]^2),ncol=dim(bestdag)[2]) -## colnames(total.dag) <- rownames(total.dag)<- colnames(bestdag) -## # Get the support for each arc, total.dag: -## for( i in 1:length(boot.dags)){ -## if(sum(boot.dags[[i]])>0){total.dag <- total.dag+boot.dags[[i]]}} -## total.dag <- total.dag*bestdag # We only want arcs in the best DAG -## total.dag - - -################################################### -### code chunk number 29: abn_v1.0.2.Rnw:618-621 (eval = FALSE) -################################################### -## f <- function(val,limit){ if(valj){ # Get most supported direction -## if(total.dag[i,j]>total.dag[j,i]){ -## m.i <- i; m.j <- j;} -## else {m.i <- j; m.j <- i;} -## # Does arc quality - exceed threshold of support -## if(total.dag[i,j]+total.dag[j,i]>N/2){ -## # We want this as more than 5000 -## bestdag.trim.nodir[m.i,m.j] <- 1}}}} -## -## tographviz( dag.m=bestdag.trim,data.df=mydata, -## data.dists=mydists, outfile="postbootpigs.dot") -## system( "dot -Tpdf -o postbootpigs.pdf postbootpigs.dot") -## system( "evince postbootpigs.pdf&") -## -## save( bestdag.trim,file=paste("bestdagpigs_trim.RData",sep='')) - - -################################################### -### code chunk number 31: abn_v1.0.2.Rnw:661-699 (eval = FALSE) -################################################### -## mydata <- pigs.vienna[,-11] -## -## mydists <- list( PC="binomial", PT="binomial", MS="binomial", -## HS="binomial", TAIL="binomial", -## Abscess="binomial", Pyaemia="binomial", -## EPcat="binomial", PDcat="binomial", -## plbinary="binomial") -## -## marg.f <- fitabn(dag.m=bestdag.trim,data.df=mydata, -## data.dists=mydists,compute.fixed=TRUE, -## n.grid=1000) -## -## library( Cairo) -## CairoPDF("Pigs_PostBootPlots.pdf") -## for( i in 1:length(marg.f$marginals)){ -## cat( "processing marginals for node:", -## nom1 <- names(marg.f$marginals)[i],"\n") -## cur.node <- marg.f$marginals[i] -## # Get marginal for current node, this is a matrix [x,f(x)] -## cur.node <- cur.node[[1]] -## # This is always [[1]] for models without random effects -## for( j in 1:length(cur.node)){ -## cat("processing parameter:", -## nom2 <- names(cur.node)[j],"\n") -## cur.param <- cur.node[[j]] -## plot( cur.param,type="l",main=paste(nom1,":",nom2))}} -## dev.off() -## -## marnew <- marg.f$marginals[[1]] -## for(i in 2: length(marg.f$marginals)){ -## marnew <- c(marnew, marg.f$marginals[[i]])} -## -## margs <- marnew; -## mymat <- matrix(rep(NA,length(margs)*3),ncol=3) -## rownames(mymat) <- names(margs) -## colnames(mymat) <- c("2.5%","50%","97.5%") -## ignore.me <- union(grep("\\(Int",names(margs)), -## grep("prec",names(margs))) - - -################################################### -### code chunk number 32: abn_v1.0.2.Rnw:702-715 (eval = FALSE) -################################################### -## comment <- rep("",length(margs)) -## for(i in 1:length(margs)){ -## tmp <- margs[[i]] -## tmp2 <- cumsum(tmp[,2])/sum(tmp[,2]) -## mymat[i,] <- c(tmp[which(tmp2>0.025)[1]-1,1], -## tmp[which(tmp2>0.5)[1],1], -## tmp[which(tmp2>0.975)[1],1]) -## myvec <- mymat[i,] -## if( !(i%in%ignore.me) && (myvec[1]<0 && myvec[3]>0)){ -## comment[i] <- "not sig. 
at 5%"} -## # Truncate for printing -## mymat[i,] <- as.numeric(formatC(mymat[i,],digits=3,format="f"))} -## cbind( mymat) - - -################################################### -### code chunk number 33: abn_v1.0.2.Rnw:797-803 -################################################### -library( abn) -bin.nodes <- c( 1,3,4,6,9,10,11,12,15,18,19,20,21,26,27,28,32) -var33.cat <- var33[,bin.nodes] #Categorical nodes only - -dag33 <- matrix( 0, 17, 17) -colnames( dag33) <- rownames( dag33) <- names( var33.cat)#Set names - - -################################################### -### code chunk number 34: abn_v1.0.2.Rnw:806-807 -################################################### -dag33["v11","v12"] <- 0; dag33["v11","v10"]<- 0; dag33["v4","v3"]<- 0; - - -################################################### -### code chunk number 35: abn_v1.0.2.Rnw:810-818 -################################################### -mydists.cat <- list( v1 ="binomial", v3 = "binomial", - v4 = "binomial", v6 = "binomial", v9 = "binomial", - v10 = "binomial", v11 = "binomial", v12 = "binomial", - v15 = "binomial", v18 = "binomial", v19 = "binomial", - v20 = "binomial", v21 = "binomial", v26 = "binomial", - v27 = "binomial", v28 = "binomial", v32 = "binomial") -ind.mod.cat <- fitabn( data.df=var33.cat, dag.m=dag33, - data.dists=mydists.cat, verbose=FALSE) - - -################################################### -### code chunk number 36: abn_v1.0.2.Rnw:824-825 -################################################### -ind.mod.cat$mlik - - -################################################### -### code chunk number 37: abn_v1.0.2.Rnw:830-835 -################################################### -dag33["v11","v12"] <- 1; -dag33["v11","v10"] <- 1; -dag33["v4","v3"] <- 1; -dep.mod.cat <- fitabn( data.df=var33.cat, dag.m=dag33, - data.dists=mydists.cat, verbose=FALSE) - - -################################################### -### code chunk number 38: abn_v1.0.2.Rnw:838-839 -################################################### -dep.mod.cat$mlik - - -################################################### -### code chunk number 39: abn_v1.0.2.Rnw:842-844 -################################################### -tographviz( dag=dag33, data.df=var33.cat, data.dists=mydists.cat, - outfile="mydagcat.dot", directed=TRUE) #Create file - - -################################################### -### code chunk number 40: abn_v1.0.2.Rnw:858-861 -################################################### -var33.cts <- var33[,-bin.nodes] -dag33 <- matrix( 0, 16, 16) -colnames( dag33) <- rownames( dag33) <- names( var33.cts) - - -################################################### -### code chunk number 41: abn_v1.0.2.Rnw:864-870 -################################################### -mydists.cts <- list( v2 = "gaussian", v5 = "gaussian", - v7 = "gaussian", v8 = "gaussian", v13 = "gaussian", - v14 = "gaussian", v16 = "gaussian", v17 = "gaussian", - v22 = "gaussian", v23 = "gaussian", v24 = "gaussian", - v25 = "gaussian", v29 = "gaussian", v30 = "gaussian", - v31 = "gaussian", v33 = "gaussian") - - -################################################### -### code chunk number 42: abn_v1.0.2.Rnw:873-875 -################################################### -ind.mod.cts <- fitabn( data.df=var33.cts, dag.m=dag33, - data.dists=mydists.cts, verbose=FALSE) - - -################################################### -### code chunk number 43: abn_v1.0.2.Rnw:881-882 -################################################### -ind.mod.cts$mlik - - 
-################################################### -### code chunk number 44: abn_v1.0.2.Rnw:887-892 -################################################### -dag33["v33","v31"] <- 1; -dag33["v24","v23"] <- 1; -dag33["v14","v13"] <- 1; -dep.mod.cts <- fitabn( data.df=var33.cts, dag.m=dag33, - data.dists=mydists.cts, verbose=FALSE) - - -################################################### -### code chunk number 45: abn_v1.0.2.Rnw:895-899 -################################################### -dep.mod.cts$mlik - -tographviz( dag=dag33, data.df=var33.cts, data.dists=mydists.cts, - outfile="mydagcts.dot", directed=TRUE) #Create file - - -################################################### -### code chunk number 46: abn_v1.0.2.Rnw:912-914 -################################################### -dag33 <- matrix( 0, 33, 33) -colnames( dag33) <- rownames( dag33)<- names( var33) - - -################################################### -### code chunk number 47: abn_v1.0.2.Rnw:917-929 -################################################### -mydists.mix <- list( v1 = "binomial", v2 = "gaussian", - v3 = "binomial", v4 = "binomial", v5 = "gaussian", - v6 = "binomial", v7 = "gaussian", v8 = "gaussian", - v9 = "binomial", v10 = "binomial", v11 = "binomial", - v12 = "binomial", v13 = "gaussian", v14 = "gaussian", - v15 = "binomial", v16 = "gaussian", v17 = "gaussian", - v18 = "binomial", v19 = "binomial", v20 = "binomial", - v21 = "binomial", v22 = "gaussian", v23 = "gaussian", - v24 = "gaussian", v25 = "gaussian", v26 = "binomial", - v27 = "binomial", v28 = "binomial", v29 = "gaussian", - v30 = "gaussian", v31 = "gaussian", v32 = "binomial", - v33 = "gaussian") - - -################################################### -### code chunk number 48: abn_v1.0.2.Rnw:932-934 -################################################### -ind.mod <- fitabn( data.df=var33, dag.m=dag33, - data.dists=mydists.mix, verbose=FALSE) - - -################################################### -### code chunk number 49: abn_v1.0.2.Rnw:937-938 -################################################### -ind.mod$mlik - - -################################################### -### code chunk number 50: abn_v1.0.2.Rnw:941-967 -################################################### -dag33[2,1] <- 1; -dag33[4,3] <- 1; -dag33[6,4] <- 1; dag33[6,7] <- 1; -dag33[5,6] <- 1; -dag33[7,8] <- 1; -dag33[8,9] <- 1; -dag33[9,10] <- 1; -dag33[11,10] <- 1; dag33[11,12] <- 1; dag33[11,19] <- 1; -dag33[14,13] <- 1; -dag33[17,16] <- 1;dag33[17,20] <- 1; -dag33[15,14] <- 1; dag33[15,21] <- 1; -dag33[18,20] <- 1; -dag33[19,20] <- 1; -dag33[21,20] <- 1; -dag33[22,21] <- 1; -dag33[23,21] <- 1; -dag33[24,23] <- 1; -dag33[25,23] <- 1; dag33[25,26] <- 1; -dag33[26,20] <- 1; -dag33[33,31] <- 1; -dag33[33,31] <- 1; -dag33[32,21] <- 1; dag33[32,31] <- 1; dag33[32,29] <- 1; -dag33[30,29] <- 1; -dag33[28,27] <- 1; dag33[28,29] <- 1; dag33[28,31] <- 1; -dep.mod <- fitabn( data.df=var33, dag.m=dag33, - data.dists=mydists.mix, verbose=FALSE) - - -################################################### -### code chunk number 51: abn_v1.0.2.Rnw:971-974 -################################################### -dep.mod$mlik -tographviz( dag=dag33, data.df=var33, data.dists=mydists.mix, - outfile="mydag_all.dot", directed=TRUE)#Create file - - -################################################### -### code chunk number 52: abn_v1.0.2.Rnw:996-1000 -################################################### -bin.nodes <- c( 1,3,4,6,9,10,11,12,15,18,19,20,21,26,27,28,32) -var33.cat <- var33[,bin.nodes] #Categorical 
nodes only -dag33 <- matrix( 0, 17, 17) -colnames(dag33) <- rownames(dag33) <- names(var33.cat)#Set names - - -################################################### -### code chunk number 53: abn_v1.0.2.Rnw:1003-1007 -################################################### -banned.cat <- matrix( 0, 17, 17) -colnames(banned.cat) <- rownames(banned.cat) <- names(var33.cat) -retain.cat <- matrix( 0, 17, 17) -colnames(retain.cat) <- rownames(retain.cat) <- names(var33.cat) - - -################################################### -### code chunk number 54: abn_v1.0.2.Rnw:1010-1016 -################################################### -mydists.cat <- list( v1 = "binomial", v3 = "binomial", - v4 = "binomial", v6 = "binomial", v9 = "binomial", - v10 = "binomial", v11 = "binomial", v12 = "binomial", - v15 = "binomial", v18 = "binomial", v19 = "binomial", - v20 = "binomial", v21 = "binomial", v26 = "binomial", - v27 = "binomial", v28 = "binomial", v32 = "binomial") - - -################################################### -### code chunk number 55: abn_v1.0.2.Rnw:1019-1022 -################################################### -mycache.cat <- buildscorecache( data.df=var33.cat, - data.dists=mydists.cat, dag.banned=banned.cat, - dag.retained=retain.cat, max.parents=1) - - -################################################### -### code chunk number 56: abn_v1.0.2.Rnw:1033-1036 (eval = FALSE) -################################################### -## heur.res.cat <- search.hillclimber( score.cache=mycache.cat, -## num.searches=1, seed=0, verbose=FALSE, -## trace=FALSE, timing.on=FALSE) - - -################################################### -### code chunk number 57: abn_v1.0.2.Rnw:1043-1050 -################################################### -var33.cts <- var33[,-bin.nodes] #Drop categorical nodes -dag33 <- matrix( 0, 16, 16) -colnames(dag33) <- rownames(dag33) <- names(var33.cts) #Set names -banned.cts <- matrix( 0, 16, 16) -colnames(banned.cts) <- rownames(banned.cts) <- names(var33.cts) -retain.cts <- matrix( 0, 16, 16) -colnames(retain.cts) <- rownames(retain.cts) <- names(var33.cts) - - -################################################### -### code chunk number 58: abn_v1.0.2.Rnw:1053-1059 -################################################### -mydists.cts <- list( v2 = "gaussian", v5 = "gaussian", - v7 = "gaussian", v8 = "gaussian", v13 = "gaussian", - v14 = "gaussian", v16 = "gaussian", v17 = "gaussian", - v22 = "gaussian", v23 = "gaussian", v24 = "gaussian", - v25 = "gaussian", v29 = "gaussian", v30 = "gaussian", - v31 = "gaussian", v33 = "gaussian") - - -################################################### -### code chunk number 59: abn_v1.0.2.Rnw:1063-1066 -################################################### -mycache.cts<- buildscorecache( data.df=var33.cts, - data.dists=mydists.cts, dag.banned=banned.cts, - dag.retained=retain.cts, max.parents=1) - - -################################################### -### code chunk number 60: abn_v1.0.2.Rnw:1069-1072 (eval = FALSE) -################################################### -## heur.res.cts<- search.hillclimber( score.cache=mycache.cts, -## num.searches=1, seed=0, verbose=FALSE, -## trace=FALSE, timing.on=FALSE) - - -################################################### -### code chunk number 61: abn_v1.0.2.Rnw:1080-1082 -################################################### -dag33 <- matrix( 0, 33, 33) -colnames(dag33) <- rownames(dag33) <- names(var33)#Set names - - -################################################### -### code chunk number 62: 
abn_v1.0.2.Rnw:1085-1089 -################################################### -banned.mix <- matrix( 0, 33, 33) -colnames(banned.mix) <- rownames(banned.mix) <- names(var33) -retain.mix<- matrix( 0, 33, 33) -colnames(retain.mix) <- rownames(retain.mix) <- names(var33) - - -################################################### -### code chunk number 63: abn_v1.0.2.Rnw:1092-1104 -################################################### -mydists.mix <- list( v1 = "binomial", v2 = "gaussian", - v3 = "binomial", v4 = "binomial", v5 = "gaussian", - v6 = "binomial", v7 = "gaussian", v8 = "gaussian", - v9 = "binomial", v10 = "binomial", v11 = "binomial", - v12 = "binomial", v13 = "gaussian", v14 = "gaussian", - v15 = "binomial", v16 = "gaussian", v17 = "gaussian", - v18 = "binomial", v19 = "binomial", v20 = "binomial", - v21 = "binomial", v22 = "gaussian", v23 = "gaussian", - v24 = "gaussian", v25 = "gaussian", v26 = "binomial", - v27 = "binomial", v28 = "binomial", v29 = "gaussian", - v30 = "gaussian", v31 = "gaussian", v32 = "binomial", - v33 = "gaussian") - - -################################################### -### code chunk number 64: abn_v1.0.2.Rnw:1108-1111 -################################################### -mycache.mix <- buildscorecache( data.df=var33, - data.dists=mydists.mix, dag.banned=banned.mix, - dag.retained=retain.mix, max.parents=1) - - -################################################### -### code chunk number 65: abn_v1.0.2.Rnw:1114-1117 -################################################### -heur.res.mix <- search.hillclimber( score.cache=mycache.mix, - num.searches=1, seed=0, verbose=FALSE, - trace=FALSE, timing.on=FALSE) - - -################################################### -### code chunk number 66: abn_v1.0.2.Rnw:1130-1132 -################################################### -dag33 <- matrix( 0, 33, 33) -colnames(dag33) <- rownames(dag33) <- names(var33) #Set names - - -################################################### -### code chunk number 67: abn_v1.0.2.Rnw:1135-1139 -################################################### -banned.mix <- matrix( 0, 33, 33) -colnames(banned.mix)<- rownames(banned.mix)<- names(var33) -retain.mix <- matrix( 0, 33, 33) -colnames(retain.mix) <- rownames(retain.mix)<- names(var33) - - -################################################### -### code chunk number 68: abn_v1.0.2.Rnw:1142-1155 -################################################### -mydists.mix <- list( v1 = "binomial", v2 = "gaussian", - v3 = "binomial", v4 = "binomial", v5 = "gaussian", - v6 = "binomial" , v7 = "gaussian", v8 = "gaussian", - v9 = "binomial", v10 = "binomial", v11 = "binomial", - v12 = "binomial", v13 = "gaussian", v14 = "gaussian", - v15 = "binomial", v16 = "gaussian", v17 = "gaussian", - v18 = "binomial", v19 = "binomial", v20 = "binomial", - v21 = "binomial", v22 = "gaussian", v23 = "gaussian", - v24 = "gaussian", v25 = "gaussian", v26 = "binomial", - v27 = "binomial", v28 = "binomial", v29 = "gaussian", - v30 = "gaussian", v31 = "gaussian", v32 = "binomial", - v33 = "gaussian") -n.searches <- 10 - - -################################################### -### code chunk number 69: abn_v1.0.2.Rnw:1159-1160 -################################################### -max.par <- 1 - - -################################################### -### code chunk number 70: abn_v1.0.2.Rnw:1164-1166 -################################################### -mycache.mix <- buildscorecache( data.df=var33, data.dists=mydists.mix, -dag.banned=banned.mix, dag.retained=retain.mix, max.parents=max.par) - 
- -################################################### -### code chunk number 71: abn_v1.0.2.Rnw:1170-1173 -################################################### -myres.mlp <- search.hillclimber(score.cache=mycache.mix, - num.searches=n.searches, seed=0, verbose=FALSE, - trace=FALSE, timing.on=FALSE) - - -################################################### -### code chunk number 72: abn_v1.0.2.Rnw:1185-1187 -################################################### -tographviz( dag= myres.mlp$consensus, data.df=var33, -data.dists=mydists.mix, outfile="dagcon.dot") #Create file - - diff -Nru abn-1.2/inst/doc/abn_v1.0.2.Rnw abn-1.3/inst/doc/abn_v1.0.2.Rnw --- abn-1.2/inst/doc/abn_v1.0.2.Rnw 2018-08-02 13:04:43.000000000 +0000 +++ abn-1.3/inst/doc/abn_v1.0.2.Rnw 1970-01-01 00:00:00.000000000 +0000 @@ -1,1205 +0,0 @@ - -% -*- mode: noweb; noweb-default-code-mode: R-mode; -*- -\documentclass[nojss]{jss} -%\include{graphicx} -%\usepackage[nogin]{Sweave} - -\usepackage{multicol} % This is so we can have multiple columns of text side-by-side -\columnsep=100pt % This is the amount of white space between the columns in the poster -\columnseprule=3pt % This is the thickness of the black line between the columns in the poster - -\usepackage[svgnames]{xcolor} % Specify colors by their 'svgnames', for a full list of all colors available see here: http://www.latextemplates.com/svgnames-colors - -\usepackage{times} % Use the times font -%\usepackage{palatino} % Uncomment to use the Palatino font - -% Package Imported from Exo/Make: -\usepackage{natbib} -\usepackage{calc} -\usepackage{url} -%\usepackage[colorlinks=true,pdftex]{hyperref} -\usepackage{bm} -\usepackage{rotate} - -%\usepackage{graphics} -%\usepackage{epstopdf} -\usepackage{graphicx} % Required for including images -%\graphicspath{{figures/}} % Location of the graphics files -\DeclareGraphicsExtensions{.pdf,.png,.jpg} -\usepackage{booktabs} % Top and bottom rules for table -\usepackage[font=small,labelfont=bf]{caption} % Required for specifying captions to tables and figures -\usepackage{amsfonts, amsmath, amsthm, amssymb} % For math fonts, symbols and environments -\usepackage{wrapfig} % Allows wrapping text around tables and figures - -\parindent0pt -%\DefineVerbatimEnvironment{Sinput}{Verbatim}{fontshape=sl,formatcom=\color{DarkGreen} Blue,fontsize=\relsize{-1}} -\DefineVerbatimEnvironment{Sinput}{Verbatim}{fontshape=sl, -formatcom=\color{DarkGreen}} -\newcommand{\rr}[1]{\texttt{\color{DarkGreen} #1}} -\definecolor{DarkGreen}{rgb}{0,0.39216,0} - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%% declarations for jss.cls - journal of statistical software -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\author{Gilles Kratzer, Marta Pittavino, Fraser Lewis, Reinhard Furrer} -\title{\textit{abn}: an \proglang{R} package for modelling multivariate -data using additive Bayesian networks} - -%\VignetteIndexEntry{A vignette for the abn package} -%% for pretty printing and a nice hypersummary also set: -%% \Plainauthor{, Second Author} %% comma-separated -\Plaintitle{Multidimensional Bayesian regression in R} -\Shorttitle{The \pkg{abn} package} - -\Abstract{ -This vignette describes the \proglang{R} package \pkg{abn} which provides functionality for identifying statistical dependencies in complex multivariate data using additive Bayesian network (ABN) models. 
This methodology is ideally suited for both univariate analysis - one response variable and multiple explanatory variables - and multivariate analysis, where in both cases all statistical dependencies between all variables in the data are sought. ABN models consist of directed acyclic graphs (DAGs) where each node in the graph comprises a generalized linear model. Model search algorithms are used to identify those DAG structures most supported by the data. Currently implemented are models for data comprising categorical, count and/or continuous variables. Further relevant information about \pkg{abn} can be found at: \href{http://www.r-bayesian-networks.org}{www.r-bayesian-networks.org}. -}
- -% \Keywords{\proglang{R}, Bayesian Networks, additive models, structure discovery} -% \Plainkeywords{Bayesian Networks, additive models, structure discovery} -\Keywords{Graphical models, additive models, structure learning, exact order based search, parametric bootstrapping, JAGS, MCMC, parameter learning, heuristic search} -\Plainkeywords{Graphical models, additive models, structure learning, exact order based search, parametric bootstrapping, JAGS, MCMC, parameter learning, heuristic search}
- -\Address{ - Gilles Kratzer\\ - Institute of Mathematics, University of Zurich\\ - Winterthurerstrasse 190, Zurich 8057, Switzerland\\ - E-mail: \email{gilles.kratzer@math.uzh.ch}\\ - - Marta Pittavino\\ - Institute of Mathematics, University of Zurich\\ - Winterthurerstrasse 190, Zurich 8057, Switzerland\\ - E-mail: \email{marta.pittavino@math.uzh.ch}\\ - - Fraser Ian Lewis\\ - Office of Health Economics\\ - London, United Kingdom\\ - E-mail: \email{flewis@ohe.org}\\ - - Reinhard Furrer\\ - Institute of Mathematics, University of Zurich\\ - Institute of Computational Science, University of Zurich\\ - Winterthurerstrasse 190, Zurich 8057, Switzerland\\ - E-mail: \email{reinhard.furrer@math.uzh.ch} -}
- -%% need no \usepackage{Sweave.sty} -%%\SweaveOpts{echo=FALSE,keep.source=TRUE} -%\SweaveOpts{keep.source=TRUE,echo=FALSE,eps=FALSE,pdf=TRUE,width=11.69,height=8.27,prefix.string=abn_v10} -\newcommand{\deep}{\setlength{\jot}{12pt}} -\newcommand{\nodeep}{\setlength{\jot}{3pt}} -\begin{document} -%\SweaveOpts{concordance=TRUE} -\setkeys{Gin}{width=1.0\textwidth} -%----------------------------------------------------------------------------------------
-\section{Introduction}
-Bayesian network (BN) modelling \citep{Buntine1991, HECKERMAN1995, Lauritzen1996, Jensen2001} is a form of graphical modelling which attempts to separate out indirect from direct association in complex multivariate data, a process typically referred to as structure discovery in \cite{Friedman2003}. -In recent decades, BN modelling has been widely used in biomedical science/systems biology \citep{Poon2007, Poon2008, Poon2007a, Needham2007, Dojer2006, Jansen2003, Djebbari2008, Hodges2010} to analyse multi-dimensional data. However, only in the last few years have ABN models been applied in the field of veterinary epidemiology, thanks to their ability to generalize standard regression methodologies. -Unlike other widely used multivariate approaches where dimensionality is reduced through exploiting linear combinations of random variables, such as in principal component analysis, graphical modelling does not involve any such dimension reduction.
- -BNs constitute a generic and well established \textit{data mining/machine learning} methodology, which has been demonstrated in other fields of study to be ideally suited to such analysis.
They have been developed for analysing multinomial, multivariate Gaussian or conditionally Gaussian networks (a mix of categorical and Gaussian variables), see \cite{HECKERMAN1995, Boettcher2004, Geiger1994}. -Additive Bayesian network (ABN) models are a special type of Bayesian network (BN) model, -where each node in the graph comprises a generalized linear model. -Like BN models in general, they are statistical models which derive a directed acyclic graph (DAG) from empirical data, describing the dependency structure between random variables. All types of BN models comprise two mutually dependent parts: a qualitative part (the structure) and a quantitative part (the model parameters). The DAG is the graphical representation of the joint probability distribution of all random variables in the data. The model parameters are represented by a local probability distribution for each of the variables in the network.
- -A number of libraries for fitting such BNs are available from CRAN. These types of BN have been constructed to ensure conjugacy, that is, to enable posterior distributions for the model parameters and the marginal likelihood to be calculated analytically. The purpose of \pkg{abn} is to provide a library of functions for more flexible BNs which need not rely on conjugacy, which opens up an extremely rich modelling framework but at some considerable additional computational cost.
- -Currently \pkg{abn} includes functionality for fitting non-conjugate BN models which are multi-dimensional analogues of combinations of Binomial (logistic) and Gaussian regression. It also includes models with a Poisson (log) distribution for count data, and generalised linear models with random effects (for the previous distributions). -% It is planned to extend this to include more complex distributions for overdispersed data such as beta-binomial and negative binomial.
- -The objective in BN modelling structure discovery is to perform a model search on the data to identify an optimal model. Recall that BN models have a vast search space - super-exponential in the number of nodes - and it is generally impossible to determine a globally optimal model. How best to summarize a set of locally optimal networks with different structural features is an open question, and there are a number of widely used and intuitively reasonable possibilities. For example, one option is to conduct a series of heuristic searches and then simply select the best model found \citep{HECKERMAN1995}; alternatively, a single summary network can be constructed using results across many different searches \citep{Hodges2010,Poon2007}. Otherwise, an exact search method, such as the one presented in \cite{Koivisto2004}, which performs an exhaustive search, can be used. There are obvious pros and cons to either approach; both are common in the literature and provide a good first exploration of the data.
- -For a general non-technical review of BN modelling applied in biology see \cite{Needham2007}, while a general introduction to BN modelling in veterinary epidemiology is provided by \cite{Lewis2011}. Further applications of BN to veterinary studies were described by \cite{Ward2013, Wilson2013, Sanchez-Vazquez2012}. Graphical modelling techniques have been used to analyse epidemiological data by -\cite{Firestone2013, Firestone2014, Lewis2012, Lewis2012b, Lewis2013, Schemann2013, Ludwig2013, McCormick2013}, -resulting in dozens of publications; see also the references therein.
- -In this manual we first consider a series of examples illustrating how to fit different types of models, and how to run different searches and summary analyses on a (synthetic) data set comprising 250 observations from a joint distribution over 17 categorical and 16 continuous variables, which is included as part of the \pkg{abn} library. This data set is a single realization from a network of the same structure as that presented in \cite{Lewis2011}, which is based on real data and sufficiently complex to provide a realistic example of data mining using Bayesian network modelling. -Then a fully explained case study is provided, illustrating all the important steps in conducting a data analysis using additive Bayesian networks. -Another detailed introduction and further relevant case studies about \pkg{abn} can be found at: \href{http://www.r-bayesian-networks.org}{www.r-bayesian-networks.org}.
- -\section{Pigs Case Study}
- -We present data on disease occurrence in pigs provided -by the industry body the `British Pig Health Scheme' -(BPHS). The main objective of BPHS is to improve the -productivity of pig production in the UK, and reducing -disease occurrence is a significant part of this process. -The data we consider here comprise a randomly chosen -batch of 50 pigs from each of 500 randomly chosen -pig producers in the UK. These are `finishing pigs', animals -about to enter the human food chain at an abattoir. -Each animal is assessed for the presence of a range -of different disease conditions by a specialist swine veterinarian. -We consider here the following nine disease -conditions: enzootic-pneumonia (EPcat); pleurisy (plbinary); -milk spots (MS); hepatic scarring (HS); pericarditis -(PC); peritonitis (PT); lung abscess (Abscess); tail -damage (TAIL); and papular dermatitis (PDcat). The -presence of any of these conditions results in an economic -loss to the producer, either directly, due to the -relevant infected part of the animal being removed from -the food chain, or indirectly, in cases such as enzootic-pneumonia, -which may potentially indicate poor herd -health and efficiency losses on the farm. An additional -loss, though not directly monetary, is the presence of tail -damage, which may be suggestive of welfare concerns -and may also be linked to sub-optimal production efficiency. -Milk spots and hepatic scarring result from infestation -with Ascaris suum, which is particularly important -as this is a zoonotic helminth parasite.
- -\subsection{Deciding on a search method}
- -As a very rough rule of thumb, if there are fewer than $20$ variables (and no random effects) then probably the most robust model search option is an exact search (as opposed to a heuristic one), which will identify a globally best DAG. This is then followed by parametric bootstrapping in order to assess whether the identified model contains excess structure (an adjustment for over-modelling), although the parametric bootstrapping might require access to a cluster computer to be computationally feasible. This is arguably one of the most comprehensive and reliable statistical modelling approaches for identifying an empirically justified best guess (DAG) at `nature's true model', the unknown mechanisms and processes which generated the study data.
- -\subsubsection{Order Based Searches}
-It is generally not feasible to iterate over all possible DAG structures when dealing with more than a handful of variables, hence the reliance on heuristic searches.
It is also extremely difficult to construct efficient Markov chain Monte Carlo samplers across BN structures. A solution was proposed in \cite{Friedman2003}: rather than sampling across DAGs, one samples across node orderings. A node order is simply the set of integers $1$ through $n$, where $n$ is the number of variables in the data. A DAG is consistent with an ordering if, for each node in the order, its parents come before it. For example, a DAG with only an arc from 1$\rightarrow$2 is consistent with ordering $1,2,3,4$ as the parent $1$ comes before $2$, but a DAG with an arc from 2$\rightarrow$1 is not consistent with this ordering. In effect, each order is a collection of DAGs, and note that each DAG may be consistent with multiple orders, e.g. the empty DAG is consistent with every possible ordering. This introduces bias, in that averaging across orders need not give the same results as averaging across DAGs, if the latter were possible. This is relevant when estimating posterior probabilities of individual structural features, and is biased towards more parsimonious features as they are consistent with more orders. Note that this bias does not apply to maximising across orders, as in finding most probable structures (see later). The big advantage of searching across orders is that there are only $n!$ different orders compared to a reasonably tight upper bound of $2^{ n \choose 2 }$ for different DAGs.
- -There are (at least) two approaches for searching across orders. The first is to construct a Markov chain which samples from the posterior distribution of all orders, and is the approach presented in \citet{Friedman2003}. Alternatively, in \cite{Koivisto2004} an exact method is proposed which, rather than sampling across orders, performs an exhaustive search. This has the advantage that it can also be used to find the globally optimal DAG of the data - the most probable structure - as well as posterior probabilities for structural features, such as individual arcs. The drawback is that this exact approach is only feasible with a smaller number of variables, e.g. up to 12 or 13 when dealing with additive models. For the code provided in \pkg{abn} this exact approach is readily feasible up to 20 variables using typical desktop computing, and potentially up to 25 variables with access to a shared memory cluster computer.
- -\subsubsection{Most Probable Structure}
-Using the exact order based method due to \cite{Koivisto2004} it is also possible to identify the DAG with the globally best network score. Identification of a most probable structure is split into two parts. Firstly we calculate a cache of individual node scores, for example using~\rr{buildscorecache}. Next, an exhaustive order based structural search is carried out using the function \rr{mostprobable}, which relies on the information in the node cache.
- -As in the heuristic searches it is possible to ban or retain certain arcs, for example when splitting multinomial variables. This is done in the node cache computation step. There are two different structural priors implemented in this search: the first is the uniform prior, where all structures (all parent combinations) are equally likely. This is the default \rr{prior.choice=1} in \rr{mostprobable} and the other functions. Also implemented is the prior used in \cite{Koivisto2004} where all parent combinations of equal cardinality are equally weighted; this is \rr{prior.choice=2}.
The latter does give the same prior weight to a parent combination with no parents and a parent combination comprising all possible parents (since there is only one choice of each: ${n-1 \choose 0}$ and ${n-1 \choose n-1}$). This may not be desirable, but it is included as a prior for completeness. Note that the order based search is exact in the sense that it will identify a DAG whose score is equal to the best possible score were it possible to search exhaustively across all DAGs. For example, if using \rr{prior.choice=1} then the network found should have a score greater than or equal to that found using the previously described heuristic searches. The structure found need not be unique, in that others may exist with the same globally optimal score; the order based search is only guaranteed to find one such structure.
- -To calculate the most probable structure we again use \rr{buildscorecache()} to calculate a cache of individual node scores. Next, the function \rr{mostprobable()} does the actual exhaustive order based search, and works for both conjugate and additive models since, as with calculating -the posterior probabilities, this step only involves structural searching and is not concerned with the precise parameterisation of each BN model.
- -\subsection{Preparing the data}
-There are two main things which need to be checked in the data before it can be used with any of the abn model fitting functions.
-\begin{itemize}
-\item All records with missing values must either be removed or else have the missing values completed. There are a range of libraries available from CRAN for completing missing values using approaches such as multiple imputation. Ideally, marginalising over the missing values is preferable (as opposed to completing them, as this then results in essentially dealing with models of models), but this is far from trivial here and not yet (and may never be) implemented in abn. To remove all records with one or more missing values, code similar to the following probably suffices:
-<>=
-library( abn) # Load the library
-
-mydat <- pigs.vienna[,-11] # Get data, drop batch variable
-
-mydat <- mydat[ complete.cases(mydat),] # Keep complete records only
-@
-N.b. this is not actually needed with pigs.vienna.
-\item All variables which are to be treated as binary must be coerced to factors. To coerce an existing variable into a factor:
-<>=
-mydat[,1] <- as.factor(mydat[,1])
-@
-This coerces the first variable in the data.frame \rr{mydat}. The levels (labels of the factor) can be anything provided there are only two, and a ``success'' here is taken to be the second level, i.e. the second value in the vector returned by:
-<>=
-levels( mydat[,1])
-@
-\end{itemize}
-To include additional variables in the modelling, for example interaction terms or polynomials, these must be created manually and included in the data.frame just like any other variable.
- -\subsection{Initial searches for an optimal model}
-Below is some R code which will perform an exact search. We want to find the DAG with the best goodness of fit (network score - log marginal likelihood), and ideally we would search for this without any a priori complexity limit (maximum number of parents). However, this may be computationally infeasible and also highly inefficient. For example, with $25000$ observations, is it really realistic to consider models with up to $9$ covariates per node?
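Some quick arithmetic shows why a parent limit matters so much here: with $n$ variables, each node has ${n-1 \choose k}$ possible parent combinations of size $k$, all of which must be scored and cached. A back-of-the-envelope count in plain base R (purely illustrative, not needed for the analysis itself) makes the growth explicit:
<>=
n <- 10 # e.g. pigs.vienna less the batch variable
# cumulative number of parent combinations per node,
# for parent limits 0 through n-1
sapply( 0:(n-1), function(k) sum(choose(n-1, 0:k)))
@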
- -One approach is to start off with an a priori limit of one parent per node, find the best DAG, and then repeat an identical search process (again using the functions \rr{buildscorecache} and \rr{mostprobable}) with a parent limit of $2$, and so on, stopping when the maximal DAG found no longer changes as the complexity limit is relaxed (increased). The parent limit \rr{max.par} is not hard-coded in the code below; it can be varied from one up to higher parent limits. These initial searches can easily be automated: in the library subdirectory \rr{system.file('bootstrapping\_example', -package='abn')} there is a script file \rr{initsearch.bash} which performs the initial search described above in an automated way, progressively increasing the number of parents from $1$ to $5$ (for this specific example). The run time for the mostprobable function increases very considerably with the number of parents.
-<>=
-ban <- matrix( rep(0,dim(mydat)[2]^2),ncol=dim(mydat)[2])
-@
-The ban and retain matrices must have their names set:
-<>=
-colnames( ban) <- rownames(ban) <- names(mydat)
-
-retain <- matrix( rep(0,dim(mydat)[2]^2),ncol=dim(mydat)[2])
-colnames( retain) <- rownames(retain) <- names(mydat)
-@
-Set up the distribution list for each node:
-<>=
-mydists <- list( PC="binomial", PT="binomial", MS="binomial", - HS="binomial", TAIL="binomial", - Abscess="binomial", Pyaemia="binomial", - EPcat="binomial", PDcat="binomial", - plbinary="binomial")
-@
-Build a cache of all the local computations:
-<>=
-mycache <- buildscorecache( data.df=mydat, - data.dists=mydists, dag.banned=ban, - dag.retained=retain, max.parents=max.par)
-@
-Run the actual exact search:
-<>=
-mp.dag <- mostprobable( score.cache=mycache)
-
-fabn <- fitabn( dag.m=mp.dag,data.df=mydat, - data.dists=mydists)
-
-datadir <- tempdir()
-@
-Save the results obtained:
-<>=
-save( mycache, mp.dag, fabn, file= - paste(datadir,"mp",max.par,".RData",sep=""))
-@
- -\subsection{Results from the initial search}
-Searches across parent limits $1$ through $5$ were run and we now examine the results. What we are looking for is simply the model with the best score (the largest, i.e. least negative, mlik value), checking that this does not improve when more parents are permitted. This then says we have found a DAG with maximal goodness of fit. What we find (below) is that the goodness of fit does not improve when we increase the parent limit beyond $3$.
- -\begin{figure}[htbp] -\centering -\includegraphics[width=8cm]{ComparisonOfNetworkScore.pdf} %\vspace{-1.0cm} -\caption{Comparison of goodness of fit for different parent limits} \label{fig1} -\end{figure}
- -The code necessary to analyze the results from the script file and to visualize the resulting DAG, in a Linux environment, is the following:
-<>=
-load( "RData/Rout1.RData")
-ml1 <- fabn$mlik;
-
-tographviz( dag.m=mp.dag,data.df=mydat,data.dists=mydists, - outfile="DAG_cycle.dot")
-system( "dot -Tpdf -o DAG_cycle.pdf DAG_cycle.dot")
-system( "evince DAG_cycle.pdf&")
-@
-The resulting marginal likelihoods from the initial search:
-<>=
-mp.mlik <- c( -44711.62,-44685.53,-44684.64,-44684.64,-44684.64)
-@
-The actual DAG corresponding to the maximum marginal likelihood, \rr{mp.mlik}$=-44684.64$, is in Fig.~\ref{fig2}.
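The whole sweep over parent limits can also be scripted directly in R rather than via \rr{initsearch.bash}. A minimal sketch, reusing the objects \rr{mydat}, \rr{ban}, \rr{retain} and \rr{mydists} defined above (and assuming the considerable run time is acceptable on your machine), is:
<>=
mliks <- rep(NA, 5)
for( max.par in 1:5){
  mycache <- buildscorecache( data.df=mydat, data.dists=mydists,
                dag.banned=ban, dag.retained=retain,
                max.parents=max.par)
  mp.dag <- mostprobable( score.cache=mycache)
  mliks[max.par] <- fitabn( dag.m=mp.dag, data.df=mydat,
                data.dists=mydists)$mlik}
mliks # should reproduce mp.mlik above, flattening out at max.par=3
@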
\begin{figure}[htbp]
\centering
\includegraphics[width=6.7cm]{DAG_cycle.pdf}
\vspace{-0.8cm}
\caption{Globally optimal DAG, found with a maximum number of 3 or more parents.} \label{fig2}
\end{figure}

\subsection{Adjustment for overfitting: parametric bootstrapping using MCMC}
We have identified a DAG which has the best (maximum) possible goodness of fit according to the log marginal likelihood. This is the standard goodness of fit metric in Bayesian modelling \citep{MACKAY1992}, and includes an implicit penalty for model complexity. While it is not always apparent from the technical literature, the log marginal likelihood can easily (and sometimes vastly) overfit with smaller data sets. Of course the difficulty is identifying what constitutes `small' here. In other words, using the log marginal likelihood alone (or indeed any of the other usual metrics such as AIC or BIC) is likely to identify structural features which, if the experiment/study were repeated many times, would only be recovered in a tiny fraction of instances. Such features could not be considered robust. Overfitting is an ever-present issue in model selection procedures, particularly in common approaches such as stepwise regression searches; see \cite{Babyak2004}.

A well established approach for addressing overfitting is to use parametric bootstrapping \citep{Friedman1999}. The basic idea is very simple. We take our chosen model, simulate data sets from it of the same size as the original observed data, and see how often the different structural features are recovered. For example, is it reasonable for a data set of 434 observations to support a complexity of 29 arcs? Parametric bootstrapping is arguably one of the most defensible solutions for addressing overfitting, although it is likely the most computationally demanding, as for each simulated (bootstrap) data set we need to repeat the same exact model search as used with the original data. And we may need to repeat this analysis hundreds of times (or more) to get robust results.

Performing parametric bootstrapping is easy enough to code up if done in small manageable chunks. Here we provide a step-by-step guide along with the necessary sample code.

\subsection{Software needed}
We have a DAG model, and MCMC software such as JAGS and WinBUGS is designed for simulating from exactly such models. So all we need to do is implement our DAG model, e.g. that in Fig.~\ref{fig2}, in the appropriate JAGS (or WinBUGS) syntax (which are very similar). Here I am going to use JAGS, developed by \cite{Plummer2003}, in preference to WinBUGS or OpenBUGS for no other reason than that it is what I am most familiar with, and JAGS is relatively straightforward to install (compile from source) on a cluster. Binary versions of JAGS are also available for most common platforms (Linux, Windows, OS X).
%
To implement our DAG in JAGS we need to write a model definition file (a BUG file) which contains the structure of the dependencies in the model. We also need to provide there the probability distributions for each and every parameter in the model. Note that in Bayesian modelling the parameter estimates will not generally conform to any standard probability distribution (e.g. Gaussian) unless we are in the very special case of having conjugate priors. The marginal parameter distributions required can be estimated using the \rr{fitabn} function and then fed into the model definition.
We next demonstrate one way of doing this, which is to use empirical distributions: in effect, we provide JAGS with a discrete distribution over a fine grid which approximates whatever shape of density we need to sample from.

\subsection{Generating marginal densities}
The function \rr{fitabn} has functionality to estimate the marginal posterior density for each parameter in the model. The parameters can be estimated one at a time by manually giving a grid (e.g. the x values at which we want to evaluate f(x)) or else all together. In the latter case a very simple algorithm will try to work out where to estimate the density. This works better at some times than others, although it seems to work fine here for most variables. In order to use these distributions with JAGS we must evaluate the density over an equally spaced grid, as otherwise the approach used in JAGS will not sample correctly. The basic command needed here is:
<>=
marg.f <- fitabn( dag.m=mydag,data.df=mydat,data.dists=mydists,
                  compute.fixed=TRUE,n.grid=1000)
@

We should not simply assume that the marginals have been estimated accurately; they should each be checked using some common sense. Generally speaking, estimating the goodness of fit (mlik) for a DAG comprising GLM nodes is very reliable. This marginalises out all parameters in the model. Estimating marginal posterior densities for individual parameters, however, can run into trouble, as this presupposes that the data contain sufficient information to accurately estimate the ``shape'' (density) of every individual parameter in the model. This is a stronger requirement than simply being able to estimate an overall goodness of fit metric. If a relatively large number of arcs has been chosen for a node with relatively few observations (e.g. few ``successes'' in a binary node) then this may not be possible, or at least the results are likely to be suspect. Exactly such issues - overfitting - are why we are performing the parametric bootstrapping in the first place, but they can also pose some difficulties before getting to this stage.

It is essential to first visually check the marginal densities estimated from \rr{fitabn}. Something like the following will create one pdf file where each page is a separate plot of a marginal posterior density.

<>=
library( Cairo) # Available from CRAN
CairoPDF( "MargPlots_PigsData.pdf")
for( i in 1:length(marg.f$marginals)){
  cat( "processing marginals for node:",
       nom1 <- names( marg.f$marginals)[i],"\n")
  cur.node <- marg.f$marginals[i]
  # Get the marginal for current node, this is a matrix [x,f(x)]
  cur.node <- cur.node[[1]]
  # This is always [[1]] for models without random effects
  for( j in 1:length(cur.node)){
    cat( "processing parameter:",nom2<- names(cur.node)[j],"\n")
    cur.param <- cur.node[[j]]
    plot( cur.param,type="l",main=paste(nom1,":",nom2))}}
dev.off()
@

These plots (available in \rr{system.file('bootstrapping\_example',package='abn')}) suggest that all the marginal estimates look fine. Moreover, we can now perform an additional common sense check on their reliability. A probability density must integrate to unity (the area under the curve is equal to one). The densities here are estimated numerically and so we would not expect to get exactly one (n.b. no internal standardisation is done, so we can check this), but if the numerical estimation has worked reliably then we would expect the area to be close to one (e.g. $0.99$, $1.01$).
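The area check used below is just a rectangle (Riemann) sum over the estimated grid; writing the grid points as $x_1 < x_2 < \cdots < x_m$ with density values $f(x_i)$,
\[
\int f(x)\,dx \;\approx\; \sum_{i=1}^{m-1} \left(x_{i+1}-x_i\right) f(x_{i+1}),
\]
which is exactly the width-times-height sum computed in the code that follows.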
\begin{figure}[htbp]
\centering
\includegraphics[width=9cm]{PigsArea.png}
\caption{Approximated area under marginal densities} \label{fig3}
\end{figure}

From Fig.~\ref{fig3} it is clear that everything went fine in the marginal density estimation. To check the area we used the following code, whose final output is a file called \rr{pigs\_post\_params.R} containing all the information JAGS needs to sample from the marginal posterior distributions.
<>=
marnew <- marg.f$marginals[[1]]

for( i in 2: length(marg.f$marginals)){
  marnew <- c(marnew, marg.f$marginals[[i]])}
@
A straightforward question is: how reliable are the marginals? If the numerical routines work well then the area under the density function should be close to unity.
<>=
myarea <- rep( NA,length( marnew))
names( myarea) <- names( marnew)
for( i in 1:length(marnew)){
  tmp <- spline(marnew[[i]])
  # Spline just helps make the estimation smoother
  myarea[i] <- sum(diff(tmp$x)*tmp$y[-1])}
  # Just width x height of rectangles
cbind( myarea)
@
Now visualise \rr{myarea} as a plot; the colours can be adjusted as needed:
<>=
library( Cairo)
mycols <- rep("green",length(marnew))
mycols[1:2] <- "red";       # PC
mycols[3] <- "orange";      # PT
mycols[4:5] <- "yellow";    # MS
mycols[6:7] <- "blue";      # HS
mycols[8:9] <- "lightblue"; # TAIL
mycols[10] <- "mistyrose";  # Abscess
mycols[11:12] <- "green";   # Pyaemia
mycols[13:14] <- "lavender";# EPcat
mycols[15:18] <- "cornsilk";# PDcat
mycols[19:22] <- "brown" ;  # plbinary
CairoPNG( "Area_PigsData.png",pointsize=10,width=720,height=640)
par( las=2, mar=c(8.1,4.1,4.1,2.1))
barplot( myarea,ylab="Area under Density",ylim=c(0,2), col=mycols)
dev.off()
@
Now we create the data in the right format ready for passing to JAGS:
<>=
print( names(marnew))
# We want to bind the marginals belonging to the same node into a matrix
m <- marnew;
# PT -> PC, 1 parent, 2 params;
PC.p <- cbind( m[["PC|(Intercept)"]], m[["PC|PT"]])
# PT --> NO PARENTS, 1 param;
PT.p <- cbind( m[["PT|(Intercept)"]])
# HS -> MS, 1 parent, 2 params;
MS.p <- cbind( m[["MS|(Intercept)"]], m[["MS|HS"]])
# EPcat -> HS, 1 parent, 2 params;
HS.p <- cbind( m[["HS|(Intercept)"]], m[["HS|EPcat"]])
# PDcat -> TAIL, 1 parent, 2 params;
TAIL.p <- cbind( m[["TAIL|(Intercept)"]], m[["TAIL|PDcat"]])
# Abscess --> NO PARENTS, 1 param;
Abscess.p <- cbind( m[["Abscess|(Intercept)"]])
# TAIL -> Pyaemia, 1 parent, 2 params;
Pyaemia.p <- cbind(m[["Pyaemia|(Intercept)"]],m[["Pyaemia|TAIL"]])
# plbinary -> EPcat, 1 parent, 2 params;
EPcat.p <- cbind( m[["EPcat|(Intercept)"]], m[["EPcat|plbinary"]])
# MS, EPcat, plbinary --> PDcat, 3 parents, 4 params;
PDcat.p <- cbind( m[["PDcat|(Intercept)"]], m[["PDcat|MS"]],
                  m[["PDcat|EPcat"]], m[["PDcat|plbinary"]])
# PC, PT, Abscess --> plbinary, 3 parents, 4 params;
plbinary.p <- cbind(m[["plbinary|(Intercept)"]],m[["plbinary|PC"]],
                    m[["plbinary|PT"]], m[["plbinary|Abscess"]])

dump( c( "PC.p","PT.p","MS.p","HS.p","TAIL.p","Abscess.p",
         "Pyaemia.p","EPcat.p","PDcat.p","plbinary.p"),
      file=paste('Jags/',"pigs_post_params.R",sep=""))
@

\subsection{Building the BUG model}
Once we have the posterior distributions, the next step is to actually create the DAG in JAGS. This involves creating a BUG file: a file which contains a definition of our DAG (from Fig.~\ref{fig2}) in terms which can be understood by JAGS. This can easily be done by hand, if rather tedious, and should be checked carefully for errors (which JAGS will prompt about in any case).
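Conceptually, sampling a parameter from one of these gridded marginals just means drawing grid points with probability proportional to the (equally spaced) density values. The following R sketch illustrates the idea using the \rr{PC.p} matrix built above; it is an illustration only - the shipped \rr{pigs\_model.bug} is the definitive implementation:
<>=
# Draw 25000 values of the PC intercept from its empirical marginal.
# Columns 1 and 2 of PC.p hold the grid x and density f(x) for the
# intercept; sample() normalises the prob weights internally.
idx <- sample( seq_len(nrow(PC.p)), size=25000,
               replace=TRUE, prob=PC.p[,2])
PC.intercept.sim <- PC.p[idx,1]
@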
The BUG file is shipped with the package, in \rr{system.file('bootstrapping\_example',package='abn')}, as the file \rr{pigs\_model.bug}. This file is fairly heavily commented (it might look complicated but most of it is just copy and paste with minor edits) - the syntax is similar to R - and should be fairly self explanatory. Note that unlike a more usual use of WinBUGS or JAGS we have no data here; we are simply providing JAGS with a set of marginal probability distributions and how they are inter-dependent, and we want it to then generate realisations from the appropriate joint probability distribution. The next step is to tell JAGS to perform this simulation, i.e. generate a single bootstrap data set of size $n=25000$ observations based on the assumption that the DAG in Fig.~\ref{fig2} is our true model.

\subsection{A single bootstrap analysis}
To run JAGS we use four separate files: i) the BUG model definition file (\rr{pigs\_model.bug}); ii) a file \rr{pigs\_post\_params.R} which contains the marginal distributions referred to in the BUG file; iii) a script which runs the MCMC simulation (\rr{jags\_pigs\_script.R}); and finally iv) a file which sets the random number seed (\rr{inits.R} - the value in this must be changed to use different streams of random numbers). These four files are all contained in the \rr{bootstrapping\_example} subdirectory of the package. To run this example copy all the files into one directory and then at the command line type \rr{`jags jags\_pigs\_script.R'}. In terms of the MCMC component, a burn-in of $100000$ iterations is used, but as there are no data here this takes no time to run, is likely of little use, and could be shortened (it is just included to allow JAGS any internal adaptations or diagnostics that it might need to do). The actual MCMC is then run for $250000$ iterations with a thin of $10$, which gives $25000$ observations for each variable - the same size as the original data. The number of MCMC steps and the thin are a little arbitrary and this could be run for longer with a bigger thin, but for this data, looking at autocorrelation plots of the sampled variables, there appears to be no evidence of correlation at a thin of $10$, and so this seems sufficient here.

The next step is to automate a single run, e.g. generate a single bootstrap sample and then perform an exact search on this, just as was done for the original data. A short R script in the same package subdirectory performs a single such analysis - just drop it into the same directory as the four JAGS files above. For ease we also call the JAGS script from inside R, which might require some messing about with the PATH variable on Windows (e.g. if you open up a cmd prompt then you should be able to type ``jags'' without needing to give a full path). The output from this is given below, and the result is a single matrix (DAG) which is saved in an R workspace called \rr{boot1run.RData}.

Get the pigs data and drop the batch variable:
<>=
orig.data <- pigs.vienna[,-11]
@

Now create a single bootstrap sample:
<>=
system("jags jags_pigs_script.R")
@
Read in the bootstrap data and convert it to a data.frame in the same format as the original data, e.g.
coercing to factors, using the \proglang{R} package \pkg{coda}, described in \cite{Plummer2006}:
<>=
library( coda)
boot.data <- read.coda("out1chain1.txt","out1index.txt")

boot.data <- as.data.frame(boot.data)
for( j in 1:dim(orig.data)[2]){if(is.factor(orig.data[,j]))
   { boot.data[,j]<- as.factor(boot.data[,j])
     levels(boot.data[,j])<- levels(orig.data[,j])}}
@
Now we have \rr{boot.data} in an identical format to the original:
<>=
ban <- matrix( rep(0,dim(orig.data)[2]^2),
               ncol=dim(orig.data)[2])
colnames( ban) <- rownames(ban) <- names(orig.data)

retain <- matrix( rep(0,dim(orig.data)[2]^2),
                  ncol=dim(orig.data)[2])
colnames( retain) <- rownames(retain) <- names(orig.data)

mydists <- list( PC="binomial", PT="binomial", MS="binomial",
                 HS="binomial", TAIL="binomial",
                 Abscess="binomial", Pyaemia="binomial",
                 EPcat="binomial", PDcat="binomial",
                 plbinary="binomial")
@
Set the parent limit to $3$, as used with the original data:
<>=
max.par <- 3;
@
Build a cache on the bootstrap data:
<>=
boot1.cache <- buildscorecache( data.df=boot.data,
                  data.dists=mydists, max.parents=max.par,
                  dag.banned=ban, dag.retained=retain)
@
Run the \rr{mostprobable} search on the bootstrap data:
<>=
boot1.mp <- mostprobable( score.cache=boot1.cache)

datadir <- tempdir()
save( boot1.mp,file=file.path( datadir,"boot1run.RData"))
@

Once this works on your local machine it is then a case of automating it in the most efficient way, for example for use on a cluster. The crucial thing here is that the random number seed used each time (in the file \rr{inits.R}) must be changed for each bootstrap simulation, otherwise an identical bootstrap data set will be produced!

\subsection{Ways to summarise results from parametric bootstrapping}
The globally optimal DAG, Fig.~\ref{fig2}, has $12$ arcs. It may be that some of these are due to over-modelling, which means they will be recovered in relatively few bootstrap analyses. $10240$ bootstrap analyses were conducted (on a cluster) and all the R files, the MPI wrapper, and also the actual results (in R workspaces) can be found in the folder \rr{system.file('bootstrapping\_example', package='abn')}.

The first step is to explore the bootstrap results. Of some interest is how many of the arcs were recovered during each of the bootstrap ``simulations''. This is given in Fig.~\ref{fig4}. We can see right away that no single bootstrap analysis recovered all the arcs in the original DAG (Fig.~\ref{fig2}, with $12$ arcs) - not even once. This provides overwhelming evidence that the original exact search has indeed overfitted to the original data; this is perhaps unsurprising, as even an exact search can overfit when selecting from such a vast model space. We must therefore trim some of the complexity - arcs - from the original DAG in order to justify that our chosen DAG is a robust estimate of the underlying system which generated the observed data.

A parametric bootstrapping approach was suggested in \cite{Friedman1999} which uses simulation to assess whether a chosen model comprises more complexity than could reasonably be justified given the size of the observed data set. Using Markov chain Monte Carlo simulation via JAGS (open source software), $10240$ independent data sets (independence being assumed after inspecting autocorrelations from the MCMC output) of the same size as the original data were generated from our chosen model. For each of these bootstrap data sets an identical exact order-based search to that used for the original data was conducted.
\begin{figure}[htbp]
\centering
\includegraphics[width=9cm]{Bootstrapping.png}
\caption{Number of arcs recovered in bootstrapping} \label{fig4}
\end{figure}

There are a number of different options in terms of trimming/pruning arcs. One common option, which apes the use of majority consensus trees in phylogenetics (trees are just special cases of DAGs), is to remove all arcs which were not recovered in at least a majority ($50\%$) of the bootstrap results. Fig.~\ref{fig5} shows the frequency at which each arc was recovered, the maximum value possible being $10240$ ($100\%$ support). Collating results across these $10240$ searches we find that only $14\%$ of the globally optimal DAGs found comprised $12$ or more arcs. Approximately $68\%$ of DAGs had $11$ or more arcs - therefore a robust model of the original data has no more than $11$ arcs. Almost identical results were obtained using a random selection of $5012$ searches, suggesting that sufficient bootstrap samples had been performed. The usual cut-off for structural support of features (arcs) is $50\%$ in BN modelling \citep{Poon2007, Poon2007a, Poon2008, Lycett2009, Lewis2011}, and is analogous to the widespread use of majority consensus trees in phylogenetics. We therefore conclude that our chosen model, trimmed to $11$ arcs, is robust. This is perhaps not surprising given we have a large data set of $25000$ observations.

\begin{figure}[htbp]
\centering
\includegraphics[width=15cm]{Summary.png}
\caption{Frequencies at which each arc in the original DAG was recovered during bootstrapping}
\label{fig5}
\end{figure}

Note that this $50\%$ is not in any way comparable with the usual $95\%$ points used in parameter estimation, as these are entirely different concepts (an explanation can be found on the \pkg{abn} website).

Another option, other than choosing a different level of support (which is entirely up to the researcher), is to consider an undirected network. That is, include all arcs whose support - considering both directions - exceeds $50\%$. This is justifiable due to likelihood equivalence, which means that - generally speaking - the data cannot discriminate between different arc directions (see the \pkg{abn} website for an explanation), and therefore considering arcs recovered in only one direction may be overly conservative. Again, this decision is likely problem specific. For example, from a purely statistical perspective being conservative is generally a good thing, but from the scientist's point of view this may then remove most of the more interesting results from the study. Obviously a balance is required.

In this data it turns out that removing all arcs which have less than $50\%$ support gives an identical pruned network to the one obtained by considering both arc directions jointly. In general this need not be the case. Fig.~\ref{fig6} shows our optimal DAG after removing these arcs. This is our optimal model of the data.
All the code for analysing the results from the bootstrap analyses is given below:
<>=
mydata <- pigs.vienna[,-11]

N <- 10240;
# Write out manually, clearer than using rep()
mydag <- matrix(c(
# PC PT MS HS TAIL Abscess Pyaemia EPcat PDcat plbinary
  0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
  0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
  1, 1, 0, 0, 0, 1, 0, 0, 0, 0),
  byrow=TRUE,ncol=10)
colnames(mydag) <- rownames(mydag) <- names(mydata)
sum( mydag) # 12 arcs, as in original model, Figure 2

mydists <- list( PC="binomial", PT="binomial", MS="binomial",
                 HS="binomial", TAIL="binomial",
                 Abscess="binomial", Pyaemia="binomial",
                 EPcat="binomial", PDcat="binomial",
                 plbinary="binomial")

# Use fitabn to check mydag is correct (no typos; mlik = -44684.64)
print( fitabn(dag.m=mydag,data.df=mydata,data.dists=mydists)$mlik)
bestdag <- mydag
@

\begin{figure}[htbp]
\centering
\includegraphics[width=8cm]{postbootpigs.pdf}\vspace*{-5mm}
\caption{ Optimal DAG after removal of arcs supported at less than $50\%$ in bootstrapping. Contains $8$ arcs.}\label{fig6}
\end{figure}

The next command reads all files whose names match \rr{mp10Kboot[number].RData} and creates a list of results:
<>=
boot.dags <- list()
these <- grep("mp10Kboot\\d+.RData", dir())
num <- 1
for( i in dir()[these]){# Load each file
  load(i) # Provides dags, a list
  tmp <- dags[which(unlist(lapply(dags,sum))>0)]
  # Get valid entries in dags but as a list
  for( j in 1:length(tmp)){
    # For each entry copy into boot.dags, and increment counter
    boot.dags[[num]]<- tmp[[j]]; num <- num+1 }
  rm( dags)
}
@
It is always good practice to have a look at the mlik values for the bootstrap results vis-a-vis the original:
<>=
if( FALSE){
  scores <- rep(0,length(boot.dags))
  for(i in 1:length(boot.dags)){
    scores[i] <- fitabn(dag.m=boot.dags[[i]],data.df=mydata,
                        data.dists=mydists)$mlik
  }
  scores.b <- scores[is.finite(scores)] # Drop failed fits (score -Inf)
  orig.score <- fitabn(dag.m=bestdag,data.df=mydata,
                       data.dists=mydists)$mlik
  plot(density(scores.b,from=min(scores.b),to=max(scores.b)))
  abline(v=orig.score,lwd=2,col="blue")
}
@
We now trim from the bootstrap results all arcs which do not occur in the master DAG, \rr{bestdag}, since we know these are due to overfitting:
<>=
boot.dags.trim <- boot.dags
for( i in 1:length(boot.dags)){
  boot.dags.trim[[i]] <- boot.dags.trim[[i]]*bestdag }

arc.freq <- lapply(boot.dags.trim,sum)
arc.freq <- table(unlist(arc.freq))
library( Cairo)
CairoPNG("PigsFreqBootRes.png",pointsize=10,width=720,height=700)
par(las=1, mar=c(6.1,6.1,4.1,2.1))
barplot( arc.freq,ylab="",xlab="",col="skyblue",
         names.arg=names(arc.freq), ylim=c(0,2500))
par( las=1)
mtext( "Number of arcs in bootstrap DAG",1,line=3,cex=1.5)
par( las=3)
mtext( "Frequency out of 10240",2,line=4,cex=1.5)
dev.off()

total.dag <- matrix(rep(0,dim(bestdag)[2]^2),ncol=dim(bestdag)[2])
colnames(total.dag) <- rownames(total.dag)<- colnames(bestdag)
# Get the support for each arc, total.dag:
for( i in 1:length(boot.dags)){
  if(sum(boot.dags[[i]])>0){total.dag <- total.dag+boot.dags[[i]]}}
total.dag <- total.dag*bestdag # We only want arcs in the best DAG
total.dag
@
This gives the frequencies at which each arc in the original DAG was recovered during bootstrapping.
We now construct the majority consensus (directed) DAG, keeping an arc when its total support - summed over both directions - exceeds $50\%$, and orienting it in the better supported direction:
<>=
bestdag.trim <- bestdag
bestdag.trim[,] <- 0 # Set to zero
for( i in 1:dim(total.dag)[1]){
  for( j in 1:dim(total.dag)[2]){
    if(i>j){ # Get the most supported direction
      if(total.dag[i,j]>total.dag[j,i]){
          m.i <- i; m.j <- j;}
      else {m.i <- j; m.j <- i;}
      # Does the arc's total support exceed the threshold,
      # i.e. more than N/2 = 5120 of the 10240 searches?
      if(total.dag[i,j]+total.dag[j,i]>N/2){
          bestdag.trim[m.i,m.j] <- 1}}}}

tographviz( dag.m=bestdag.trim,data.df=mydata,
            data.dists=mydists, outfile="postbootpigs.dot")
system( "dot -Tpdf -o postbootpigs.pdf postbootpigs.dot")
system( "evince postbootpigs.pdf&")

save( bestdag.trim,file="bestdagpigs_trim.RData")
@

\subsection{Estimating marginals from the final DAG}

\begin{figure}[!t]
\centering
\includegraphics[width=8cm]{plbinaryNode.png}
\caption{ Marginal posterior densities for the node plbinary}\label{fig8}
\end{figure}

Once we have identified our optimal DAG it is usual to want to examine the parameters in this model, e.g. as in Fig.~\ref{fig8}. These are our results: the effects of the various variables in our study. This process is very similar to estimating the marginals for the bootstrapping, but should now be easier since we should have removed any difficulties due to over-fitting. The posterior density plots for the final DAG can be found in the pdf called \rr{Pigs\_PostBootPlots.pdf}. These all look fine. An overview of the effect parameters can be obtained from the node plbinary.


The R code for creating the marginals and quantiles can be found below:
<>=
mydata <- pigs.vienna[,-11]

mydists <- list( PC="binomial", PT="binomial", MS="binomial",
                 HS="binomial", TAIL="binomial",
                 Abscess="binomial", Pyaemia="binomial",
                 EPcat="binomial", PDcat="binomial",
                 plbinary="binomial")

marg.f <- fitabn(dag.m=bestdag.trim,data.df=mydata,
                 data.dists=mydists,compute.fixed=TRUE,
                 n.grid=1000)

library( Cairo)
CairoPDF("Pigs_PostBootPlots.pdf")
for( i in 1:length(marg.f$marginals)){
  cat( "processing marginals for node:",
       nom1 <- names(marg.f$marginals)[i],"\n")
  cur.node <- marg.f$marginals[i]
  # Get marginal for current node, this is a matrix [x,f(x)]
  cur.node <- cur.node[[1]]
  # This is always [[1]] for models without random effects
  for( j in 1:length(cur.node)){
    cat("processing parameter:",
        nom2 <- names(cur.node)[j],"\n")
    cur.param <- cur.node[[j]]
    plot( cur.param,type="l",main=paste(nom1,":",nom2))}}
dev.off()

marnew <- marg.f$marginals[[1]]
for(i in 2: length(marg.f$marginals)){
  marnew <- c(marnew, marg.f$marginals[[i]])}

margs <- marnew;
mymat <- matrix(rep(NA,length(margs)*3),ncol=3)
rownames(mymat) <- names(margs)
colnames(mymat) <- c("2.5%","50%","97.5%")
ignore.me <- union(grep("\\(Int",names(margs)),
                   grep("prec",names(margs)))
@
These are not effect parameters, but background constants and precisions:
<>=
comment <- rep("",length(margs))
for(i in 1:length(margs)){
  tmp <- margs[[i]]
  tmp2 <- cumsum(tmp[,2])/sum(tmp[,2])
  mymat[i,] <- c(tmp[which(tmp2>0.025)[1]-1,1],
                 tmp[which(tmp2>0.5)[1],1],
                 tmp[which(tmp2>0.975)[1],1])
  myvec <- mymat[i,]
  if( !(i%in%ignore.me) && (myvec[1]<0 && myvec[3]>0)){
    comment[i] <- "not sig. at 5%"}
  # Truncate for printing
  mymat[i,] <- as.numeric(formatC(mymat[i,],digits=3,format="f"))}
cbind( mymat)
@

The code above produces the table of percentiles that can be found in Table~\ref{fig7}.
\begin{table}
\begin{center}
\begin{tabular}{|l|c|c|c|}
  \hline
  & \multicolumn{3}{c|}{\textbf{Odds Ratio}} \\
  \hline
  & \textbf{2.5\%} & \textbf{50\%} & \textbf{97.5\%} \\
  \hline
PC$|$(Intercept) & 0.03 & 0.03 & 0.03 \\
  \hline
\textcolor{red}{PC$|$PT} & \textcolor{red}{17.71} & \textcolor{red}{26.44} & \textcolor{red}{39.21} \\
  \hline
 PT$|$(Intercept) & 0.00 & 0.00 & 0.00 \\
  \hline
 MS$|$(Intercept) & 0.06 & 0.06 & 0.07 \\
  \hline
 \textcolor{blue}{MS$|$HS} & \textcolor{blue}{0.20} & \textcolor{blue}{0.30} & \textcolor{blue}{0.44} \\
  \hline
 HS$|$(Intercept) & 0.06 & 0.06 & 0.06 \\
  \hline
 TAIL$|$(Intercept) & 0.00 & 0.00 & 0.00 \\
  \hline
 \textcolor{red}{TAIL$|$PDcat} & \textcolor{red}{2.61} & \textcolor{red}{4.32} & \textcolor{red}{6.78} \\
  \hline
 Abscess$|$(Intercept) & 0.00 & 0.01 & 0.01 \\
  \hline
 Pyaemia$|$(Intercept) & 0.00 & 0.00 & 0.00 \\
  \hline
 EPcat$|$(Intercept) & 0.36 & 0.37 & 0.38 \\
  \hline
 \textcolor{red}{EPcat$|$plbinary} & \textcolor{red}{1.97} & \textcolor{red}{2.14} & \textcolor{red}{2.31} \\
  \hline
 PDcat$|$(Intercept) & 0.04 & 0.04 & 0.04 \\
  \hline
 \textcolor{red}{PDcat$|$plbinary} & \textcolor{red}{1.78} & \textcolor{red}{2.08} & \textcolor{red}{2.41} \\
  \hline
 plbinary$|$(Intercept) & 0.10 & 0.11 & 0.11 \\
  \hline
 \textcolor{red}{plbinary$|$PC} & \textcolor{red}{12.92} & \textcolor{red}{15.12} & \textcolor{red}{17.71} \\
  \hline
 \textcolor{red}{plbinary$|$PT} & \textcolor{red}{2.72} & \textcolor{red}{4.36} & \textcolor{red}{6.97} \\
  \hline
 \textcolor{red}{plbinary$|$Abscess} & \textcolor{red}{6.23} & \textcolor{red}{8.71} & \textcolor{red}{12.17} \\
  \hline
\end{tabular}
\caption{Marginal posterior quantiles, expressed as odds ratios, for each parameter. Rows in red indicate point estimates greater than $1$, corresponding to risk factors, while the row in blue indicates an estimate smaller than $1$, a protective factor.} \label{fig7}
\end{center}
\end{table}

All of the effect parameters - that is, ignoring the intercept terms (which are just background constants) and the precision parameters - have $95\%$ confidence (or credible) intervals which do not cross the origin. Note that this is not guaranteed to happen, since this criterion was not part of the model selection process; all the parameters in the model have instead been justified using mlik and bootstrapping. But having such marginal intervals can make presenting the results to a skeptical audience easier.

\subsection{Pigs Case Study Conclusion}

Based on the results presented in the previous section, it is possible to draw conclusions from the study. From Table~\ref{fig7} we can see that, apart from HS, which has a protective effect on MS, all other effect parameters act as risk factors for the possible disease outcomes. Our final optimal ABN model of this disease system suggests that the presence of Pyaemia is independent of the other conditions. Moreover, the remaining diseases split into two separate connected components.

Some interesting biological questions resulting from the ABN model relate to the similarity of causal pathways between these two groups of diseases and to the common causes shared within each group.
The principal reason for using ABN models is their generalisation of the usual GLM to multiple dependent variables: fully multi-dimensional models. Specifically, results from ABN analyses can be used as a basis for developing new biological questions about factors potentially affecting the presence of disease, and to inform the design of future targeted studies.

\section{An additional example}
In the next subsections we illustrate how to fit an ABN model to different kinds of data, starting with an introduction of the data we are going to deal with. The main purpose of BN structure discovery is to estimate the joint dependency structure of the random variables in the available data, and this is achieved by heuristically searching for optimal models and comparing their goodness of fit using Bayes factors. It is assumed that all structures are equally supported in the absence of any data - an uninformative prior on structures - and so comparing Bayes factors collapses to comparing the marginal likelihoods, which is done on a log scale. The log marginal likelihood for a BN is typically referred to as the network score.

\subsection{Simulated Data Example var33}
Figure~\ref{fig9} shows the structure of the distribution which generated the data set \rr{var33} included with \pkg{abn}. This diagram was created using the \rr{tographviz()} function of \pkg{abn} (see later examples), which translates the matrix defining a network - a directed acyclic graph - into a text file of suitable format for processing in Graphviz, where this processing was done outside of \proglang{R}. Graphviz is freely available, operates on most platforms, and can be downloaded from \href{http://www.graphviz.org}{www.graphviz.org}; there is also an R package which interfaces to Graphviz available from the Bioconductor project (it requires an installation of Graphviz).
\begin{figure}[htb]
\includegraphics{mydag_all}
\vspace{-1.0cm}
\caption{Directed acyclic graph representation of the joint probability distribution which generated data set \rr{var33} which is included with \pkg{abn}. The square nodes are categorical (binary) and the oval nodes continuous variables.} \label{fig9}
\end{figure}

\subsection{Fitting an additive BN model to categorical data} \label{sec2}
An additive BN model for categorical data can be constructed by considering each individual variable as a logistic regression on the other variables in the data, and hence the network model comprises many combinations of local logistic regressions \citep{Rijmen2008}. The parameters in this model are the additive terms in a usual logistic regression, and independent Gaussian priors are assumed for each covariate. Note that the variables here must all be binary, and so all multinomial variables need to be split into separate binary factors (and added to the original data.frame) in order to form the network model. This is analogous to forming the design matrix in a conventional additive model analysis. Similarly, interaction terms can be added by including appropriate additional columns in the data.frame. In these models the log marginal likelihood (network score) is estimated using Laplace approximations at each node. Hyperparameters for the means and variances in the Gaussian priors are fixed at zero and 1000 respectively; other values can be given explicitly in the call to \rr{fitabn} but this is not recommended without good reason.

To fit an additive model use \rr{fitabn(data.df,dag.m,data.dists, ...)}.
In the following code we first fit the independence model, with no arcs, and then a model with some conditional dependencies. Turning on \rr{verbose=TRUE} simply gives the individual log marginal likelihoods for each node (n.b. the numbering is that used internally and simply denotes the variables in the data.frame from left to right).

The following code fits a network to the subset of the variables from \rr{var33} which are categorical. In this data these are all binary. Note that all categorical variables should be set as factors.
<>=
library( abn)
bin.nodes <- c( 1,3,4,6,9,10,11,12,15,18,19,20,21,26,27,28,32)
var33.cat <- var33[,bin.nodes] #Categorical nodes only

dag33 <- matrix( 0, 17, 17)
colnames( dag33) <- rownames( dag33) <- names( var33.cat)#Set names
@
To move back to the independence model after fitting the dependence model below, reset the relevant arcs to zero:
<>=
dag33["v11","v12"] <- 0; dag33["v11","v10"]<- 0; dag33["v4","v3"]<- 0;
@
Set up the distribution list for each categorical node:
<>=
mydists.cat <- list( v1 ="binomial", v3 = "binomial",
     v4 = "binomial", v6 = "binomial", v9 = "binomial",
     v10 = "binomial", v11 = "binomial", v12 = "binomial",
     v15 = "binomial", v18 = "binomial", v19 = "binomial",
     v20 = "binomial", v21 = "binomial", v26 = "binomial",
     v27 = "binomial", v28 = "binomial", v32 = "binomial")
ind.mod.cat <- fitabn( data.df=var33.cat, dag.m=dag33,
                       data.dists=mydists.cat, verbose=FALSE)
@
It is possible to set \rr{verbose=TRUE} if one wants to check how the score changes for each individual node.

The network score for the independence model is:
<>=
ind.mod.cat$mlik
@
In the network definition matrix each row is a ``child'' and each column a potential ``parent''; a \rr{1} denotes that a parent (or arc) is present. Now let us fit a model with some conditional dependencies, for example where \rr{v11} is conditionally dependent upon \rr{v12} and \rr{v10}, and \rr{v4} is conditionally dependent upon \rr{v3}.

Now fit the model with some conditional dependencies:
<>=
dag33["v11","v12"] <- 1;
dag33["v11","v10"] <- 1;
dag33["v4","v3"] <- 1;
dep.mod.cat <- fitabn( data.df=var33.cat, dag.m=dag33,
                       data.dists=mydists.cat, verbose=FALSE)
@
The network score for the model with conditional dependencies is:
<>=
dep.mod.cat$mlik
@
The network score is considerably improved and therefore suggests support for these new structural features. To produce a visual description of the model we can export it to Graphviz as follows:
<>=
tographviz( dag=dag33, data.df=var33.cat, data.dists=mydists.cat,
            outfile="mydagcat.dot", directed=TRUE) #Create file
@
\rr{mydagcat.dot} can then be processed with Graphviz by typing in a Unix shell: ``dot -Tpdf mydagcat.dot -o mydagcat.pdf'', or by using the Graphviz GUI on Windows.

\begin{figure}[htb]
\includegraphics{mydag}
\vspace{-1.0cm}
\caption{Directed acyclic graph \rr{dag33} created using \rr{tographviz()} and Graphviz} \label{fig10}
\end{figure}
In \rr{tographviz()} the \rr{data.df} argument is used to determine whether a variable is a factor or not, where factors are displayed as squares and non-factors as ovals. To use the full range of visual Graphviz options simply use the file created by \rr{tographviz()} as a starting point and manually edit it in a text editor before running it through \rr{dot} or one of the other Graphviz layout processors.
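Since, under the uniform structural prior, comparing two structures by Bayes factor reduces to a difference of log marginal likelihoods, the support for the added arcs can be quantified directly (a quick check using the two fits above):
<>=
# Log Bayes factor in favour of the dependence model:
dep.mod.cat$mlik - ind.mod.cat$mlik
@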
\subsection{Fitting an additive BN model to continuous data} \label{sec3}
We now consider models analogous to those in Section~\ref{sec2} but where the network comprises Gaussian linear regressions rather than logistic regressions. The structure of these models again assumes independent Gaussian priors for each of the coefficients in the additive components for the mean response at each node (with hyper means = 0 and hyper variances = 1000). The Gaussian response distribution is parameterised in terms of the precision $(1/\sigma^2)$, and independent Gamma priors are used with shape=0.001 and scale=1/0.001 (where these are as defined in the \rr{rgamma} help page). By default, each variable in the data.frame is standardised to a mean of zero and a standard deviation of one; this has no effect on the identification of dependencies between variables.
We start by dropping the categorical nodes:
<>=
var33.cts <- var33[,-bin.nodes]
dag33 <- matrix( 0, 16, 16)
colnames( dag33) <- rownames( dag33) <- names( var33.cts)
@
Set up the distribution list for each continuous node:
<>=
mydists.cts <- list( v2 = "gaussian", v5 = "gaussian",
     v7 = "gaussian", v8 = "gaussian", v13 = "gaussian",
     v14 = "gaussian", v16 = "gaussian", v17 = "gaussian",
     v22 = "gaussian", v23 = "gaussian", v24 = "gaussian",
     v25 = "gaussian", v29 = "gaussian", v30 = "gaussian",
     v31 = "gaussian", v33 = "gaussian")
@
Now fit the model defined in dag33 - full independence:
<>=
ind.mod.cts <- fitabn( data.df=var33.cts, dag.m=dag33,
                       data.dists=mydists.cts, verbose=FALSE)
@
As default priors we use a Gaussian density with mean $0$ and variance $1000$ for each coefficient, while for the precision (the inverse of the variance) we use a Gamma density with hyperparameters $0.001$ and $1/0.001$. This is the network score (goodness of fit, log marginal likelihood):
<>=
ind.mod.cts$mlik
@
Now fit a model with conditional dependencies, for example let \rr{v33} depend on \rr{v31}, \rr{v24} depend on \rr{v23}, and \rr{v14} depend on \rr{v13}.
<>=
dag33["v33","v31"] <- 1;
dag33["v24","v23"] <- 1;
dag33["v14","v13"] <- 1;
dep.mod.cts <- fitabn( data.df=var33.cts, dag.m=dag33,
                       data.dists=mydists.cts, verbose=FALSE)
@
The network score for the model with conditional dependencies is:
<>=
dep.mod.cts$mlik

tographviz( dag=dag33, data.df=var33.cts, data.dists=mydists.cts,
            outfile="mydagcts.dot", directed=TRUE) #Create file
@
\rr{mydagcts.dot} can then be processed with Graphviz by typing in a Unix shell: ``dot -Tpdf mydagcts.dot -o mydagcts.pdf'', or by using the Graphviz GUI on Windows.

\begin{figure}[htb]
\includegraphics{mydagcts}
\vspace{-1.0cm}
\caption{Directed acyclic graph \rr{dag33} for continuous variables only created using \rr{tographviz()} and Graphviz} \label{fig11}
\end{figure}


\subsection{Fitting an additive BN model to mixed data} \label{sec4}
To conclude the fitting of a single pre-specified model to data, e.g. one based on expert opinion, we consider an additive BN model which comprises both binary and Gaussian nodes, i.e. a combination of binomial (logistic) and Gaussian linear models. Again \rr{fitabn()} is used and the code is almost identical to the previous examples.
<>=
dag33 <- matrix( 0, 33, 33)
colnames( dag33) <- rownames( dag33)<- names( var33)
@
Set up the distribution list for each node in the mixed data:
<>=
mydists.mix <- list( v1 = "binomial", v2 = "gaussian",
     v3 = "binomial", v4 = "binomial", v5 = "gaussian",
     v6 = "binomial", v7 = "gaussian", v8 = "gaussian",
     v9 = "binomial", v10 = "binomial", v11 = "binomial",
     v12 = "binomial", v13 = "gaussian", v14 = "gaussian",
     v15 = "binomial", v16 = "gaussian", v17 = "gaussian",
     v18 = "binomial", v19 = "binomial", v20 = "binomial",
     v21 = "binomial", v22 = "gaussian", v23 = "gaussian",
     v24 = "gaussian", v25 = "gaussian", v26 = "binomial",
     v27 = "binomial", v28 = "binomial", v29 = "gaussian",
     v30 = "gaussian", v31 = "gaussian", v32 = "binomial",
     v33 = "gaussian")
@
Now fit the model defined in dag33, full independence:
<>=
ind.mod <- fitabn( data.df=var33, dag.m=dag33,
                   data.dists=mydists.mix, verbose=FALSE)
@
The network score with no conditional dependencies is:
<>=
ind.mod$mlik
@
We now fit a BN model which has the same structure as the joint distribution used to generate the data, and later create a visual graph of this model. We therefore define a model with many conditional dependencies:
<>=
dag33[2,1] <- 1;
dag33[4,3] <- 1;
dag33[6,4] <- 1; dag33[6,7] <- 1;
dag33[5,6] <- 1;
dag33[7,8] <- 1;
dag33[8,9] <- 1;
dag33[9,10] <- 1;
dag33[11,10] <- 1; dag33[11,12] <- 1; dag33[11,19] <- 1;
dag33[14,13] <- 1;
dag33[17,16] <- 1; dag33[17,20] <- 1;
dag33[15,14] <- 1; dag33[15,21] <- 1;
dag33[18,20] <- 1;
dag33[19,20] <- 1;
dag33[21,20] <- 1;
dag33[22,21] <- 1;
dag33[23,21] <- 1;
dag33[24,23] <- 1;
dag33[25,23] <- 1; dag33[25,26] <- 1;
dag33[26,20] <- 1;
dag33[33,31] <- 1;
dag33[32,21] <- 1; dag33[32,31] <- 1; dag33[32,29] <- 1;
dag33[30,29] <- 1;
dag33[28,27] <- 1; dag33[28,29] <- 1; dag33[28,31] <- 1;
dep.mod <- fitabn( data.df=var33, dag.m=dag33,
                   data.dists=mydists.mix, verbose=FALSE)
@

The network score for the model with these conditional dependencies is:
<>=
dep.mod$mlik
tographviz( dag=dag33, data.df=var33, data.dists=mydists.mix,
            outfile="mydag_all.dot", directed=TRUE)#Create file
@

\begin{figure}[htb]
\includegraphics{mydag_all}
\vspace{-1.0cm}
\caption{Directed acyclic graph \rr{dag33} for mixed continuous and discrete variables} \label{fig12}
\end{figure}

\subsection{Model fitting validation}
In order to validate the additive models for mixed binary and Gaussian data, estimates of the posterior distributions for the model parameters obtained using Laplace approximations were compared with those estimated using Markov chain Monte Carlo. These were always in very close agreement for the range of models and data examined. This is an indirect validation of the Laplace estimate of the network score: if the posterior densities match closely then this implies that the denominator (the marginal likelihood - the network score) must also be accurately estimated, as a ``gold standard'' estimate of the network score is generally unavailable for such non-conjugate models.

%\newpage
\section{Further insights into searching strategy} \label{sec5}
The key objective of the \pkg{abn} library is to enable estimation of statistical dependencies in data comprising multiple variables - that is, to find a DAG which is robust and representative of the dependency structure of the (unknown) stochastic system which generated the observed data.
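To give a sense of the scale of this task (a standard combinatorial result, not specific to \pkg{abn}): the number of labelled DAGs on $n$ nodes satisfies Robinson's recurrence
\[
a_n = \sum_{k=1}^{n} (-1)^{k+1} {n \choose k}\, 2^{k(n-k)}\, a_{n-k}, \qquad a_0 = 1,
\]
which already gives $a_4 = 543$ and roughly $4.2 \times 10^{18}$ DAGs for $n = 10$; for the $33$ variables in \rr{var33} the model space is astronomically large.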
The challenge here is that with such a vast model space it is impossible to enumerate all possible DAGs, and there may be very many different DAGs with similar goodness of fit. In the next sections we first consider searching for additive (non-conjugate) models.

\subsection{Single search for optimal additive BN model from categorical data}
To run a single search heuristic use \rr{search.hillclimber()}. This commences from a randomly created DAG, which is constructed by randomly adding arcs to an empty network until all possible arcs have been tried. The function \rr{search.hillclimber()} then searches stepwise from the initial random network for an improved structure, where three stepwise operations are possible: i) add an arc; ii) remove an arc; or iii) reverse an arc. The stepwise search is subject to a number of conditions: firstly, only moves that do not generate a cycle are permitted; secondly, a parent limit is imposed which fixes the maximum number of parents each child node can have (arcs go from parent to child); and thirdly, it is possible to ban or retain arcs. If provided, \rr{banned.m} is a matrix which defines arcs that are not allowed to be considered in the search process (or in the creation of the initial random network). Similarly, \rr{retain.m} includes arcs which must always be included in any model. It is also possible to specify an explicit starting matrix, \rr{start.m}; if using a retain matrix then \rr{start.m} should contain at least all those arcs present in \rr{retain.m}. Note that only very rudimentary checking is done to make sure that the ban, retain and start networks - if user supplied - are not contradictory.

To improve the computational performance of \rr{search.hillclimber()} a cache of all possible goodness of fit values must be built in advance, using the function \rr{buildscorecache()}. Rather than re-calculating the score for each individual node in the network (on the log scale the overall network score is the sum of the scores for the individual nodes), the score for each unique node found during the search is stored in the cache created by \rr{buildscorecache()}.

<>=
bin.nodes <- c( 1,3,4,6,9,10,11,12,15,18,19,20,21,26,27,28,32)
var33.cat <- var33[,bin.nodes] #Categorical nodes only
dag33 <- matrix( 0, 17, 17)
colnames(dag33) <- rownames(dag33) <- names(var33.cat)#Set names
@
Create banned and retain empty DAGs:
<>=
banned.cat <- matrix( 0, 17, 17)
colnames(banned.cat) <- rownames(banned.cat) <- names(var33.cat)
retain.cat <- matrix( 0, 17, 17)
colnames(retain.cat) <- rownames(retain.cat) <- names(var33.cat)
@
Set up the distribution list for each categorical node:
<>=
mydists.cat <- list( v1 = "binomial", v3 = "binomial",
     v4 = "binomial", v6 = "binomial", v9 = "binomial",
     v10 = "binomial", v11 = "binomial", v12 = "binomial",
     v15 = "binomial", v18 = "binomial", v19 = "binomial",
     v20 = "binomial", v21 = "binomial", v26 = "binomial",
     v27 = "binomial", v28 = "binomial", v32 = "binomial")
@
Build a cache of all the local computations; this information is needed later when running a model search:
<>=
mycache.cat <- buildscorecache( data.df=var33.cat,
                  data.dists=mydists.cat, dag.banned=banned.cat,
                  dag.retained=retain.cat, max.parents=1)
@
Running a single search heuristic for an additive BN uses \rr{search.hillclimber()}, with the parameter prior specifications detailed above.
Several additional arguments are available which relate to the numerical routines used in the Laplace approximation to calculate the network score. The defaults appear to work reasonably well in practice, and if it is not possible to calculate a robust value for this approximation for any model, for example due to a singular design matrix at one or more nodes, then the model is simply assigned a log network score of $-\infty$, which effectively removes it from the model search.

Run a single search heuristic for an additive BN:
<>=
heur.res.cat <- search.hillclimber( score.cache=mycache.cat,
                   num.searches=1, seed=0, verbose=FALSE,
                   trace=FALSE, timing.on=FALSE)
@
If \rr{trace=TRUE} is set, the majority consensus network is plotted as the searches progress.

\subsection{Single search for optimal additive BN model for continuous data} \label{sec4a}
As above, but for a network of Gaussian nodes.
<>=
var33.cts <- var33[,-bin.nodes] #Drop categorical nodes
dag33 <- matrix( 0, 16, 16)
colnames(dag33) <- rownames(dag33) <- names(var33.cts) #Set names
banned.cts <- matrix( 0, 16, 16)
colnames(banned.cts) <- rownames(banned.cts) <- names(var33.cts)
retain.cts <- matrix( 0, 16, 16)
colnames(retain.cts) <- rownames(retain.cts) <- names(var33.cts)
@
Set up the distribution list for each continuous node:
<>=
mydists.cts <- list( v2 = "gaussian", v5 = "gaussian",
     v7 = "gaussian", v8 = "gaussian", v13 = "gaussian",
     v14 = "gaussian", v16 = "gaussian", v17 = "gaussian",
     v22 = "gaussian", v23 = "gaussian", v24 = "gaussian",
     v25 = "gaussian", v29 = "gaussian", v30 = "gaussian",
     v31 = "gaussian", v33 = "gaussian")
@
Build a cache of all the local computations; this information is needed later when running a model search:
<>=
mycache.cts<- buildscorecache( data.df=var33.cts,
                  data.dists=mydists.cts, dag.banned=banned.cts,
                  dag.retained=retain.cts, max.parents=1)
@
Run a single search heuristic for an additive BN:
<>=
heur.res.cts<- search.hillclimber( score.cache=mycache.cts,
                  num.searches=1, seed=0, verbose=FALSE,
                  trace=FALSE, timing.on=FALSE)
@
If \rr{trace=TRUE} is set, the majority consensus network is plotted as the searches progress.

\subsection{Single search for optimal additive BN model for mixed data}

Model searching for mixed data is again very similar to the previous examples. The parameter priors can also be specified explicitly (the default values are as given above); note that in any such hyperparameter specification a \rr{+1} appears because a constant (intercept) term is included in the additive formulation for each node.
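That is, for a binomial node with $m$ parents the local model is the logistic regression
\[
\mathrm{logit}(p) = \beta_0 + \beta_1 x_1 + \cdots + \beta_m x_m,
\]
so each node carries $m+1$ regression parameters (hence the \rr{+1}), each with its own Gaussian prior.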
<>=
dag33 <- matrix( 0, 33, 33)
colnames(dag33) <- rownames(dag33) <- names(var33)#Set names
@
Create empty DAGs:
<>=
banned.mix <- matrix( 0, 33, 33)
colnames(banned.mix) <- rownames(banned.mix) <- names(var33)
retain.mix<- matrix( 0, 33, 33)
colnames(retain.mix) <- rownames(retain.mix) <- names(var33)
@
Set up the distribution list for each mixed node:
<>=
mydists.mix <- list( v1 = "binomial", v2 = "gaussian",
     v3 = "binomial", v4 = "binomial", v5 = "gaussian",
     v6 = "binomial", v7 = "gaussian", v8 = "gaussian",
     v9 = "binomial", v10 = "binomial", v11 = "binomial",
     v12 = "binomial", v13 = "gaussian", v14 = "gaussian",
     v15 = "binomial", v16 = "gaussian", v17 = "gaussian",
     v18 = "binomial", v19 = "binomial", v20 = "binomial",
     v21 = "binomial", v22 = "gaussian", v23 = "gaussian",
     v24 = "gaussian", v25 = "gaussian", v26 = "binomial",
     v27 = "binomial", v28 = "binomial", v29 = "gaussian",
     v30 = "gaussian", v31 = "gaussian", v32 = "binomial",
     v33 = "gaussian")
@
Build a cache of all the local computations; this information is needed later when running a model search:
<>=
mycache.mix <- buildscorecache( data.df=var33,
                  data.dists=mydists.mix, dag.banned=banned.mix,
                  dag.retained=retain.mix, max.parents=1)
@
Run a single search heuristic for an additive BN:
<>=
heur.res.mix <- search.hillclimber( score.cache=mycache.mix,
                   num.searches=1, seed=0, verbose=FALSE,
                   trace=FALSE, timing.on=FALSE)
@
If \rr{trace=TRUE} is set, the majority consensus network is plotted as the searches progress.

\subsection{Multiple Search Strategies}
To estimate a robust additive BN for a given dataset it is necessary to run many searches and then summarise the results of these searches. The function \rr{search.hillclimber()} with \rr{num.searches>1} runs multiple searches; a single joint node cache, built with \rr{buildscorecache()}, is used over all the searches.

Conceptually it may seem more efficient to use one global node cache to allow node information to be shared between different searches; however, in practice, as the search space is so vast for some problems, this can result in extremely \emph{slow} searches. As the cache becomes larger it can take much more time to search it (and it may need to be searched a very large number of times) than to simply perform the appropriate numerical computation. Profiling using the Google performance tool google-pprof suggests that more than 80\% of the computation time may be taken up by lookups. When starting searches from different random places in the model space, the number of individual node structures in common between any two searches, relative to the total number of different node structures searched over, can be very small, meaning a common node cache is inefficient. This may not be the case when the starting networks are relatively similar.

To help with performance monitoring it is possible to turn on timings using \rr{timing.on=TRUE}, which then outputs the number of seconds of CPU time each individual search takes (using standard libc functions declared in time.h).
<>=
dag33 <- matrix( 0, 33, 33)
colnames(dag33) <- rownames(dag33) <- names(var33) #Set names
@
Create empty DAGs:
<>=
banned.mix <- matrix( 0, 33, 33)
colnames(banned.mix)<- rownames(banned.mix)<- names(var33)
retain.mix <- matrix( 0, 33, 33)
colnames(retain.mix) <- rownames(retain.mix)<- names(var33)
@
Set up the distribution list for each mixed node:
<>=
mydists.mix <- list( v1 = "binomial", v2 = "gaussian",
     v3 = "binomial", v4 = "binomial", v5 = "gaussian",
     v6 = "binomial" , v7 = "gaussian", v8 = "gaussian",
     v9 = "binomial", v10 = "binomial", v11 = "binomial",
     v12 = "binomial", v13 = "gaussian", v14 = "gaussian",
     v15 = "binomial", v16 = "gaussian", v17 = "gaussian",
     v18 = "binomial", v19 = "binomial", v20 = "binomial",
     v21 = "binomial", v22 = "gaussian", v23 = "gaussian",
     v24 = "gaussian", v25 = "gaussian", v26 = "binomial",
     v27 = "binomial", v28 = "binomial", v29 = "gaussian",
     v30 = "gaussian", v31 = "gaussian", v32 = "binomial",
     v33 = "gaussian")
n.searches <- 10
@
The number $10$ is an example only; it must be much larger in practice. Set the parent limit:
<>=
max.par <- 1
@
We set a limit of only one parent because with higher limits \rr{buildscorecache()} takes some minutes. Now we build the cache:
<>=
mycache.mix <- buildscorecache( data.df=var33, data.dists=mydists.mix,
  dag.banned=banned.mix, dag.retained=retain.mix, max.parents=max.par)
@
Now run the multiple searches (set \rr{trace=TRUE} to have the majority consensus network plotted as the searches progress):
<>=
myres.mlp <- search.hillclimber(score.cache=mycache.mix,
                num.searches=n.searches, seed=0, verbose=FALSE,
                trace=FALSE, timing.on=FALSE)
@
\subsection{Creating a Summary Network: Majority Consensus}\label{mcn}
Having run many heuristic searches, the next challenge is to summarise these results to allow ready identification of the joint dependencies most supported by the data. One common and very simple approach is to produce a single robust BN model of the data, mimicking the approach used in phylogenetics to create majority consensus trees. A majority consensus DAG is constructed from all the arcs present in at least 50\% of the locally optimal DAGs found in the search heuristics. This creates a single summary network. Combining results from different runs of \rr{search.hillclimber()} is straightforward, although note that it is necessary to check for duplicate random starting networks, as while highly unlikely this is theoretically possible. The following code provides a simple way to produce a majority consensus network, and Figure~\ref{fig13} shows the resulting network - note that this is an example only and many thousands of searches may need to be conducted to achieve robust results. One simple ad hoc method for assessing how many searches are needed is to run a number of searches, split the results into two (random) groups, and calculate the majority consensus network within each group; if these are the same then it suggests that sufficient searches have been run (a minimal sketch of this check is given below). To plot the majority consensus network use the result of the function \rr{search.hillclimber}; see the example that follows.
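A minimal sketch of the split-half check just described. It assumes that the individual locally optimal DAGs from the searches are available as a list; the component name \rr{dags} below is an assumption and should be adapted to the structure of the object actually returned by \rr{search.hillclimber()} in the version of \pkg{abn} in use:
<>=
# Split the locally optimal DAGs into two random halves and compare
# the majority consensus network computed within each half.
dag.list <- myres.mlp$dags # assumed component name - adapt as needed
idx <- sample( seq_along(dag.list))
half <- floor(length(idx)/2)
half1 <- dag.list[ idx[1:half]]
half2 <- dag.list[ idx[(half+1):length(idx)]]
consensus.of <- function(dags){
  arc.counts <- Reduce(`+`, dags)
  1*( arc.counts >= length(dags)/2) # arcs in at least 50% of DAGs
}
identical( consensus.of(half1), consensus.of(half2))
# TRUE suggests sufficient searches have been run
@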
\begin{figure}[htb]
\includegraphics{dagcon}
\vspace{-1.0cm}
\caption{Example majority consensus network (from the results of only 10 searches)} \label{fig13}
\end{figure}

<>=
tographviz( dag= myres.mlp$consensus, data.df=var33,
            data.dists=mydists.mix, outfile="dagcon.dot") #Create file
@
\rr{dagcon.dot} can then be processed with Graphviz by typing in a Unix shell: ``dot -Tpdf dagcon.dot -o dagcon.pdf'', or by using the Graphviz GUI on Windows.

\subsection{Creating a Summary Network: Pruning}
Rather than using the majority consensus network as the most appropriate model of the data, an alternative approach is to choose the single best model found during a large number of searches. To determine whether sufficient heuristic searches have been run to provide reasonable coverage of all the features of the model landscape, again checking for a stable majority consensus network, as in Section~\ref{mcn}, seems a sensible approach. Once the best overall DAG has been identified, the next task is to check this model for over-fitting. Unlike the majority consensus network, which effectively ``averages'' over many different competing models and therefore should generally comprise only robust structural features, the DAG from a single model search is far more likely to contain some spurious features. When dealing with smaller data sets, say of several hundred observations, this is extremely likely, as can easily be demonstrated using simulated data. A simple assessment of overfitting can be made by comparing the number of arcs in the majority consensus network with the number of arcs in the best fitting model. We have found that in larger data sets the majority consensus and best fitting model can be almost identical, while in smaller data sets the best fitting models may have many more arcs - suggesting a degree of overfitting.

An advantage of choosing a DAG from an individual search is that, unlike averaging over lots of different structures as in the construction of a majority consensus network, the model chosen here has a structure which was actually found during a search across the model landscape. In contrast, the majority consensus network is a derived model which may never have been found during even an exhaustive search; indeed, it may even comprise contradictory features, as is a usual risk when averaging over different explanations (models) of data. In addition, a majority consensus network need not even be acyclic, although in practice this can be easily corrected by reversing one or more arcs to produce an appropriate DAG.

A simple compromise between the risk of over-fitting in choosing the single highest scoring DAG, and the risk of inappropriately averaging across distinct data generating processes, is to prune the highest scoring DAG using the majority consensus model. In short, an element-by-element multiplication of the highest scoring DAG and the majority consensus DAG gives a new DAG which contains only the structural features present in \emph{both} models.

\section{Summary}
The \pkg{abn} library provides a range of Bayesian network models to assist with identifying statistical dependencies in complex data, in particular models which are multidimensional analogues of generalised linear models. This process is typically referred to as structure learning, or structure discovery, and is computationally extremely challenging. Heuristic searches are the only option for data comprising larger numbers of variables.
-\section{Summary}
-The \pkg{abn} library provides a range of Bayesian network models to assist with identifying statistical dependencies in complex data, in particular models which are multidimensional analogues of generalised linear models. This process is typically referred to as structure learning, or structure discovery, and is computationally extremely challenging. Heuristics are the only option for data comprising larger numbers of variables. As with all model selection, over-modelling is an ever-present danger, and either i) using summary models comprising structural features present in many locally optimal models, or ii) using parametric bootstrapping to determine the robustness of the features in a single locally optimal model, is likely essential to obtain robust results. An alternative presented here was exact order based searches, in particular finding the globally most probable structure. This approach is appealing as it is exact, but despite collapsing DAGs into orderings, it may not be feasible for larger scale problems.
-For further in-depth analysis with \pkg{abn}, refer to the website: \href{http://www.r-bayesian-networks.org}{www.r-bayesian-networks.org}.
-
-\newpage
-\bibliography{abn}
-
-\end{document}
\ No newline at end of file
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/inst/doc/abn_v1.3.pdf and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/inst/doc/abn_v1.3.pdf differ
diff -Nru abn-1.2/inst/doc/abn_v1.3.pdf.asis abn-1.3/inst/doc/abn_v1.3.pdf.asis
--- abn-1.2/inst/doc/abn_v1.3.pdf.asis	1970-01-01 00:00:00.000000000 +0000
+++ abn-1.3/inst/doc/abn_v1.3.pdf.asis	2018-11-19 14:43:19.000000000 +0000
@@ -0,0 +1,2 @@
+%\VignetteIndexEntry{ABN - Vignette}
+%\VignetteEngine{R.rsp::asis}
diff -Nru abn-1.2/man/link_strength.Rd abn-1.3/man/link_strength.Rd
--- abn-1.2/man/link_strength.Rd	2018-08-02 13:04:41.000000000 +0000
+++ abn-1.3/man/link_strength.Rd	2018-11-20 11:43:17.000000000 +0000
@@ -20,7 +20,7 @@
 \usage{
 link.strength(dag.m = NULL,
-              data.df = data.df,
+              data.df = NULL,
               data.dists = NULL,
               method = c("mi.raw","mi.raw.pc","mi.corr","ls","ls.pc","stat.dist"),
               discretization.method = "doane")
diff -Nru abn-1.2/MD5 abn-1.3/MD5
--- abn-1.2/MD5	2018-08-10 12:40:02.000000000 +0000
+++ abn-1.3/MD5	2018-11-22 14:30:09.000000000 +0000
@@ -1,19 +1,19 @@
 3d0b152397a9871aec610367d9384ae8 *ChangeLog
-30bb25fefd8154b3e885a291c5715780 *DESCRIPTION
+584f092929e00c80a5c7a08188d5043c *DESCRIPTION
 0893d9c3ce60deabfc61af8fa36afc7a *NAMESPACE
 f0db51f7950e2cdb42607b002b5843f9 *R/RcppExports.R
 3aaa2182bb4ebcfc1535884363e15922 *R/abn-infotheo.R
 f106dfd056a9390f93d0248443ed98b1 *R/abn-internal.R
-4101e6aad8ceaf737aed0d6a91a75235 *R/abn-toolbox.R
+36837d8b22420fa3dc4a0380252949c3 *R/abn-toolbox.R
 3e70c342fbcad595228b36325170929f *R/build_score_cache.R
 709f982f33c3ddfe236b61e7e33c7a5e *R/build_score_cache_mle.R
 b48bdc6c72fe2ab565703394175404bf *R/calc_node_inla_glm.R
 f88fc90904fcad527c87f6e188584f3d *R/calc_node_inla_glmm.R
 dcf00a9a2fcfe77b59dcfaf0bb21d151 *R/fitabn.R
-0cc76aac47af392eec133ae0872d7c32 *R/fitabn_mle.R
+2f2c2db5f5feaa42250e55c13313bda8 *R/fitabn_mle.R
 675c5e6882e3cad98ada118092137ba1 *R/getmarginals.R
-6eb5fa7ce9fabece83a127f76c45c560 *R/heuristic_search.R
-f6817acba3b33e24a7f52e89b8897bc3 *R/link-strength.R
+19eb342fbd2da905bfaa318c740155b6 *R/heuristic_search.R
+d54aec5abd753f7612562c45c6066c1c *R/link-strength.R
 1951b84db135a2ea066ff90be3c5b148 *R/markov-blanket.R
 58e590169fd38cc99471dfe1fbf8f3a4 *R/mostprobable.R
 da799c0dfcb5f9436f19a72804c175c8 *R/plot-abn.R
@@ -21,7 +21,7 @@
 83f0e3e657b4461afbaa3d51fb4cd678 *R/simulate-abn.R
 1c79cc8716cda48038ef57a7d090f148 *R/tographviz.R
 b2b22ab9fa6ca6f6855528640b77023d *README.md
-216a01c0451d2026cfd5430ac95d0d91 *build/vignette.rds
+edfc9ebfcf55328d6e33f419555482e0 *build/vignette.rds
 67c9db5dc1a8e12ccf5880ffe206ce56 *cleanup
 a607b1af0d1a08fe814185008930f29b *configure
 ee1e7fa4489f28e41f9b80c26037e37b *configure.ac
@@ -35,7 +35,7 @@
 166b261826eb76072e84995d8b05d8e8 *data/ex7data.RData
ad7cc249cc3b4bc990671b8f037e1f57 *data/pigs.vienna.RData 256a987f46ef4559ee4e35ca9377ea30 *data/var33.RData -7c728b294b5bb0b62f988b63d807b874 *inst/CITATION +a7c14ee19e8ec6cefc10420004190ca3 *inst/CITATION bdd0d6cda099cd898fb5b6cddfdff3cd *inst/bootstrapping_example/10KBootstrapping.R 590062e05ff21cc0ab91abf7f18e442f *inst/bootstrapping_example/10Kbootstrapping.bash 8595e1a2b9a5b568c8256b3841df0f33 *inst/bootstrapping_example/MargPlots_PigsData.pdf @@ -51,9 +51,8 @@ 1ec8c6aa3e4d72389d645716356efebb *inst/bootstrapping_example/pigs_post_params.R 48b6d9f03b7f95878151d74b81193745 *inst/bootstrapping_example/script1.R 0671a342cc3e85e14ef51eb30c4cb56a *inst/bootstrapping_example/simulate_1par.bug -46b7999e87e39327021813dea3c1de29 *inst/doc/abn_v1.0.2.R -a73a52ab21bc415f0a2ad08c5b703b24 *inst/doc/abn_v1.0.2.Rnw -221cd4117aebafa21a668e02f63f5b4c *inst/doc/abn_v1.0.2.pdf +79c3694e62338c078ad33d891db32881 *inst/doc/abn_v1.3.pdf +8e2174087211100a4ffea15ee515e19d *inst/doc/abn_v1.3.pdf.asis 55691f86625075f62c18de7019bf0af2 *man/abninla-internal.Rd 3bb0a82b0ebbc14ed7e0b8e50e118403 *man/build_score_cache.Rd b2c462b48b840eb4ca74e846edf9621d *man/build_score_cache_mle.Rd @@ -74,7 +73,7 @@ 8d64b52aec735208b7cb5078080d9e5c *man/fitabn_mle.Rd 5fe412237e412f299865a6d9b9ed54b3 *man/heuristic_search.Rd 8f5c77af2892cd8bf33b633798a24082 *man/infoDag.Rd -31fb14d852bec13c7d7c08bac44c7ef8 *man/link_strength.Rd +0054060ec132276b51b1bae9a3bc4b76 *man/link_strength.Rd 7dae8e6fd9586639cdd7df1b8490260a *man/logit.Rd c142f1930bf97322483bddad49e4b953 *man/mb.Rd 0df18014f7a3d1bb263edfa952746117 *man/miData.Rd @@ -90,7 +89,7 @@ e169857bb5fa5838c9123cf619ebd294 *src/Makevars.in 8bfbf7a9b23a9469fb89f604d3069e90 *src/Makevars.win 249718ff2342383d10a56a53246b175b *src/RcppExports.cpp -7542d6331cee938b5490878bf7650590 *src/abn_toolbox.cpp +64a743c4fb75bc3eec6a56bd2667fa8f *src/abn_toolbox.cpp 3bd8a187b67740d0a545b89e782dcd6c *src/buildcachematrix.c 03fa8f4ee9431f283a1cd72d7ad456a4 *src/buildcachematrix.h 9f723636b1fdf6d3623b583c2a1777db *src/cycles.c @@ -106,8 +105,8 @@ a455e6e2a728d7941ae6cbd6bb6171e3 *src/irls_gaussian.cpp 20e1e3970144a08ae197e4da0f5aee5e *src/irls_gaussian_fast.cpp d41d8cd98f00b204e9800998ecf8427e *src/irls_multinomial.cpp -0ad4e8d27adb77c62b5b7e75d9528cef *src/irls_poisson.cpp -97a5497665846844c779bc872512b7a7 *src/irls_poisson_fast.cpp +90b7fe101234cf556ecf6150c1a275eb *src/irls_poisson.cpp +7dc410528e1dde8f0e2ff2cb6407c48d *src/irls_poisson_fast.cpp 1522c9181ed3483e1c38768d4162025a *src/mi.cpp bcc01ae8c17227fe3b97f082d8b9d994 *src/mobius.c 8cf1430fd00296d57e99f7b9af10bec6 *src/mobius.h @@ -145,27 +144,4 @@ 9452d13c98f64e143ecfd4bb5c83d2f6 *tests/testthat/test.R 020c08f2bbecf5867806bbb66f7d9a2f *tests/testthat/testdata/buildscorecache_ex1.Rdata 2980ac038ce41407030f522db8e162fa *tests/testthat/testdata/fitabn_ex0.Rdata -8d8c3b38738d83a01586f21db16a35a4 *vignettes/Bootstrapping.png -540b610a1ecc3c31cf91bd9c9daa8747 *vignettes/ComparisonOfNetworkScore.pdf -3c7c89e5a700f7341b836086f7e6cebc *vignettes/DAG_cycle.pdf -8595e1a2b9a5b568c8256b3841df0f33 *vignettes/MargPlots_PigsData.pdf -a0275e92692042a3a3c7dc2e2e227665 *vignettes/PigsArea.png -68e948e277bf8ba8320b3a3efa2ff03d *vignettes/Pigs_PostBootPlots.pdf -d92ce4b0e2efa3e2d82e2a7f277b8a9c *vignettes/Summary.png -15a80dc3c2707e0b538df36797fb096d *vignettes/abn.bib -a90980520d8d40b79b9bf2588ac697c1 *vignettes/abn_v1.0.2-concordance.tex -a73a52ab21bc415f0a2ad08c5b703b24 *vignettes/abn_v1.0.2.Rnw -eb27b9008e47b2dbfde6cfa20f296938 *vignettes/dagcon.pdf 
-2fb39b851e85388b1515db6bfb9e7780 *vignettes/dagcon.png -83f998e7348c4a86da5140de5c91388a *vignettes/map1_10var.dot -ef7fa22bd9592187230d7ccde81d81c5 *vignettes/map1_10var.pdf -a380d79cf49e1724ba890465c4ed0b46 *vignettes/map1_10var.png -1c62c2e74c0cde8a58c9f47c64ba50b9 *vignettes/map_1par.dot -beb58cf7b79c881362576a4917f9b250 *vignettes/map_1par.pdf -6bf7f4e1ac4828e197927fb990d5f3ae *vignettes/mydag.dot -e00af17e695a21b679439c87499fe615 *vignettes/mydag.pdf -962e20e460e9e63f3aed32ef8812707f *vignettes/mydag_all.pdf -5a75c927097afd12fdff01e11381e812 *vignettes/mydagcts.pdf -ef1701d87dc5102c5075d82914455107 *vignettes/plbinaryNode.png -43ed69fc44caa66a1118bd020126bb51 *vignettes/postbootpigs.pdf -94bc5f87bf9ec5ef1daa5959498b4934 *vignettes/var33_MASTER.pdf +8e2174087211100a4ffea15ee515e19d *vignettes/abn_v1.3.pdf.asis diff -Nru abn-1.2/R/abn-toolbox.R abn-1.3/R/abn-toolbox.R --- abn-1.2/R/abn-toolbox.R 2018-08-02 13:04:41.000000000 +0000 +++ abn-1.3/R/abn-toolbox.R 2018-10-22 10:18:39.000000000 +0000 @@ -374,4 +374,11 @@ return(moral) } } + +##------------------------------------------------------------------------- +##External functions for MCMC +##------------------------------------------------------------------------- + + + ##EOF \ No newline at end of file diff -Nru abn-1.2/R/fitabn_mle.R abn-1.3/R/fitabn_mle.R --- abn-1.2/R/fitabn_mle.R 2018-08-02 13:04:41.000000000 +0000 +++ abn-1.3/R/fitabn_mle.R 2018-10-15 14:11:47.000000000 +0000 @@ -129,20 +129,7 @@ num.na<-fit[[2]] fit<-fit[[1]] }else{ - - - - - - # num.na<-0 - # if(qr(X)$rank/ncol(X)!=1 & as.character(data.dists[i])=="binomial"){ - # - # Y<-as.numeric(as.character(Y)) - # - # fit<-irls_binomial_cpp_br(A = X, b = Y, maxit = 25,tol = 10^-8) - # - # }else{ - + switch(as.character(data.dists[i]), "binomial"={ diff -Nru abn-1.2/R/heuristic_search.R abn-1.3/R/heuristic_search.R --- abn-1.2/R/heuristic_search.R 2018-08-02 13:04:41.000000000 +0000 +++ abn-1.3/R/heuristic_search.R 2018-10-08 10:19:36.000000000 +0000 @@ -67,7 +67,10 @@ score.init <- vector(mode = "numeric",length = n.var) - sc<-cbind(score.cache$node.defn[,as.numeric(colnames(dag.tmp))],score.cache$mlik) + if(score %in% c("bic", "aic", "mdl")){sc<-cbind(score.cache$node.defn[,as.numeric(colnames(dag.tmp))],-score.cache[[score]]) + }else{ + sc<-cbind(score.cache$node.defn[,as.numeric(colnames(dag.tmp))],score.cache[[score]]) + } for(lines in 1:n.var){ @@ -239,15 +242,29 @@ ##return - if(verbose){ - + + ##todo: inverse detailed score for aic/bic/mdl + + if(score %in% c("bic","aic","mdl")){ + if(verbose){ return(list(dags = out.dags, - scores = out.scores, - detailed.score = out.detailed)) + scores = lapply(X = out.scores,FUN = function(x){-x}), + detailed.score = lapply(X = out.detailed,FUN = function(x){-x}))) }else{ return(list(dag = out.dags[[which.max(unlist(out.scores))]], - score = max(unlist(out.scores)))) + score = -max(unlist(out.scores)))) + } + }else{ + if(verbose){ + return(list(dags = out.dags, + scores = out.scores, + detailed.score = out.detailed)) + }else{ + + return(list(dag = out.dags[[which.max(unlist(out.scores))]], + score = max(unlist(out.scores)))) + } } }#eof \ No newline at end of file diff -Nru abn-1.2/R/link-strength.R abn-1.3/R/link-strength.R --- abn-1.2/R/link-strength.R 2018-08-02 13:04:41.000000000 +0000 +++ abn-1.3/R/link-strength.R 2018-09-20 06:39:19.000000000 +0000 @@ -9,7 +9,7 @@ ##Function that return estimation of link strength for a Bayesian Network ##------------------------------------------------------------------------- 
-link.strength <- function(dag.m=NULL,data.df = data.df, data.dists=NULL, method=c("mi.raw","mi.raw.pc","mi.corr","ls","ls.pc","stat.dist"), discretization.method="doane"){
+link.strength <- function(dag.m=NULL,data.df = NULL, data.dists = NULL, method = c("mi.raw","mi.raw.pc","mi.corr","ls","ls.pc","stat.dist"), discretization.method="doane"){
 
 ##for compatibility purpose
 dag <- dag.m
diff -Nru abn-1.2/src/abn_toolbox.cpp abn-1.3/src/abn_toolbox.cpp
--- abn-1.2/src/abn_toolbox.cpp	2018-08-10 09:44:48.000000000 +0000
+++ abn-1.3/src/abn_toolbox.cpp	2018-11-22 13:17:53.000000000 +0000
@@ -15,7 +15,7 @@
 	Rcpp::NumericVector result(n);
 
 	for(int i = 0; i < n; ++i) {
-		result[i] = log( x[i] / (1.0 - x[i]) );
+		result[i] = std::log(static_cast<double>( x[i] / (1.0 - x[i]) ));
 	}
 
 	return result;
@@ -29,7 +29,7 @@
 	Rcpp::NumericVector result(n);
 
 	for (int i=0; i < n; ++i) {
-		result[i] = 1.0 / (1.0 + exp (-1.0 * x[i]));
+		result[i] = 1.0 / (1.0 + std::exp(static_cast<double>( (-1.0 * x[i]))));
 	}
 	return result;
 }
diff -Nru abn-1.2/src/irls_poisson.cpp abn-1.3/src/irls_poisson.cpp
--- abn-1.2/src/irls_poisson.cpp	2018-08-10 09:44:48.000000000 +0000
+++ abn-1.3/src/irls_poisson.cpp	2018-11-22 13:17:53.000000000 +0000
@@ -6,7 +6,7 @@
 using namespace R;
 
 // [[Rcpp::export]]
-int factorial(int n)
+double factorial(int n)
 {
 	return (n == 1 || n == 0) ? 1 : factorial(n - 1) * n;
 }
diff -Nru abn-1.2/src/irls_poisson_fast.cpp abn-1.3/src/irls_poisson_fast.cpp
--- abn-1.2/src/irls_poisson_fast.cpp	2018-08-10 09:44:48.000000000 +0000
+++ abn-1.3/src/irls_poisson_fast.cpp	2018-11-22 13:17:53.000000000 +0000
@@ -6,7 +6,7 @@
 using namespace R;
 
 // [[Rcpp::export]]
-int factorial_fast(int n)
+double factorial_fast(int n)
 {
 	return (n == 1 || n == 0) ? 1 : factorial_fast(n - 1) * n;
 }
diff -Nru abn-1.2/vignettes/abn.bib abn-1.3/vignettes/abn.bib
--- abn-1.2/vignettes/abn.bib	2018-08-02 13:04:43.000000000 +0000
+++ abn-1.3/vignettes/abn.bib	1970-01-01 00:00:00.000000000 +0000
@@ -1,942 +0,0 @@
-% This file was created with JabRef 2.10b2.
-% Encoding: UTF-8
-
-
-@Article{Babyak2004,
-  Title = {What you see may not be what you get: a brief, nontechnical introduction to overfitting in regression-type models},
-  Author = {Babyak, Michael A.},
-  Journal = {Psychosomatic Medicine},
-  Year = {2004},
-  Volume = {66},
-  Number = {3},
-  Pages = {411--421}
-}
-
-@PhdThesis{Boettcher2004,
-  Title = {Learning Bayesian networks with mixed variables},
-  Author = {Boettcher, S. G.},
-  School = {Aalborg University, Department of Mathematical Sciences},
-  Year = {2004}
-}
-
-@InProceedings{Buntine1991,
-  Title = {Theory refinement on Bayesian networks},
-  Author = {Buntine, W.},
-  Booktitle = {Uncertainty in Artificial Intelligence: Proceedings of the Seventh Conference},
-  Year = {1991},
-  Pages = {52--60},
-  Publisher = {Morgan Kaufmann Publishers Inc.}
-}
-
-@Article{COOPER1992,
-  Title = {A Bayesian method for the induction of probabilistic networks from data},
-  Author = {Cooper, G. F. and Herskovits, E.},
-  Journal = {Machine Learning},
-  Year = {1992},
-  Volume = {9},
-  Number = {4},
-  Pages = {309--347}
-}
-
-@Article{Djebbari2008,
-  Title = {Seeded Bayesian networks: constructing genetic networks from microarray data},
-  Author = {Djebbari, A. and Quackenbush, J.},
-  Journal = {BMC Systems Biology},
-  Year = {2008},
-  Volume = {2},
-  Pages = {57}
-}
-
-@Article{Dojer2006,
-  Title = {Applying dynamic Bayesian networks to perturbed gene expression data},
-  Author = {Dojer, N. and Gambin, A. and Mizera, A. and Wilczynski, B. and Tiuryn, J.},
-  Journal = {BMC Bioinformatics},
-  Year = {2006},
-  Volume = {7},
-  Pages = {249}
-}
-
-@Article{Firestone2013,
-  Title = {Understanding the associations between on-farm biosecurity practice and equine influenza infection during the 2007 outbreak in Australia},
-  Author = {Firestone, Simon M. and Lewis, Fraser I. and Schemann, Kathrin and Ward, Michael P. and Toribio, Jenny Ann and Dhand, Navneet K.},
-  Journal = {Preventive Veterinary Medicine},
-  Year = {2013},
-  Volume = {110},
-  Number = {1},
-  Pages = {28--36}
-}
-
-@Article{Firestone2014,
-  Title = {Applying Bayesian network modelling to understand the links between on-farm biosecurity practice during the 2007 equine influenza outbreak and horse managers' perceptions of a subsequent outbreak},
-  Author = {Firestone, Simon M. and Lewis, Fraser I. and Schemann, Kathrin and Ward, Michael P. and Toribio, Jenny Ann and Taylor, Melanie R. and Dhand, Navneet K.},
-  Journal = {Preventive Veterinary Medicine},
-  Year = {2014},
-  Volume = {116},
-  Number = {3},
-  Pages = {243--251}
-}
-
-@InProceedings{Friedman1999,
-  Title = {Data analysis with Bayesian networks: a Bootstrap approach},
-  Author = {Friedman, N. and Goldszmidt, M. and Wyner, A.},
-  Booktitle = {Proceedings of the Fifteenth Conference on Uncertainty in Artificial Intelligence (UAI'99)},
-  Year = {1999},
-  Pages = {206--215},
-  Publisher = {Morgan Kaufmann}
-}
-
-@Article{Friedman2003,
-  Title = {Being Bayesian about network structure. A Bayesian approach to structure discovery in Bayesian networks},
-  Author = {Friedman, N. and Koller, D.},
-  Journal = {Machine Learning},
-  Year = {2003},
-  Volume = {50},
-  Number = {1-2},
-  Pages = {95--125}
-}
-
-@InProceedings{Geiger1994,
-  Title = {Learning Gaussian networks},
-  Author = {Geiger, Dan and Heckerman, David},
-  Booktitle = {Proceedings of the Tenth Conference on Uncertainty in Artificial Intelligence (UAI'94)},
-  Year = {1994},
-  Pages = {235--243},
-  Publisher = {Morgan Kaufmann Publishers Inc.}
-}
-
-@Article{HECKERMAN1995,
-  Title = {Learning Bayesian networks: the combination of knowledge and statistical data},
-  Author = {Heckerman, D. and Geiger, D. and Chickering, D. M.},
-  Journal = {Machine Learning},
-  Year = {1995},
-  Volume = {20},
-  Number = {3},
-  Pages = {197--243}
-}
-
-@Article{Hodges2010,
-  Title = {Bayesian network expansion identifies new ROS and biofilm regulators},
-  Author = {Hodges, A. P. and Dai, D. J. and Xiang, Z. S. and Woolf, P. and Xi, C. W. and He, Y. Q.},
-  Journal = {PLOS One},
-  Year = {2010},
-  Volume = {5},
-  Number = {3},
-  Pages = {e9513}
-}
-
-@Article{Jansen2003,
-  Title = {A Bayesian networks approach for predicting protein-protein interactions from genomic data},
-  Author = {Jansen, R. and Yu, H. Y. and Greenbaum, D. and Kluger, Y. and Krogan, N. J. and Chung, S. B. and Emili, A. and Snyder, M. and Greenblatt, J. F. and Gerstein, M.},
-  Journal = {Science},
-  Year = {2003},
-  Volume = {302},
-  Number = {5644},
-  Pages = {449--453}
-}
-
-@Book{Jensen2001,
-  Title = {Bayesian networks and decision graphs},
-  Author = {Jensen, F. V.},
-  Publisher = {Springer-Verlag, New York},
-  Year = {2001}
-}
-
-@Article{Koivisto2004,
-  Title = {Exact Bayesian structure discovery in Bayesian networks},
-  Author = {Koivisto, M. and Sood, K.},
-  Journal = {Journal of Machine Learning Research},
-  Year = {2004},
-  Volume = {5},
-  Pages = {549--573}
-}
-
-@Book{Lauritzen1996,
-  Title = {Graphical Models},
-  Author = {Lauritzen, S. L.},
-  Publisher = {Oxford University Press, New York},
-  Year = {1996}
-}
-
-@Article{Lewis2013,
-  Title = {Improving epidemiologic data analyses through multivariate regression modelling},
-  Author = {Lewis, Fraser and Ward, Michael},
-  Journal = {Emerging Themes in Epidemiology},
-  Year = {2013},
-  Volume = {10},
-  Number = {1},
-  Pages = {4}
-}
-
-@InProceedings{Lewis2012b,
-  Title = {Bayesian networks as a tool for epidemiological systems analysis},
-  Author = {Lewis, F. I.},
-  Booktitle = {9th International Conference on Mathematical Problems in Engineering, Aerospace and Sciences (ICNPAA 2012)},
-  Editor = {Sivasundaram, S.},
-  Series = {AIP Conference Proceedings},
-  Volume = {1493},
-  Year = {2012},
-  Pages = {610--617}
-}
-
-@Article{Lewis2011,
-  Title = {Structure discovery in Bayesian networks: an analytical tool for analysing complex animal health data},
-  Author = {Lewis, F. I. and Brulisauer, F. and Gunn, G. J.},
-  Journal = {Preventive Veterinary Medicine},
-  Year = {2011},
-  Volume = {100},
-  Number = {2},
-  Pages = {109--115}
-}
-
-@Article{Lewis2012,
-  Title = {Revealing the complexity of health determinants in resource-poor settings},
-  Author = {Lewis, F. I. and McCormick, B. J. J.},
-  Journal = {American Journal of Epidemiology},
-  Year = {2012},
-  Volume = {176},
-  Number = {11},
-  Pages = {1051--1059}
-}
-
-@Article{Ludwig2013,
-  Title = {Identifying associations in Escherichia coli antimicrobial resistance patterns using additive Bayesian networks},
-  Author = {Ludwig, Antoinette and Berthiaume, Philippe and Boerlin, Patrick and Gow, Sheryl and L{\'e}ger, David and Lewis, Fraser I.},
-  Journal = {Preventive Veterinary Medicine},
-  Year = {2013},
-  Volume = {110},
-  Number = {1},
-  Pages = {64--75}
-}
-
-@Article{Lycett2009,
-  Title = {Detection of mammalian virulence determinants in highly pathogenic avian influenza H5N1 viruses: multivariate analysis of published data},
-  Author = {Lycett, S. J. and Ward, M. J. and Lewis, F. I. and Poon, A. F. Y. and Pond, S. L. K. and Brown, A. J. L.},
-  Journal = {Journal of Virology},
-  Year = {2009},
-  Volume = {83},
-  Number = {19},
-  Pages = {9901--9910}
-}
-
-@Article{MACKAY1992,
-  Title = {Bayesian interpolation},
-  Author = {Mackay, D. J. C.},
-  Journal = {Neural Computation},
-  Year = {1992},
-  Volume = {4},
-  Number = {3},
-  Pages = {415--447}
-}
-
-@Article{McCormick2013,
-  Title = {Using Bayesian networks to explore the role of weather as a potential determinant of disease in pigs},
-  Author = {McCormick, B. J. J. and Sanchez-Vazquez, M. J. and Lewis, F. I.},
-  Journal = {Preventive Veterinary Medicine},
-  Year = {2013},
-  Volume = {110},
-  Number = {1},
-  Pages = {54--63}
-}
-
-@Article{Needham2007,
-  Title = {A primer on learning in Bayesian networks for computational biology},
-  Author = {Needham, C. J. and Bradford, J. R. and Bulpitt, A. J. and Westhead, D. R.},
-  Journal = {PLOS Computational Biology},
-  Year = {2007},
-  Volume = {3},
-  Number = {8},
-  Pages = {e129}
-}
-
-@Article{Plummer2003,
-  Title = {JAGS: a program for analysis of Bayesian graphical models using Gibbs sampling},
-  Author = {Plummer, M.},
-  Journal = {Proceedings of the 3rd International Workshop on Distributed Statistical Computing},
-  Year = {2003}
-}
-
-@Article{Plummer2006,
-  Title = {CODA: convergence diagnosis and output analysis for MCMC},
-  Author = {Martyn Plummer and Nicky Best and Kate Cowles and Karen Vines},
-  Journal = {R News},
-  Year = {2006},
-  Volume = {6},
-  Number = {1},
-  Pages = {7--11}
-}
-
-@Article{Poon2008,
-  Title = {Spidermonkey: rapid detection of co-evolving sites using Bayesian graphical models},
-  Author = {Poon, A. F. Y. and Lewis, F. I. and Frost, S. D. W. and Pond, S. L. K.},
-  Journal = {Bioinformatics},
-  Year = {2008},
-  Volume = {24},
-  Number = {17},
-  Pages = {1949--1950}
-}
-
-@Article{Poon2007,
-  Title = {Evolutionary interactions between N-linked glycosylation sites in the HIV-1 envelope},
-  Author = {Poon, A. F. Y. and Lewis, F. I. and Pond, S. L. K. and Frost, S. D. W.},
-  Journal = {PLOS Computational Biology},
-  Year = {2007},
-  Volume = {3},
-  Number = {1},
-  Pages = {e11}
-}
-
-@Article{Poon2007a,
-  Title = {An evolutionary-network model reveals stratified interactions in the V3 loop of the HIV-1 envelope},
-  Author = {Poon, A. F. Y. and Lewis, F. I. and Pond, S. L. K. and Frost, S. D. W.},
-  Journal = {PLOS Computational Biology},
-  Year = {2007},
-  Volume = {3},
-  Number = {11},
-  Pages = {e231}
-}
-
-@Article{Rijmen2008,
-  Title = {Bayesian networks with a logistic regression model for the conditional probabilities},
-  Author = {Rijmen, F.},
-  Journal = {International Journal of Approximate Reasoning},
-  Year = {2008},
-  Volume = {48},
-  Number = {2},
-  Pages = {659--666}
-}
-
-@Article{Sanchez-Vazquez2012,
-  Title = {Identifying associations between pig pathologies using a multi-dimensional machine learning methodology},
-  Author = {Sanchez-Vazquez, Manuel and Nielen, Mirjam and Edwards, Sandra and Gunn, George and Lewis, Fraser},
-  Journal = {BMC Veterinary Research},
-  Year = {2012},
-  Volume = {8},
-  Number = {1},
-  Pages = {151}
-}
-
-@Article{Schemann2013,
-  Title = {Untangling the complex inter-relationships between horse managers' perceptions of effectiveness of biosecurity practices using Bayesian graphical modelling},
-  Author = {Schemann, Kathrin and Lewis, Fraser I. and Firestone, Simon M. and Ward, Michael P. and Toribio, Jenny-Ann L. M. L. and Taylor, Melanie R. and Dhand, Navneet K.},
-  Journal = {Preventive Veterinary Medicine},
-  Year = {2013},
-  Volume = {110},
-  Number = {1},
-  Pages = {37--44}
-}
-
-@Article{Ward2013,
-  Title = {Bayesian graphical modelling: applications in veterinary epidemiology},
-  Author = {Ward, Michael P. and Lewis, Fraser I.},
-  Journal = {Preventive Veterinary Medicine},
-  Year = {2013},
-  Volume = {110},
-  Number = {1},
-  Pages = {1--3}
-}
-
-@Article{Wilson2013,
-  Title = {Use of a Bayesian network model to identify factors associated with the presence of the tick Ornithodoros erraticus on pig farms in southern Portugal},
-  Author = {Wilson, Anthony J. and Ribeiro, Rita and Boinas, Fernando},
-  Journal = {Preventive Veterinary Medicine},
-  Year = {2013},
-  Volume = {110},
-  Number = {1},
-  Pages = {45--53}
-}
-
diff -Nru abn-1.2/vignettes/abn_v1.0.2-concordance.tex abn-1.3/vignettes/abn_v1.0.2-concordance.tex
--- abn-1.2/vignettes/abn_v1.0.2-concordance.tex	2018-08-02 13:04:43.000000000 +0000
+++ abn-1.3/vignettes/abn_v1.0.2-concordance.tex	1970-01-01 00:00:00.000000000 +0000
@@ -1,11 +0,0 @@
-\Sconcordance{concordance:abn_v1.0.2.tex:abn_v1.0.2.Rnw:%
-1 175 1 51 0 7 1 4 0 5 1 4 0 4 1 4 0 12 1 4 0 7 1 4 0 8 1 4 0 6 1 %
-4 0 9 1 4 0 5 1 4 0 21 1 4 0 4 1 4 0 28 1 4 0 22 1 4 0 18 1 4 0 12 %
-1 4 0 19 1 4 0 32 1 4 0 13 1 4 0 5 1 4 0 11 1 4 0 16 1 4 0 4 1 4 0 %
-6 1 4 0 7 1 4 0 92 1 4 0 22 1 4 0 15 1 4 0 28 1 4 0 9 1 4 0 24 1 4 %
-0 54 1 4 0 16 1 4 0 88 1 8 0 4 1 4 0 11 1 4 0 7 1 7 0 10 1 4 0 4 1 %
-7 0 5 1 4 0 17 1 4 0 9 1 4 0 5 1 4 0 7 1 7 0 10 1 4 0 7 1 8 0 15 1 %
-4 0 15 1 4 0 5 1 4 0 4 1 7 0 29 1 4 0 7 1 9 0 26 1 4 0 7 1 4 0 9 1 %
-4 0 6 1 4 0 14 1 4 0 14 1 4 0 9 1 4 0 7 1 4 0 6 1 4 0 10 1 4 0 7 1 %
-4 0 15 1 4 0 7 1 4 0 6 1 4 0 15 1 4 0 7 1 4 0 16 1 4 0 5 1 4 0 6 1 %
-4 0 7 1 4 0 14 1 4 0 17 1}
diff -Nru abn-1.2/vignettes/abn_v1.0.2.Rnw abn-1.3/vignettes/abn_v1.0.2.Rnw
--- abn-1.2/vignettes/abn_v1.0.2.Rnw	2018-08-02 13:04:43.000000000 +0000
+++ abn-1.3/vignettes/abn_v1.0.2.Rnw	1970-01-01 00:00:00.000000000 +0000
@@ -1,1205 +0,0 @@
-
-% -*- mode: noweb; noweb-default-code-mode: R-mode; -*-
-\documentclass[nojss]{jss}
-%\include{graphicx}
-%\usepackage[nogin]{Sweave}
-
-\usepackage{multicol} % This is so we can have multiple columns of text side-by-side
-\columnsep=100pt % This is the amount of white space between the columns in the poster
-\columnseprule=3pt % This is the thickness of the black line between the columns in the poster
-
-\usepackage[svgnames]{xcolor} % Specify colors by their 'svgnames', for a full list of all colors available see here: http://www.latextemplates.com/svgnames-colors
-
-\usepackage{times} % Use the times font
-%\usepackage{palatino} % Uncomment to use the Palatino font
-
-% Package Imported from Exo/Make:
-\usepackage{natbib}
-\usepackage{calc}
-\usepackage{url}
-%\usepackage[colorlinks=true,pdftex]{hyperref}
-\usepackage{bm}
-\usepackage{rotate}
-
-%\usepackage{graphics}
-%\usepackage{epstopdf}
-\usepackage{graphicx} % Required for including images
-%\graphicspath{{figures/}} % Location of the graphics files
-\DeclareGraphicsExtensions{.pdf,.png,.jpg}
-\usepackage{booktabs} % Top and bottom rules for table
-\usepackage[font=small,labelfont=bf]{caption} % Required for specifying captions to tables and figures
-\usepackage{amsfonts, amsmath, amsthm, amssymb} % For math fonts, symbols and environments
-\usepackage{wrapfig} % Allows wrapping text around tables and figures
-
-\parindent0pt
-%\DefineVerbatimEnvironment{Sinput}{Verbatim}{fontshape=sl,formatcom=\color{DarkGreen} Blue,fontsize=\relsize{-1}}
-\DefineVerbatimEnvironment{Sinput}{Verbatim}{fontshape=sl,
-formatcom=\color{DarkGreen}}
-\newcommand{\rr}[1]{\texttt{\color{DarkGreen} #1}}
-\definecolor{DarkGreen}{rgb}{0,0.39216,0}
-
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%% declarations for jss.cls - journal of statistical software
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\author{Gilles Kratzer, Marta Pittavino, Fraser Lewis, Reinhard Furrer}
-\title{\textit{abn}: an \proglang{R} package for modelling multivariate
-data using additive Bayesian networks}
-
-%\VignetteIndexEntry{A vignette for the abn package}
-%% for pretty printing and a nice hypersummary also set:
-%% \Plainauthor{, Second Author} %% comma-separated
-\Plaintitle{Multidimensional Bayesian regression in R}
-\Shorttitle{The \pkg{abn} package}
-
-\Abstract{
-This vignette describes the \proglang{R} package \pkg{abn} which provides functionality for identifying statistical dependencies in complex multivariate data using additive Bayesian network (ABN) models. This methodology is ideally suited for both univariate analysis - one response variable and multiple explanatory variables - and multivariate analysis, where in both cases all statistical dependencies between all variables in the data are sought. ABN models comprise directed acyclic graphs (DAGs) where each node in the graph comprises a generalized linear model. Model search algorithms are used to identify those DAG structures most supported by the data. Currently implemented are models for data comprising categorical, count and/or continuous variables. Further relevant information about \pkg{abn} can be found at: \href{http://www.r-bayesian-networks.org}{www.r-bayesian-networks.org}.
-}
-
-% \Keywords{\proglang{R}, Bayesian Networks, additive models, structure discovery}
-% \Plainkeywords{Bayesian Networks, additive models, structure discovery}
-\Keywords{Graphical models, additive models, structure learning, exact order based search, parametric bootstrapping, JAGS, MCMC, parameter learning, heuristic search}
-\Plainkeywords{Graphical models, additive models, structure learning, exact order based search, parametric bootstrapping, JAGS, MCMC, parameter learning, heuristic search}
-
-\Address{
-  Gilles Kratzer\\
-  Institute of Mathematics, University of Zurich\\
-  Winterthurerstrasse 190, Zurich 8057, Switzerland\\
-  E-mail: \email{gilles.kratzer@math.uzh.ch}\\
-
-  Marta Pittavino\\
-  Institute of Mathematics, University of Zurich\\
-  Winterthurerstrasse 190, Zurich 8057, Switzerland\\
-  E-mail: \email{marta.pittavino@math.uzh.ch}\\
-
-  Fraser Ian Lewis\\
-  Office of Health Economics\\
-  London, United Kingdom\\
-  E-mail: \email{flewis@ohe.org}\\
-
-  Reinhard Furrer\\
-  Institute of Mathematics, University of Zurich\\
-  Institute of Computational Science, University of Zurich\\
-  Winterthurerstrasse 190, Zurich 8057, Switzerland\\
-  E-mail: \email{reinhard.furrer@math.uzh.ch}
-}
-
-%% need no \usepackage{Sweave.sty}
-%%\SweaveOpts{echo=FALSE,keep.source=TRUE}
-%\SweaveOpts{keep.source=TRUE,echo=FALSE,eps=FALSE,pdf=TRUE,width=11.69,height=8.27,prefix.string=abn_v10}
-\newcommand{\deep}{\setlength{\jot}{12pt}}
-\newcommand{\nodeep}{\setlength{\jot}{3pt}}
-\begin{document}
-%\SweaveOpts{concordance=TRUE}
-\setkeys{Gin}{width=1.0\textwidth}
-%----------------------------------------------------------------------------------------
-\section{Introduction}
-Bayesian network (BN) modelling \citep{Buntine1991, HECKERMAN1995, Lauritzen1996, Jensen2001} is a form of graphical
modelling which attempts to separate out indirect from direct associations in complex multivariate data, a process typically referred to as structure discovery \citep{Friedman2003}.
-In recent decades, BN modelling has been widely used in biomedical science/systems biology \citep{Poon2007, Poon2008, Poon2007a, Needham2007, Dojer2006, Jansen2003, Djebbari2008, Hodges2010} to analyse multi-dimensional data. Only in the last few years, however, have ABN models been applied in veterinary epidemiology, thanks to their ability to generalize standard regression methodologies.
-Unlike other widely used multivariate approaches where dimensionality is reduced by exploiting linear combinations of random variables, such as in principal component analysis, graphical modelling does not involve any such dimension reduction.
-
-BN modelling is a generic and well established \textit{data mining/machine learning} methodology, which has been demonstrated in other fields of study to be ideally suited to such analyses. BNs have been developed for analysing multinomial, multivariate Gaussian or conditionally Gaussian networks (a mix of categorical and Gaussian variables), see \cite{HECKERMAN1995, Boettcher2004, Geiger1994}.
-Additive Bayesian network (ABN) models are a special type of Bayesian network (BN) model,
-where each node in the graph comprises a generalized linear model.
-Like BNs in general, they are statistical models which derive a directed acyclic graph (DAG) from empirical data, describing the dependency structure between random variables. All types of BN models comprise two reciprocally dependent parts: a qualitative part (the structure) and a quantitative part (the model parameters). The DAG is the graphical representation of the joint probability distribution of all random variables in the data. The model parameters are represented by a local probability distribution for each variable in the network.
-
-A number of libraries for fitting such BNs are available from CRAN. These types of BN have been constructed to ensure conjugacy, that is, to enable posterior distributions for the model parameters and the marginal likelihood to be calculated analytically. The purpose of \pkg{abn} is to provide a library of functions for more flexible BNs which need not rely on conjugacy, which opens up an extremely rich modelling framework but at some considerable additional computational cost.
-
-Currently \pkg{abn} includes functionality for fitting non-conjugate BN models which are multi-dimensional analogues of combinations of Binomial (logistic) and Gaussian regression. It also includes models with a Poisson (log link) distribution for count data, and generalised linear models with random effects (for each of the previous distributions).
-% It is planned to extend this to include more complex distributions for overdispersed data such a beta-binomial and negative binomial.
-
-The objective in BN structure discovery is to perform a model search on the data to identify an optimal model. Recall that BN models have a vast search space - super-exponential in the number of nodes - and it is generally impossible to determine a globally optimal model. How best to summarize a set of locally optimal networks with different structural features is an open question, and there are a number of widely used and intuitively reasonable possibilities.
For example, one option is to conduct a series of heuristic searches and then simply select the best model found \citep{HECKERMAN1995}; alternatively, a single summary network can be constructed using results across many different searches \citep{Hodges2010,Poon2007}. Otherwise, an exact search method, such as the one presented in \cite{Koivisto2004}, which performs an exhaustive search, can be used. There are obvious pros and cons to either approach; both are common in the literature and provide a good first exploration of the data.
-
-For a general non-technical review of BN modelling applied in biology see \cite{Needham2007}, while a general introduction to BN modelling in veterinary epidemiology is provided by \cite{Lewis2011}. Further applications of BNs to veterinary studies are described by \cite{Ward2013, Wilson2013, Sanchez-Vazquez2012}. Graphical modelling techniques were used to analyse epidemiological data by
-\cite{Firestone2013, Firestone2014, Lewis2012, Lewis2012b, Lewis2013, Schemann2013, Ludwig2013, McCormick2013},
-among dozens of further publications; see also the references therein.
-
-In this manual we first consider a series of examples illustrating how to fit different types of models, and how to run different searches and summary analyses, on a (synthetic) data set comprising 250 observations from a joint distribution of 17 categorical and 16 continuous variables, which is included as part of the \pkg{abn} library. This data set is a single realization from a network of the same structure as that presented in \cite{Lewis2011}, which is based on real data and sufficiently complex to provide a realistic example of data mining using Bayesian network modelling.
-Then a fully explained case study is provided, in which all the important steps of a data analysis using additive Bayesian networks are illustrated.
-Another detailed introduction and further relevant case studies about \pkg{abn} can be found at: \href{http://www.r-bayesian-networks.org}{www.r-bayesian-networks.org}.
-
-
-\section{Pigs Case Study}
-
-We present data on disease occurrence in pigs provided
-by the industry body the `British Pig Health Scheme'
-(BPHS). The main objective of BPHS is to improve the
-productivity of pig production in the UK, and reducing
-disease occurrence is a significant part of this process.
-The data we consider here comprise a randomly chosen
-batch of 50 pigs from each of 500 randomly chosen
-pig producers in the UK. These are `finishing pigs', animals
-about to enter the human food chain at an abattoir.
-Each animal is assessed for the presence of a range
-of different disease conditions by a specialist swine veterinarian.
-We consider here the following nine disease
-conditions: enzootic pneumonia (EPcat); pleurisy (plbinary);
-milk spots (MS); hepatic scarring (HS); pericarditis
-(PC); peritonitis (PT); lung abscess (Abscess); tail
-damage (TAIL); and papular dermatitis (PDcat). The
-presence of any of these conditions results in an economic
-loss to the producer, either directly, due to the
-relevant infected part of the animal being removed from
-the food chain, or indirectly in cases such as enzootic
-pneumonia, which may potentially indicate poor herd
-health and efficiency losses on the farm. An additional
-loss, though not directly monetary, is the presence of tail
-damage, which may be suggestive of welfare concerns,
-possibly also linked to sub-optimal production efficiency.
-Milk spots and hepatic scarring result from infestation
-with Ascaris suum, which is particularly important
-as this is a zoonotic helminth parasite.
-
-
-\subsection{Deciding on a search method}
-
-As a very rough rule of thumb, if there are fewer than $20$ variables (and no random effects) then probably the most robust model search option is an exact search (as opposed to a heuristic one), which will identify a globally best DAG, followed by parametric bootstrapping in order to assess whether the identified model contains excess structure (an adjustment for over-modelling). The parametric bootstrapping might, however, require access to a cluster computer to be computationally feasible. This is arguably one of the most comprehensive and reliable statistical modelling approaches for identifying an empirically justified best guess (DAG) at `nature's true model', the unknown mechanisms and processes which generated the study data.
-
-\subsubsection{Order Based Searches}
-It is generally not feasible to iterate over all possible DAG structures when dealing with more than a handful of variables, hence the reliance on heuristic searches. It is also extremely difficult to construct efficient Markov chain Monte Carlo samplers across BN structures. A solution was proposed in \cite{Friedman2003}: rather than sample across DAGs, sample across node orderings. A node order is simply the set of integers $1$ through $n$, where $n$ is the number of variables in the data. A DAG is consistent with an ordering if, for each node in the order, its parents come before it. For example, a DAG with only an arc from 1$\rightarrow$2 is consistent with the ordering $1,2,3,4$ as the parent $1$ comes before $2$, but a DAG with an arc from 2$\rightarrow$1 is not consistent with this ordering. In effect, each order is a collection of DAGs, and note that each DAG may be consistent with multiple orders, e.g. the empty DAG is consistent with every possible ordering. This introduces bias, in that averaging across orders need not give the same results as averaging across DAGs, were the latter possible. This is relevant when estimating posterior probabilities of individual structural features, and is biased towards more parsimonious features, as these are consistent with more orders. Note that this bias does not apply to maximising across orders, as in finding most probable structures (see later). The big advantage of searching across orders is that there are only $n!$ different orders, compared to a reasonably tight upper bound of $2^{ n \choose 2 }$ for different DAGs.
-
-There are (at least) two approaches for searching across orders. The first is to construct a Markov chain which samples from the posterior distribution over all orders, the approach presented in \citet{Friedman2003}. Alternatively, \cite{Koivisto2004} propose an exact method which, rather than sampling across orders, performs an exhaustive search. This has the advantage that it can also be used to find the globally optimal DAG of the data - the most probable structure - as well as posterior probabilities for structural features, such as individual arcs. The drawback is that this exact approach is only feasible for smaller numbers of variables, e.g. up to 12 or 13 when dealing with additive models. For the code provided in \pkg{abn} this exact approach is readily feasible up to 20 variables using typical desktop computing, and potentially up to 25 variables with access to a shared memory cluster computer.
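-
-To make the notion of order-consistency concrete, here is a minimal illustrative sketch in \proglang{R}; the helper function below is purely hypothetical and not part of \pkg{abn}:
-<>=
-# Illustrative only: check whether a DAG (rows = children,
-# columns = parents, as in abn) is consistent with a node ordering
-is.consistent <- function(dag, node.order){
-  for( k in 1:length(node.order)){
-    parents <- which( dag[node.order[k],]==1)
-    # Every parent must appear before its child in the order
-    if( any( !(parents %in% node.order[seq_len(k-1)]))) return(FALSE)}
-  return(TRUE)}
-dag <- matrix( 0,4,4)
-dag[2,1] <- 1 # Arc 1 -> 2
-is.consistent( dag, c(1,2,3,4))    # TRUE: parent 1 precedes child 2
-is.consistent( t(dag), c(1,2,3,4)) # FALSE: arc 2 -> 1
-@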
-
-\subsubsection{Most Probable Structure}
-Using the exact order based method due to \cite{Koivisto2004} it is also possible to identify the DAG with the globally best network score. Identification of a most probable structure is split into two parts. Firstly, we calculate a cache of individual node scores, for example using~\rr{buildscorecache}. Next, an exhaustive order based structural search is carried out using the function \rr{mostprobable}, which relies on the information in the node cache.
-
-As in the heuristic searches, it is possible to ban or retain certain arcs, for example when splitting multinomial variables; this is done in the node cache computation step. There are two different structural priors implemented in this search. The first is the uniform prior, where all structures (all parent combinations) are equally likely; this is the default \rr{prior.choice=1} in \rr{mostprobable} and the other functions. Also implemented is the prior used in \cite{Koivisto2004}, where all parent combinations of equal cardinality are equally weighted; this is \rr{prior.choice=2}. The latter gives the same prior weight to a parent combination with no parents and a parent combination comprising all possible parents (since there is only one choice of each, $n-1$ choose $0$ and $n-1$ choose $n-1$). This may not be desirable, but it is included as a prior for completeness. Note that the order based search is exact in the sense that it will identify a DAG whose score is equal to the best possible score, as if it were possible to search exhaustively across all DAGs. For example, if using \rr{prior.choice=1} then the network found should have a score greater than or equal to that found using the previously described heuristic searches. The structure found need not be unique, in that others may exist with the same globally optimal score; the order based search is only guaranteed to find one such structure.
-
-To calculate the most probable structure we again use \rr{buildscorecache()} to calculate a cache of individual node scores. Next, the function \rr{mostprobable()} does the actual exhaustive order based search, and works for both conjugate and additive models since, as with calculating
-the posterior probabilities, this step only involves structural searching and is not concerned with the precise parameterisation of each BN model.
-
-\subsection{Preparing the data}
-There are two main things which need to be checked in the data before it can be used with any of the abn model fitting functions.
-\begin{itemize}
-\item All records with missing values must be either removed or else the values completed. There are a range of libraries available from CRAN for completing missing values using approaches such as multiple imputation. Ideally, marginalising over the missing values is preferable (as opposed to completing them, as completion results in essentially dealing with models of models), but this is far from trivial here and not yet (and may never be) implemented in abn. To remove all records with one or more missing values, code similar to the following suffices:
-<>=
-library( abn) # Load the library
-
-mydat <- pigs.vienna[,-11] # Get data, drop batch variable
-
-mydat <- mydat[ complete.cases(mydat),] # Keep complete records only
-@
-N.b. this is not actually needed with pigs.vienna.
-\item All variables which are to be treated as binary must be coerced to factors. To coerce an existing variable into a factor:
-<>=
-mydat[,1] <- as.factor(mydat[,1])
-@
-coerces the first variable in the data.frame.
The levels (labels of the factor) can be anything, provided there are only two, and a ``success'' here is taken to be the second level. For example, the second value in the vector returned by:
-<>=
-levels( mydat[,1])
-@
-\end{itemize}
-To include additional variables in the modelling, for example interaction terms or polynomials, these must be created manually and included in the data.frame just like any other variable.
-
-
-\subsection{Initial searches for an optimal model}
-Below is some R code which will perform an exact search. We want to find the DAG with the best goodness of fit (network score - log marginal likelihood), and ideally we would search for this without any a priori complexity limit (maximum number of parents). However, this may be both computationally infeasible and highly inefficient. For example, with $25000$ observations, is it really realistic to consider models with up to $9$ covariates per node?
-
-One approach is to start off with an a priori limit of one parent per node, find the best DAG, and then repeat an identical search process (again using the functions \rr{buildscorecache} and \rr{mostprobable}) with a parent limit of $2$, and so on, stopping when the maximal DAG found no longer changes as the complexity limit is relaxed (increased). The parent limit \rr{max.par} is not hard-coded in the code below; it can be varied from one up to higher limits. These initial searches can easily be automated: in the library subdirectory \rr{system.file('bootstrapping\_example',
-package='abn')} there is a script file \rr{initsearch.bash} that performs the initial search described above in an automated way, progressively increasing the number of parents from $1$ to $5$ (for this specific example); a minimal in-\proglang{R} sketch of the same sweep is given below. The run time for the mostprobable function increases very considerably with the number of parents.
-<>=
-max.par <- 1 # The parent limit, varied over 1 to 5 in these searches
-ban <- matrix( rep(0,dim(mydat)[2]^2),ncol=dim(mydat)[2])
-@
-The ban and retain matrices must have their names set:
-<>=
-colnames( ban) <- rownames(ban) <- names(mydat)
-
-retain <- matrix( rep(0,dim(mydat)[2]^2),ncol=dim(mydat)[2])
-colnames( retain) <- rownames(retain) <- names(mydat)
-@
-Setup the distribution list for each node:
-<>=
-mydists <- list( PC="binomial", PT="binomial", MS="binomial",
-                 HS="binomial", TAIL="binomial",
-                 Abscess="binomial", Pyaemia="binomial",
-                 EPcat="binomial", PDcat="binomial",
-                 plbinary="binomial")
-@
-Build a cache of all the local computations:
-<>=
-mycache <- buildscorecache( data.df=mydat,
-              data.dists=mydists, dag.banned=ban,
-              dag.retained=retain, max.parents=max.par)
-@
-Run the actual exact search:
-<>=
-mp.dag <- mostprobable( score.cache=mycache)
-
-fabn <- fitabn( dag.m=mp.dag,data.df=mydat,
-                data.dists=mydists)
-
-datadir <- tempdir()
-@
-Save the results obtained:
-<>=
-save( mycache, mp.dag, fabn, file=
-  file.path(datadir,paste("mp",max.par,".RData",sep="")))
-@
-
-
-\subsection{Results from the initial search}
-Searches across parent limits $1$ through $5$ were run and we now examine the results. What we are looking for is simply the model with the best score (the largest, i.e. least negative, mlik value), checking that this does not improve when more parents are permitted. This then says we have found a DAG with maximal goodness of fit. What we find (below) is that the goodness of fit does not improve when we increase the parent limit beyond $3$.
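-
-As an alternative to the bash script mentioned above, the sweep over parent limits can be scripted directly in \proglang{R}. A minimal sketch, assuming \rr{mydat}, \rr{mydists}, \rr{ban} and \rr{retain} as defined above:
-<>=
-mliks <- rep( NA,5)
-for( max.par in 1:5){
-  # Identical search at each parent limit, as described above
-  mycache <- buildscorecache( data.df=mydat, data.dists=mydists,
-                dag.banned=ban, dag.retained=retain,
-                max.parents=max.par)
-  mp.dag <- mostprobable( score.cache=mycache)
-  mliks[max.par] <- fitabn( dag.m=mp.dag, data.df=mydat,
-                data.dists=mydists)$mlik}
-mliks # Stop relaxing the limit once these values no longer improve
-@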
-
-\begin{figure}[htbp]
-\centering
-\includegraphics[width=8cm]{ComparisonOfNetworkScore.pdf} %\vspace{-1.0cm}
-\caption{Comparison of goodness of fit for different parent limits} \label{fig1}
-\end{figure}
-
-The code necessary to analyse the results from the script file and to visualise the resulting DAG, in a Linux environment, is the following:
-<>=
-load( "RData/Rout1.RData")
-ml1 <- fabn$mlik;
-
-tographviz( dag.m=mp.dag,data.df=mydat,data.dists=mydists,
-            outfile="DAG_cycle.dot")
-system( "dot -Tpdf -o DAG_cycle.pdf DAG_cycle.dot")
-system( "evince DAG_cycle.pdf&")
-@
-The resulting marginal likelihoods from the initial searches are:
-<>=
-mp.mlik <- c( -44711.62,-44685.53,-44684.64,-44684.64,-44684.64)
-@
-The actual DAG corresponding to the maximum marginal likelihood \rr{mp.mlik}$=-44684.64$ is shown in Fig.~\ref{fig2}.
-
-\begin{figure}[htbp]
-\centering
-\includegraphics[width=6.7cm]{DAG_cycle.pdf}
-\vspace{-0.8cm}
-\caption{Globally optimal DAG, found with a maximum number of 3 or more parents.} \label{fig2}
-\end{figure}
-
-\subsection{Adjustment for overfitting: parametric bootstrapping using MCMC}
-We have identified a DAG which has the best (maximum) possible goodness of fit according to the log marginal likelihood. This is the standard goodness of fit metric in Bayesian modelling \citep{MACKAY1992}, and includes an implicit penalty for model complexity. While it is not always apparent from the technical literature, the log marginal likelihood can easily (and sometimes vastly) overfit with smaller data sets. Of course the difficulty is identifying what constitutes `small' here. In other words, using the log marginal likelihood alone (or indeed any of the other usual metrics such as AIC or BIC) is likely to identify structural features which, if the experiment/study were repeated many times, would be recovered in only a tiny fraction of instances. Such features cannot be considered robust. Overfitting is an ever present issue in model selection procedures, particularly in common approaches such as stepwise regression searches, see \cite{Babyak2004}.
-
-A well established approach for addressing overfitting is parametric bootstrapping \citep{Friedman1999}. The basic idea is very simple: we take our chosen model, simulate data sets from it of the same size as the original observed data, and see how often the different structural features are recovered. For example, is it reasonable for a data set of 434 observations to support a complexity of 29 arcs? Parametric bootstrapping is arguably one of the most defensible solutions for addressing overfitting, although it is likely the most computationally demanding, as for each simulated (bootstrap) data set we need to repeat the exact same model search as used with the original data. And we may need to repeat this analysis hundreds (or more) times to get robust results.
-
-Performing parametric bootstrapping is easy enough to code up if done in small manageable chunks. Here we provide a step-by-step guide along with the necessary sample code.
-
-\subsection{Software needed}
-We have a DAG model, and MCMC software such as JAGS and WinBUGS is designed for simulating from exactly such models. So all we need to do is implement our DAG model, e.g. that in Fig.~\ref{fig2}, in the appropriate JAGS (or WinBUGS) syntax (the two are very similar).
-Here I am going to use JAGS, developed by \cite{Plummer2003}, in preference to WinBUGS or OpenBUGS, for no other reason than that it is what I am most familiar with, and JAGS is relatively straightforward to install (compile from source) on a cluster. Binary versions of JAGS are also available for most common platforms (Linux, Windows, OS X).
-%
-To implement our DAG in JAGS we need to write a model definition file (a BUG file) which contains the structure of the dependencies in the model. We also need to provide there the probability distributions for each and every parameter in the model. Note that in Bayesian modelling the parameter estimates will not generally conform to any standard probability distribution (e.g. Gaussian) unless we are in the very special case of having conjugate priors. The marginal parameter distributions required can be estimated using the \rr{fitabn} function and then fed into the model definition. We next demonstrate one way of doing this, which is to use empirical distributions: in effect we provide JAGS with a discrete distribution over a fine grid which approximates whatever shape of density we need to sample from.
-
-\subsection{Generating marginal densities}
-The function \rr{fitabn} has functionality to estimate the marginal posterior density for each parameter in the model. The parameters can be estimated one at a time, by manually giving a grid (e.g. the x values where we want to evaluate f(x)), or else all together. In the latter case a very simple algorithm will try to work out where to estimate the density. This works better in some cases than in others, although it seems to work fine here for most variables. In order to use these distributions with JAGS we must evaluate the density over an equally spaced grid, as otherwise the approach used in JAGS will not sample correctly. The basic command needed here is:
-<>=
-marg.f <- fitabn( dag.m=mydag,data.df=mydat,data.dists=mydists,
-                  compute.fixed=TRUE,n.grid=1000)
-# Here mydag is the chosen DAG, i.e. mp.dag found above
-@
-
-We should not simply assume that the marginals have been estimated accurately; they should each be checked using some common sense. Generally speaking, estimating the goodness of fit (mlik) for a DAG comprising GLM nodes is very reliable. This marginalises out all parameters in the model. Estimating marginal posterior densities for individual parameters, however, can run into trouble, as it presupposes that the data contain sufficient information to accurately estimate the ``shape'' (density) of every individual parameter in the model. This is a stronger requirement than simply being able to estimate an overall goodness of fit metric. If a relatively large number of arcs have been chosen for a node with relatively few observations (i.e. few ``successes'' in a binary node) then this may not be possible, or at least the results are likely to be suspect. Exactly such issues - overfitting - are why we are performing the parametric bootstrapping in the first place, but they can also pose some difficulties before getting to this stage.
-
-It is essential to first visually check the marginal densities estimated by \rr{fitabn}.
-Something like the following will create one pdf file where each page is a separate plot of a marginal posterior density.
-
-<>=
-library( Cairo) # Available from CRAN
-CairoPDF( "MargPlots_PigsData.pdf")
-for( i in 1:length(marg.f$marginals)){
-  cat( "processing marginals for node:",
-       nom1 <- names( marg.f$marginals)[i],"\n")
-  cur.node <- marg.f$marginals[i]
-  # Get the marginal for current node, this is a matrix [x,f(x)]
-  cur.node <- cur.node[[1]]
-  # This is always [[1]] for models without random effects
-  for( j in 1:length(cur.node)){
-    cat( "processing parameter:",nom2 <- names(cur.node)[j],"\n")
-    cur.param <- cur.node[[j]]
-    plot( cur.param,type="l",main=paste(nom1,":",nom2))}}
-dev.off()
-@
-
-These plots (available in \rr{system.file('bootstrapping\_example',package='abn')}) suggest that all the marginal estimates look fine.
-Moreover, we can now perform an additional common sense check on their reliability. A probability density must integrate to unity (the area under the curve is equal to one). The densities here are estimated numerically, so we would not expect to get exactly one (n.b. no internal standardization is done, which allows us to check this), but if the numerical estimation has worked reliably then we would expect the area to be close to one (e.g. $0.99$, $1.01$).
-
-\begin{figure}[htbp]
-\centering
-\includegraphics[width=9cm]{PigsArea.png}
-\caption{Approximated area under marginal densities} \label{fig3}
-\end{figure}
-
-From Fig.~\ref{fig3} it is clear that everything went fine in the marginal density estimation.
-To check the area we used the following code, whose final output is a file called \rr{pigs\_post\_params.R} which contains all the information JAGS needs to sample from the marginal posterior distributions.
-<>=
-marnew <- marg.f$marginals[[1]]
-
-for( i in 2: length(marg.f$marginals)){
-  marnew <- c(marnew, marg.f$marginals[[i]])}
-@
-A straightforward question is: how reliable are the marginals? If the numerical routines work well
-then the area under the density function should be close to unity.
-<>=
-myarea <- rep( NA,length( marnew))
-names( myarea) <- names( marnew)
-for( i in 1:length(marnew)){
-  tmp <- spline(marnew[[i]])
-  # Spline just helps make the estimation smoother
-  myarea[i] <- sum(diff(tmp$x)*tmp$y[-1])}
-  # Just width x height of rectangles
-cbind( myarea)
-@
-Now visualise \rr{myarea} as a plot; the colours need to be adjusted:
-<>=
-library( Cairo)
-mycols <- rep("green",length(marnew))
-mycols[1:2] <- "red";        # PC
-mycols[3] <- "orange";       # PT
-mycols[4:5] <- "yellow";     # MS
-mycols[6:7] <- "blue";       # HS
-mycols[8:9] <- "lightblue";  # TAIL
-mycols[10] <- "mistyrose";   # Abscess
-mycols[11:12] <- "green";    # Pyaemia
-mycols[13:14] <- "lavender"; # EPcat
-mycols[15:18] <- "cornsilk"; # PDcat
-mycols[19:22] <- "brown";    # plbinary
-CairoPNG( "Area_PigsData.png",pointsize=10,width=720,height=640)
-par( las=2, mar=c(8.1,4.1,4.1,2.1))
-barplot( myarea,ylab="Area under Density",ylim=c(0,2), col=mycols)
-dev.off()
-@
-Now we create the data in the right format ready for passing to JAGS:
-<>=
-print( names(marnew))
-# We want to bind all the marginals for the same node into a matrix
-m <- marnew;
-# PT -> PC, 1 parent, 2 params;
-PC.p <- cbind( m[["PC|(Intercept)"]], m[["PC|PT"]])
-# PT --> NO PARENTS, 1 param;
-PT.p <- cbind( m[["PT|(Intercept)"]])
-# HS -> MS, 1 parent, 2 params;
-MS.p <- cbind( m[["MS|(Intercept)"]], m[["MS|HS"]])
-# EPcat -> HS, 1 parent, 2 params;
-HS.p <- cbind( m[["HS|(Intercept)"]], m[["HS|EPcat"]])
-# PDcat -> TAIL, 1 parent, 2 params;
-TAIL.p <- cbind( m[["TAIL|(Intercept)"]], m[["TAIL|PDcat"]])
-# Abscess --> NO PARENTS, 1 param;
-Abscess.p <- cbind( m[["Abscess|(Intercept)"]])
-# TAIL -> Pyaemia, 1 parent, 2 params;
-Pyaemia.p <- cbind(m[["Pyaemia|(Intercept)"]],m[["Pyaemia|TAIL"]])
-# plbinary -> EPcat, 1 parent, 2 params;
-EPcat.p <- cbind( m[["EPcat|(Intercept)"]], m[["EPcat|plbinary"]])
-# MS, EPcat, plbinary --> PDcat, 3 parents, 4 params;
-PDcat.p <- cbind( m[["PDcat|(Intercept)"]], m[["PDcat|MS"]],
-                  m[["PDcat|EPcat"]], m[["PDcat|plbinary"]])
-# PC, PT, Abscess --> plbinary, 3 parents, 4 params;
-plbinary.p <- cbind(m[["plbinary|(Intercept)"]],m[["plbinary|PC"]],
-                    m[["plbinary|PT"]], m[["plbinary|Abscess"]])
-
-dump( c( "PC.p","PT.p","MS.p","HS.p","TAIL.p","Abscess.p",
-         "Pyaemia.p","EPcat.p","PDcat.p","plbinary.p"),
-      file=paste('Jags/',"pigs_post_params.R",sep=""))
-@
-
-\subsection{Building BUG model}
-Once we have the posterior distributions, the next step is to actually create the DAG in JAGS. This involves creating a BUG file, a file which contains a definition of our DAG (from Fig.~\ref{fig2}) in terms which can be understood by JAGS. This can easily be done by hand, if rather tedious, and should be checked carefully for errors (which JAGS will prompt about in any case). The BUG file is fairly heavily commented (it might look complicated, but most of it is just copy and paste with minor edits) - the syntax is similar to R - and should be fairly self explanatory. Note that, unlike a more usual use of WinBUGS or JAGS, we have no data here; we are simply providing JAGS with a set of marginal probability distributions and how they are inter-dependent, and we want it to then generate realisations from the appropriate joint probability distribution. The next step is to tell JAGS to perform this simulation, i.e. generate a single bootstrap data set of size $n=25000$ observations based on the assumption that the DAG in Fig.~\ref{fig2} is our true model.
The BUG file can be found in \rr{system.file('bootstrapping\_example',package='abn')} in the file \rr{pigs\_model.bug}.
-
-\subsection{A single bootstrap analysis}
-To run JAGS we use four separate files: i) the BUG model definition file (\rr{pigs\_model.bug}); ii) a file \rr{pigs\_post\_params.R} which contains the marginal distributions referred to in the BUG file; iii) a script which runs the MCMC simulation, \rr{jags\_pigs\_script.R}; and finally iv) a file which sets the random number seed (\rr{inits.R} - the value in this must be changed to use different streams of random numbers). These four files are all contained in the library subdirectory given above. To run this example, copy all the files into one directory and then at the command line type \rr{`jags jags\_pigs\_script.R'}. In terms of the MCMC component, a burn-in of $100000$ is used, but as there are no data here this takes no time to run and is likely of little use and could be shortened (it is just included to allow JAGS any internal adaptations or diagnostics that it might need to do). The actual MCMC is then run for $250000$ iterations with a thin of $10$, which gives $25000$ observations for each variable - the same size as the original data. The number of MCMC steps and the thin are a little arbitrary and this could be run for longer with a bigger thin, but for this data, looking at autocorrelation plots for the Gaussian variables, there appears to be no evidence of correlation at a thin of $10$, and so this seems sufficient here.
-
-The next step is to automate a single run, e.g. generate a single bootstrap sample and then perform an exact search on it, just as was done for the original data. An R script performing a single such analysis is provided alongside the four JAGS files above - just drop it into the same directory. For ease we also call the JAGS script from inside R, which might require some messing about with the PATH variable on Windows (e.g. if you open up a cmd prompt then you should be able to type ``jags'' without needing to give a full path). The output from this is given below; ``results'' is a single matrix (DAG) which is saved in an R workspace called \rr{boot1run.RData}.
-
-Get the pigs data, drop the batch variable:
-<>=
-orig.data <- pigs.vienna[,-11]
-@
-
-Now create a single bootstrap sample:
-<>=
-system("jags jags_pigs_script.R")
-@
-Read in the boot data and convert to a data.frame in the same
-format as the original data, e.g.
coerce to factors, using the appropriate \proglang{R} package \pkg{coda}, described in \cite{Plummer2006}:
-<>=
-library( coda)
-boot.data <- read.coda("out1chain1.txt","out1index.txt")
-
-boot.data <- as.data.frame(boot.data)
-for( j in 1:dim(orig.data)[2]){if(is.factor(orig.data[,j]))
-  { boot.data[,j]<- as.factor(boot.data[,j])
-    levels(boot.data[,j])<- levels(orig.data[,j])}}
-@
-Now we have boot.data in an identical format to the original:
-<>=
-ban <- matrix( rep(0,dim(orig.data)[2]^2),
-               ncol=dim(orig.data)[2])
-colnames( ban) <- rownames(ban) <- names(orig.data)
-
-retain <- matrix( rep(0,dim(orig.data)[2]^2),
-                  ncol=dim(orig.data)[2])
-colnames( retain) <- rownames(retain) <- names(orig.data)
-
-mydists <- list( PC="binomial", PT="binomial", MS="binomial",
-                 HS="binomial", TAIL="binomial",
-                 Abscess="binomial", Pyaemia="binomial",
-                 EPcat="binomial", PDcat="binomial",
-                 plbinary="binomial")
-@
-Set the parent limit to $3$, equal to the original data:
-<>=
-max.par <- 3;
-@
-Build a cache on the bootstrap data:
-<>=
-boot1.cache <- buildscorecache( data.df=boot.data,
-                  data.dists=mydists, max.parents=max.par,
-                  dag.banned=ban, dag.retained=retain)
-@
-Run the \rr{mostprobable} search on the bootstrap data:
-<>=
-boot1.mp <- mostprobable( score.cache=boot1.cache)
-
-datadir <- tempdir()
-save( boot1.mp,file=file.path( datadir,"boot1run.RData"))
-@
-
-Once this works on your local machine, it is then a case of automating it in the most efficient way, for example for use on a cluster. The crucial thing is that the random number seed used each time (in the file \rr{inits.R}) must be changed for each bootstrap simulation, otherwise an identical bootstrap data set will be produced!
-
-\subsection{Ways to summarise results from parametric bootstrapping}
-The globally optimal DAG, Fig.~\ref{fig2}, has $12$ arcs. It may be that some of these are due to over-modelling, which means they will be recovered in relatively few bootstrap analyses. $10240$ bootstrap analyses were conducted (on a cluster) and all the R files, the MPI wrapper, and also the actual results (in R workspaces) can be found in the folder \rr{system.file('bootstrapping\_example',
-package='abn')}.
-
-The first step is to explore the bootstrap results. Of some interest is how many of the arcs were recovered during each of the bootstrap ``simulations''. This is given in Fig.~\ref{fig4}. We can see right away that not all the arcs in the original DAG (Fig.~\ref{fig2}, with 12 arcs) were recovered - not even once. This provides overwhelming evidence that the original exact search has indeed overfitted the original data. We must therefore trim some of the complexity - arcs - from the original DAG in order to justify that our chosen DAG is a robust estimate of the underlying system which generated the observed data.
-
-A parametric bootstrapping approach
-was suggested in \cite{Friedman1999} which uses simulation to
-assess whether a chosen model comprises more complexity
-than could reasonably be justified given the size
-of the observed data set. Using Markov chain Monte
-Carlo simulation via JAGS (open source software), $10240$
-independent (assumed by inspecting autocorrelations
-from the MCMC output) data sets of the same size as
-the original data were generated from our chosen model
-in i). For each of these bootstrap data sets an identical
-exact order-based search as in i) was conducted.
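-
-Incidentally, what JAGS does with the supplied grids can be mimicked directly in R as a sanity check: treat each marginal as a discrete distribution over its grid and sample from it. A small illustrative sketch, assuming the list \rr{marnew} computed earlier:
-<>=
-# Illustrative only: draw values from one empirical marginal by
-# sampling grid points with probability proportional to f(x)
-tmp <- marnew[["PC|PT"]] # A matrix of [x, f(x)] values
-draws <- sample( tmp[,1], size=1000, replace=TRUE,
-                 prob=tmp[,2]/sum(tmp[,2]))
-hist( draws, breaks=50) # Should resemble the plotted marginal
-@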
-
-\begin{figure}[htbp]
-\centering
-\includegraphics[width=9cm]{Bootstrapping.png}
-\caption{Number of arcs recovered in bootstrapping} \label{fig4}
-\end{figure}
-
-There are a number of different options in terms of trimming/pruning arcs.
-One common option, which apes the use of majority consensus trees in phylogenetics
-(trees are just special cases of DAGs), is to remove all arcs which were not recovered in at least a majority $(50\%)$ of the bootstrap results.
-Fig.~\ref{fig5} shows the frequency at which each arc was recovered, the maximum value possible being $10240$ $(100\%$ support).
-Collating results across these $10240$ searches we find that only
-$14\%$ of the globally optimal DAGs found comprised $12$
-or more arcs. Approximately $68\%$ of DAGs had $11$ or
-more arcs - therefore a robust model of the original data
-has no more than $11$ arcs. Almost identical results were
-obtained using a random selection of $5012$ searches, suggesting
-that sufficient bootstrap samples had been performed.
-The usual cut-off for structural support of features
-(arcs) is $50\%$ in BN modelling \citep{Poon2007, Poon2007a, Poon2008, Lycett2009, Lewis2011},
-and is analogous to the widespread use of majority consensus
-trees in phylogenetics. We therefore conclude that
-our chosen model in i) with 11 arcs is robust. This is perhaps
-not surprising given we have a large data set of $25K$
-observations.
-
-\begin{figure}[htbp]
-\centering
-\includegraphics[width=15cm]{Summary.png}
-\caption{Frequencies at which each arc in the original DAG was recovered during bootstrapping}
-\label{fig5}
-\end{figure}
-
-Note that this $50\%$ is not in any way comparable with the usual $95\%$ points used in parameter estimation, as these are entirely different concepts (an explanation can be found on the \pkg{abn} website).
-
-Another option, other than choosing a different level of support (which is entirely up to the researcher), is to consider an undirected network. That is, include all arcs if their support - considering both directions - exceeds $50\%$. This is justifiable due to likelihood equivalence, which means that - generally speaking - the data cannot discriminate between different arc directions (again, see the \pkg{abn} website for an explanation), and therefore considering arcs recovered in only one direction may be overly conservative. Again, this decision is likely problem specific. For example, from a purely statistical perspective being conservative is generally a good thing, but from the scientist's point of view this may remove most of the more interesting results from the study. Obviously a balance is required.
-
-In this data it turns out that removing all arcs which have less than $50\%$ support gives an identical pruned network as if we were to consider both arc directions jointly. In general this need not be the case. Fig.~\ref{fig6} shows our optimal DAG after removing these arcs. This is our optimal model of the data.
-
-
-All the code for analysing the results from the bootstrap analyses can be found in \rr{system.file('bootstrapping\_example',package='abn')}:
-<>=
-mydata <- pigs.vienna[,-11]
-
-N <- 10240;
-# Write out manually, clearer than using rep()
-mydag <- matrix(c(
-# PC PT MS HS TAIL Abscess Pyaemia EPcat PDcat plbinary
-   0,  1,  0,  0,  0,    0,      0,     0,    0,    0,
-   0,  0,  0,  0,  0,    0,      0,     0,    0,    0,
-   0,  0,  0,  1,  0,    0,      0,     0,    0,    0,
-   0,  0,  0,  0,  0,    0,      0,     1,    0,    0,
-   0,  0,  0,  0,  0,    0,      0,     0,    1,    0,
-   0,  0,  0,  0,  0,    0,      0,     0,    0,    0,
-   0,  0,  0,  0,  1,    0,      0,     0,    0,    0,
-   0,  0,  0,  0,  0,    0,      0,     0,    0,    1,
-   0,  0,  1,  0,  0,    0,      0,     1,    0,    1,
-   1,  1,  0,  0,  0,    1,      0,     0,    0,    0),
-   byrow=TRUE,ncol=10)
-colnames(mydag) <- rownames(mydag) <- names(mydata)
-sum( mydag) # 12 arcs, as in original model, Figure 2
-
-mydists <- list( PC="binomial", PT="binomial", MS="binomial",
-                 HS="binomial", TAIL="binomial",
-                 Abscess="binomial", Pyaemia="binomial",
-                 EPcat="binomial", PDcat="binomial",
-                 plbinary="binomial")
-
-# Use fitabn to check mydag is correct (no typos, mlik = -44684.64)
-print( fitabn(dag.m=mydag,data.df=mydata,data.dists=mydists)$mlik)
-bestdag <- mydag
-@
-
-\begin{figure}[htbp]
-\centering
-\includegraphics[width=8cm]{postbootpigs.pdf}\vspace*{-5mm}
-\caption{ Optimal DAG after removal of arcs supported at less than $50\%$ in bootstrapping. Contains $8$ arcs.}\label{fig6}
-\end{figure}
-
-The next command reads all files named \rr{mp[number].RData} and creates a list of results:
-<>=
-boot.dags <- list()
-these <- grep("mp10Kboot\\d+.RData", dir())
-num <- 1
-for( i in dir()[these]){# Load each file
-  load(i) # Provides dags, a list
-  tmp <- dags[which(unlist(lapply(dags,sum))>0)]
-  # Get valid entries in dags but as a list
-  for( j in 1:length(tmp)){
-    # For each entry copy into boot.dags, and increment counter
-    boot.dags[[num]]<- tmp[[j]]; num <- num+1 }
-  rm( dags)
-}
-@
-It is always good practice to have a look at the mlik values for the bootstraps vis-a-vis the original:
-<>=
-if( FALSE){
-  scores <- rep(0,length(boot.dags))
-  for(i in 1:length(boot.dags)){
-    scores[i] <- fitabn(dag.m=boot.dags[[i]],data.df=mydata,
-                        data.dists=mydists)$mlik
-  }
-  scores.b <- scores[-which(scores< -N)]
-  orig.score <- fitabn(dag.m=bestdag,data.df=mydata,
-                       data.dists=mydists)$mlik
-  plot(density(scores.b,from=min(scores.b),to=max(scores.b)))
-  abline(v=orig.score,lwd=2,col="blue")
-}
-@
-We now trim from the boot results all arcs which do not occur in the master
-DAG - bestdag - since we know these are due to overfitting:
-<>=
-boot.dags.trim <- boot.dags
-for( i in 1:length(boot.dags)){
-  boot.dags.trim[[i]] <- boot.dags.trim[[i]]*bestdag }
-
-arc.freq <- lapply(boot.dags.trim,sum)
-arc.freq <- table(unlist(arc.freq))
-library( Cairo)
-CairoPNG("PigsFreqBootRes.png",pointsize=10,width=720,height=700)
-par(las=1, mar=c(6.1,6.1,4.1,2.1))
-barplot( arc.freq,ylab="",xlab="",col="skyblue",
-         names.arg=names(arc.freq), ylim=c(0,2500))
-par( las=1)
-mtext( "Number of arcs in bootstrap DAG",1,line=3,cex=1.5)
-par( las=3)
-mtext( "Frequency out of 10240",2,line=4,cex=1.5)
-dev.off()
-
-total.dag <- matrix(rep(0,dim(bestdag)[2]^2),ncol=dim(bestdag)[2])
-colnames(total.dag) <- rownames(total.dag)<- colnames(bestdag)
-# Get the support for each arc, total.dag:
-for( i in 1:length(boot.dags)){
-  if(sum(boot.dags[[i]])>0){total.dag <- total.dag+boot.dags[[i]]}}
-total.dag <- total.dag*bestdag # We only want arcs in the best DAG
-total.dag
-@
-This gives the frequencies at which each arc in the original DAG was
-recovered during bootstrapping.
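-
-To express these counts as percentage support across the $10240$ searches, a one-line sketch using \rr{total.dag} and \rr{N} from above:
-<>=
-round( 100*total.dag/N, 1) # Per-arc support in percent
-@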
-
-We get the majority consensus (directed) DAG:
-<>=
-bestdag.trim <- bestdag
-bestdag.trim[,] <- 0 # Set to zero
-for( i in 1:dim(total.dag)[1]){
-  for( j in 1:dim(total.dag)[2]){
-    if(i>j){ # Get most supported direction
-      if(total.dag[i,j]>total.dag[j,i]){
-          m.i <- i; m.j <- j;}
-      else {m.i <- j; m.j <- i;}
-      # Does the total support for the arc exceed the
-      # threshold, i.e. more than N/2 = 5120 out of 10240
-      if(total.dag[i,j]+total.dag[j,i]>N/2){
-          bestdag.trim[m.i,m.j] <- 1}}}}
-
-tographviz( dag.m=bestdag.trim,data.df=mydata,
-            data.dists=mydists, outfile="postbootpigs.dot")
-system( "dot -Tpdf -o postbootpigs.pdf postbootpigs.dot")
-system( "evince postbootpigs.pdf&")
-
-save( bestdag.trim,file="bestdagpigs_trim.RData")
-@
-
-\subsection{Estimating marginals from the final DAG}
-
-\begin{figure}[!t]
-\centering
-\includegraphics[width=8cm]{plbinaryNode.png}
-\caption{ Marginal posterior densities for the node plbinary}\label{fig8}
-\end{figure}
-
-Once we have identified our optimal DAG, it is usual to want to examine the parameters in this model, e.g. in Fig.~\ref{fig8}. These are our results: the effects of the various variables in our study. This process is very similar to estimating the marginals for the bootstrapping, but should now be easier since we should have removed any difficulties due to over-fitting. The posterior density plots for the final DAG can be found in the pdf called
-\rr{Pigs\_PostBootPlots.pdf}. These all look fine. An overview of the parameters can be given based on the variable plbinary.
-
-
-The R code for creating the marginals and quantiles can be found below:
-<>=
-mydata <- pigs.vienna[,-11]
-
-mydists <- list( PC="binomial", PT="binomial", MS="binomial",
-                 HS="binomial", TAIL="binomial",
-                 Abscess="binomial", Pyaemia="binomial",
-                 EPcat="binomial", PDcat="binomial",
-                 plbinary="binomial")
-
-marg.f <- fitabn(dag.m=bestdag.trim,data.df=mydata,
-                 data.dists=mydists,compute.fixed=TRUE,
-                 n.grid=1000)
-
-library( Cairo)
-CairoPDF("Pigs_PostBootPlots.pdf")
-for( i in 1:length(marg.f$marginals)){
-  cat( "processing marginals for node:",
-       nom1 <- names(marg.f$marginals)[i],"\n")
-  cur.node <- marg.f$marginals[i]
-  # Get marginal for current node, this is a matrix [x,f(x)]
-  cur.node <- cur.node[[1]]
-  # This is always [[1]] for models without random effects
-  for( j in 1:length(cur.node)){
-    cat("processing parameter:",
-        nom2 <- names(cur.node)[j],"\n")
-    cur.param <- cur.node[[j]]
-    plot( cur.param,type="l",main=paste(nom1,":",nom2))}}
-dev.off()
-
-marnew <- marg.f$marginals[[1]]
-for(i in 2: length(marg.f$marginals)){
-  marnew <- c(marnew, marg.f$marginals[[i]])}
-
-margs <- marnew;
-mymat <- matrix(rep(NA,length(margs)*3),ncol=3)
-rownames(mymat) <- names(margs)
-colnames(mymat) <- c("2.5%","50%","97.5%")
-ignore.me <- union(grep("\\(Int",names(margs)),
-                   grep("prec",names(margs)))
-@
-These are not effect parameters, but background constants and precisions:
-<>=
-comment <- rep("",length(margs))
-for(i in 1:length(margs)){
-  tmp <- margs[[i]]
-  tmp2 <- cumsum(tmp[,2])/sum(tmp[,2])
-  mymat[i,] <- c(tmp[which(tmp2>0.025)[1]-1,1],
-                 tmp[which(tmp2>0.5)[1],1],
-                 tmp[which(tmp2>0.975)[1],1])
-  myvec <- mymat[i,]
-  if( !(i%in%ignore.me) && (myvec[1]<0 && myvec[3]>0)){
-    comment[i] <- "not sig. at 5%"}
at 5%"} - # Truncate for printing - mymat[i,] <- as.numeric(formatC(mymat[i,],digits=3,format="f"))} -cbind( mymat) -@ - -The code above produce the table of percentiles that can be found in Table~\ref{fig7}. -\begin{table} -\begin{center} -\begin{tabular}{|l|c|c|c|} - \hline - & & \textbf{Odds Ratio} & \\ - \hline - & \textbf{2.5\%} & \textbf{50\%} & \textbf{97.5\%} \\ - \hline -PC$|$(Intercept) & 0.03 & 0.03 & 0.03 \\ - \hline -\textcolor{red}{ PC$|$PT} & \textcolor{red}{17.71} & \textcolor{red}{26.44} & \textcolor{red}{39.21} \\ - \hline - PT$|$(Intercept) & 0.00 & 0.00 & 0.00 \\ - \hline - MS$|$(Intercept) & 0.06 & 0.06 & 0.07 \\ - \hline - \textcolor{blue}{MS$|$HS} & \textcolor{blue}{0.20} & \textcolor{blue}{0.30} & \textcolor{blue}{0.44} \\ - \hline - HS$|$(Intercept) & 0.06 & 0.06 & 0.06 \\ - \hline - TAIL$|$(Intercept) & 0.00 & 0.00 & 0.00 \\ - \hline - \textcolor{red}{TAIL$|$PDcat} & \textcolor{red}{2.61} & \textcolor{red}{4.32} & \textcolor{red}{6.78} \\ - \hline - Abscess$|$(Intercept) & 0.00 & 0.01 & 0.01 \\ - \hline - Pyaemia$|$(Intercept) & 0.00 & 0.00 & 0.00 \\ - \hline - EPcat$|$(Intercept) & 0.36 & 0.37 & 0.38 \\ - \hline - \textcolor{red}{EPcat$|$plbinary} & \textcolor{red}{1.97} & \textcolor{red}{2.14} & \textcolor{red}{2.31} \\ - \hline - PDCat$|$(Intercept) & 0.04 & 0.04 & 0.04 \\ - \hline - \textcolor{red}{PDcat$|$plbinary} & \textcolor{red}{1.78} & \textcolor{red}{2.08} & \textcolor{red}{2.41} \\ - \hline - plbinary$|$(Intercept) & 0.10 & 0.11 & 0.11 \\ - \hline - \textcolor{red}{PC$|$(Intercept)} & \textcolor{red}{12.92} & \textcolor{red}{15.12} & \textcolor{red}{17.71} \\ - \hline - \textcolor{red}{PC$|$(Intercept)} & \textcolor{red}{2.72} & \textcolor{red}{4.36} & \textcolor{red}{6.97} \\ - \hline - \textcolor{red}{PC$|$(Intercept)} & \textcolor{red}{6.23} & \textcolor{red}{8.71} & \textcolor{red}{12.17} \\ - \hline -\end{tabular} -\caption{Marginal posterior quantiles for each parameter. The red lines indicate point estimates bigger than $1$, corresponding to risk factor. While the blue line refers to estimates smaller than $1$, indicating protective factor.} \label{fig7} -\end{center} -\end{table} - -All of the effect parameters, that is, ignoring the intercept terms (which is just a background constant) and the precision parameters - have $95\%$ confidence (or credible) intervals which do not cross the origin. Note this is not guaranteed to happen this criteria was not part of the model selection process, and all the parameters in the model have been justified using mlik and bootstrapping. But having such marginal intervals can make presenting the results possibly easier to a skeptical audience. - -\subsection{Pigs Case Study Conclusion} - -Based on the results presented in the previous section, it possible to draw a conclusion of the study. -From Table~\ref{fig7}, we can notice that, a part from HS, which has a protective effect on MS, all others parameters have a risk effect on possible disease outcome. Our final optimal ABN model of this disease system suggests that the presence of Pyaemia is independent from other conditions. Moreover, the remaining diseases split into two separate connected components. - -Some interesting biological questions, resulting from ABN model, can be related to the similarity of causal pathways between these two groups of diseases and to the common causes shared within each of these groups. 
-
-The principal reason for using ABN models is their ability to generalise the usual GLM to multiple dependent variables: fully multi-dimensional models. Specifically, results from ABN analyses can be used as a basis for developing new biological questions about factors
-potentially affecting the presence of disease, and to inform the design of future targeted studies.
-
-\section{An additional example}
-In the next subsections we illustrate how to fit an ABN model to different kinds of data, starting with an introduction of the data we are going to deal with. The main purpose of BN structure discovery is to estimate the joint dependency structure of the random variables in the available data, and this is achieved by heuristically searching for optimal models and comparing their goodness of fit using Bayes factors. It is assumed that all structures are equally supported in the absence of any data - an uninformative prior on structures - and so comparing Bayes factors collapses to comparing the marginal likelihoods, which is done on a log scale. The log marginal likelihood for a BN is typically referred to as the network score.
-
-\subsection{Simulated Data Example var33}
-Figure~\ref{fig9} shows the structure of the distribution which generated the data set \rr{var33} included with \pkg{abn}. This diagram was created using the \rr{tographviz()} function of \pkg{abn} (see later examples), which translates the matrix defining a network - a directed acyclic graph - into a text file of suitable format for processing in Graphviz, where this processing was done outside of \proglang{R}. Graphviz is freely available, operates on most platforms, and can be downloaded from \href{http://www.graphviz.org}{www.graphviz.org}; there is also an R package which interfaces to Graphviz, available from the Bioconductor project (requires an installation of Graphviz).
-\begin{figure}[htb]
-\includegraphics{mydag_all}
-\vspace{-1.0cm}
-\caption{Directed acyclic graph representation of the joint probability distribution which generated data set \rr{var33} which is included with \pkg{abn}. The square nodes are categorical (binary) and the oval nodes continuous variables.} \label{fig9}
-\end{figure}
-
-\subsection{Fitting an additive BN model to categorical data} \label{sec2}
-An additive BN model for categorical data can be constructed by considering each individual variable as a logistic regression on the other variables in the data, and hence the network model comprises many combinations of local logistic regressions \citep{Rijmen2008}. The parameters in this model are the additive terms in a usual logistic regression, and independent Gaussian priors are assumed for each covariate. Note that the variables here must all be binary, and so all multinomial variables need to be split into separate binary factors (and added to the original data.frame) in order to form the network model. This is analogous to forming the design matrix in a conventional additive model analysis. Similarly, interaction terms can be added by including appropriate additional columns in the data.frame. In these models the log marginal likelihood (network score) is estimated using Laplace approximations at each node. Hyperparameters for the means and variances in the Gaussian priors are fixed at zero and 1000 respectively; other values can be given explicitly in the call to \rr{fitabn} but this is not recommended without good reason.
-
-To fit an additive model use \rr{fitabn(data.df,dag.m,data.dists, ...)}.
In the following code we first fit the independence model with no arcs, and then a model with some conditional dependencies. Turning on \rr{verbose=TRUE} simply gives the individual log marginal likelihoods for each node (n.b. the numbering is that used internally and simply denotes the variables in the data.frame from left to right).
-
-The following code fits a network to the subset of the variables from \rr{var33} which are categorical. In this data these are all binary. Note that all categorical variables should be set as factors.
-<>=
-library( abn)
-bin.nodes <- c( 1,3,4,6,9,10,11,12,15,18,19,20,21,26,27,28,32)
-var33.cat <- var33[,bin.nodes] #Categorical nodes only
-
-dag33 <- matrix( 0, 17, 17)
-colnames( dag33) <- rownames( dag33) <- names( var33.cat)#Set names
-@
-Make sure we start from the independence model (no arcs):
-<>=
-dag33["v11","v12"] <- 0; dag33["v11","v10"]<- 0; dag33["v4","v3"]<- 0;
-@
-Setup the distribution list for each categorical node:
-<>=
-mydists.cat <- list( v1 ="binomial", v3 = "binomial",
-    v4 = "binomial", v6 = "binomial", v9 = "binomial",
-    v10 = "binomial", v11 = "binomial", v12 = "binomial",
-    v15 = "binomial", v18 = "binomial", v19 = "binomial",
-    v20 = "binomial", v21 = "binomial", v26 = "binomial",
-    v27 = "binomial", v28 = "binomial", v32 = "binomial")
-ind.mod.cat <- fitabn( data.df=var33.cat, dag.m=dag33,
-    data.dists=mydists.cat, verbose=FALSE)
-@
-It is possible to change to \rr{verbose=TRUE} if one wants to check
-how the score changes for each individual node.
-
-The network score for the model with conditional independencies is:
-<>=
-ind.mod.cat$mlik
-@
-In the network definition matrix each row is a ``child'' and each column one of its potential ``parents'', where a \rr{1} denotes that a parent (or arc) is present. Now let's fit a model with some conditional dependencies, for example where \rr{v11} is conditionally dependent upon \rr{v12} and \rr{v10}, and \rr{v4} is conditionally dependent upon \rr{v3}.
-
-Now fit the model with some conditional dependencies:
-<>=
-dag33["v11","v12"] <- 1;
-dag33["v11","v10"] <- 1;
-dag33["v4","v3"] <- 1;
-dep.mod.cat <- fitabn( data.df=var33.cat, dag.m=dag33,
-    data.dists=mydists.cat, verbose=FALSE)
-@
-The network score for the model with conditional dependencies is:
-<>=
-dep.mod.cat$mlik
-@
-The network score is considerably improved and therefore suggests support for these new structural features (the difference between the two \rr{mlik} values is the log Bayes factor; see the short sketch at the end of this section). To produce a visual description of the model we can export it to Graphviz as follows:
-<>=
-tographviz( dag=dag33, data.df=var33.cat, data.dists=mydists.cat,
-    outfile="mydagcat.dot", directed=TRUE) #Create file
-@
-\rr{mydagcat.dot} can then be processed with Graphviz by typing \rr{dot -Tpdf mydagcat.dot -o mydagcat.pdf} in a unix shell, or by using the Graphviz editor (gvedit) on Windows.
-
-\begin{figure}[htb]
-\includegraphics{mydag}
-\vspace{-1.0cm}
-\caption{Directed acyclic graph \rr{dag33} created using \rr{tographviz()} and Graphviz} \label{fig10}
-\end{figure}
-In \rr{tographviz()} the \rr{data.df} argument is used to determine whether a variable is a factor or not, where factors are displayed as squares and non-factors as ovals. To use the full range of visual Graphviz options, simply use the file created by \rr{tographviz()} as a starting point and manually edit it in a text editor before running it through \rr{dot} or one of the other Graphviz layout processors.
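-
-As noted above, comparing two structures reduces to comparing their log marginal likelihoods, the difference being the log Bayes factor. A one-line sketch using the two fits from this section:
-<>=
-dep.mod.cat$mlik - ind.mod.cat$mlik # Log Bayes factor
-@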
-
-\subsection{Fitting an additive BN model to continuous data} \label{sec3}
-We now consider models analogous to those in Section~\ref{sec2}, but where the network comprises Gaussian linear regressions rather than logistic regressions. The structure of these models again assumes independent Gaussian priors for each of the coefficients in the additive components for the mean response at each node (with hyper means = 0 and hyper variances = 1000). The Gaussian response distribution is parameterized in terms of the precision $(1/\sigma^2)$, and independent Gamma priors are used with shape=0.001 and scale=1/0.001 (where these are as defined in the \rr{rgamma} help page). By default, each variable in the data.frame is standardised to a mean of zero and a standard deviation of one; this has no effect on the identification of dependencies between variables.
-We start by dropping the categorical nodes:
-<<eval=FALSE>>=
-var33.cts <- var33[,-bin.nodes]
-dag33 <- matrix(0, 16, 16)
-colnames(dag33) <- rownames(dag33) <- names(var33.cts)
-@
-Set up the distribution list for each continuous node:
-<<eval=FALSE>>=
-mydists.cts <- list( v2 = "gaussian", v5 = "gaussian",
-    v7 = "gaussian", v8 = "gaussian", v13 = "gaussian",
-    v14 = "gaussian", v16 = "gaussian", v17 = "gaussian",
-    v22 = "gaussian", v23 = "gaussian", v24 = "gaussian",
-    v25 = "gaussian", v29 = "gaussian", v30 = "gaussian",
-    v31 = "gaussian", v33 = "gaussian")
-@
-Now fit the model defined in \rr{dag33} - full independence:
-<<eval=FALSE>>=
-ind.mod.cts <- fitabn(data.df=var33.cts, dag.m=dag33,
-    data.dists=mydists.cts, verbose=FALSE)
-@
-As default priors we use a Gaussian density with mean $0$ and variance $1000$ for each coefficient and, for the precision (the inverse of the variance), a Gamma density with hyperparameters $0.001$ and $1/0.001$.
-This is the network score (goodness of fit, log marginal likelihood):
-<<eval=FALSE>>=
-ind.mod.cts$mlik
-@
-Now fit a model with conditional dependencies, for example let \rr{v33} depend on \rr{v31}, \rr{v24} depend on \rr{v23}, and \rr{v14} depend on \rr{v13}.
-<<eval=FALSE>>=
-dag33["v33","v31"] <- 1;
-dag33["v24","v23"] <- 1;
-dag33["v14","v13"] <- 1;
-dep.mod.cts <- fitabn(data.df=var33.cts, dag.m=dag33,
-    data.dists=mydists.cts, verbose=FALSE)
-@
-The network score for the model with conditional dependencies is:
-<<eval=FALSE>>=
-dep.mod.cts$mlik
-
-tographviz(dag=dag33, data.df=var33.cts, data.dists=mydists.cts,
-    outfile="mydagcts.dot", directed=TRUE) # create file
-@
-\rr{mydagcts.dot} can then be processed with Graphviz, e.g., from a unix shell by typing \rr{dot -Tpdf mydagcts.dot -o mydagcts.pdf}, or using the GVEdit GUI on Windows.
-
-\begin{figure}[htb]
-\includegraphics{mydagcts}
-\vspace{-1.0cm}
-\caption{Directed acyclic graph \rr{dag33} for continuous variables only, created using \rr{tographviz()} and Graphviz} \label{fig11}
-\end{figure}
-
-
-\subsection{Fitting an additive BN model to mixed data} \label{sec4}
-To conclude the fitting of a single pre-specified model to data, e.g., based on expert opinion, we consider an additive BN model which contains both binary and Gaussian nodes, and hence comprises a combination of binomial (logistic) and Gaussian linear models. Again \rr{fitabn()} is used and the code is almost identical to the previous examples.
-<<eval=FALSE>>=
-dag33 <- matrix(0, 33, 33)
-colnames(dag33) <- rownames(dag33) <- names(var33)
-@
-Set up the distribution list for each node:
-<<eval=FALSE>>=
-mydists.mix <- list( v1 = "binomial", v2 = "gaussian",
-    v3 = "binomial", v4 = "binomial", v5 = "gaussian",
-    v6 = "binomial", v7 = "gaussian", v8 = "gaussian",
-    v9 = "binomial", v10 = "binomial", v11 = "binomial",
-    v12 = "binomial", v13 = "gaussian", v14 = "gaussian",
-    v15 = "binomial", v16 = "gaussian", v17 = "gaussian",
-    v18 = "binomial", v19 = "binomial", v20 = "binomial",
-    v21 = "binomial", v22 = "gaussian", v23 = "gaussian",
-    v24 = "gaussian", v25 = "gaussian", v26 = "binomial",
-    v27 = "binomial", v28 = "binomial", v29 = "gaussian",
-    v30 = "gaussian", v31 = "gaussian", v32 = "binomial",
-    v33 = "gaussian")
-@
-Now fit the model defined in \rr{dag33}, full independence:
-<<eval=FALSE>>=
-ind.mod <- fitabn(data.df=var33, dag.m=dag33,
-    data.dists=mydists.mix, verbose=FALSE)
-@
-The network score with no conditional dependencies is:
-<<eval=FALSE>>=
-ind.mod$mlik
-@
-We now fit a BN model which has the same structure as the joint distribution used to generate the data, and later create a visual graph of this model. We therefore define a model with many dependencies:
-<<eval=FALSE>>=
-dag33[2,1] <- 1;
-dag33[4,3] <- 1;
-dag33[6,4] <- 1; dag33[6,7] <- 1;
-dag33[5,6] <- 1;
-dag33[7,8] <- 1;
-dag33[8,9] <- 1;
-dag33[9,10] <- 1;
-dag33[11,10] <- 1; dag33[11,12] <- 1; dag33[11,19] <- 1;
-dag33[14,13] <- 1;
-dag33[17,16] <- 1; dag33[17,20] <- 1;
-dag33[15,14] <- 1; dag33[15,21] <- 1;
-dag33[18,20] <- 1;
-dag33[19,20] <- 1;
-dag33[21,20] <- 1;
-dag33[22,21] <- 1;
-dag33[23,21] <- 1;
-dag33[24,23] <- 1;
-dag33[25,23] <- 1; dag33[25,26] <- 1;
-dag33[26,20] <- 1;
-dag33[33,31] <- 1;
-dag33[32,21] <- 1; dag33[32,31] <- 1; dag33[32,29] <- 1;
-dag33[30,29] <- 1;
-dag33[28,27] <- 1; dag33[28,29] <- 1; dag33[28,31] <- 1;
-dep.mod <- fitabn(data.df=var33, dag.m=dag33,
-    data.dists=mydists.mix, verbose=FALSE)
-@
-
-The network score for the model with conditional dependencies is:
-<<eval=FALSE>>=
-dep.mod$mlik
-tographviz(dag=dag33, data.df=var33, data.dists=mydists.mix,
-    outfile="mydag_all.dot", directed=TRUE) # create file
-@
-
-\begin{figure}[htb]
-\includegraphics{mydag_all}
-\vspace{-1.0cm}
-\caption{Directed acyclic graph \rr{dag33} for mixed continuous and discrete variables} \label{fig12}
-\end{figure}
-
-\subsection{Model fitting validation}
-In order to validate the additive models for mixed binary and Gaussian data, estimates of the posterior distributions for the model parameters obtained using Laplace approximations were compared with those estimated using Markov chain Monte Carlo. These were always in very close agreement over the range of models and data examined. This is an indirect validation of the Laplace estimate of the network score: if the posterior densities match closely, then this implies that the denominator (the marginal likelihood - the network score) must also be accurately estimated, as a ``gold standard'' estimate of the network score is generally unavailable for such non-conjugate models.
-
-%\newpage
-\section{Further insights into searching strategy} \label{sec5}
-The key objective of the \pkg{abn} library is to enable estimation of statistical dependencies in data comprising multiple variables - that is, to find a DAG which is robust and representative of the dependency structure of the (unknown) stochastic system which generated the observed data.
-The challenge here is that with such a vast model space it is impossible to enumerate all possible DAGs, and there may be very many different DAGs with similar goodness of fit. In the next sections we first consider searching for additive (non-conjugate) models.
-
-\subsection{Single search for optimal additive BN model from categorical data}
-To run a single search heuristic use \rr{search.hillclimber()}. This commences from a randomly created DAG, which is constructed by randomly adding arcs to an empty network until all possible arcs have been tried. The function \rr{search.hillclimber()} then searches stepwise from the initial random network for an improved structure, where three stepwise operations are possible: i) add an arc; ii) remove an arc; or iii) reverse an arc. The stepwise search is subject to a number of conditions: firstly, only moves that do not generate a cycle are permitted; secondly, a parent limit is imposed which fixes the maximum number of parents each child node can have (arcs go from parent to child); and thirdly, it is possible to ban or retain arcs. If provided, \rr{banned.m} is a matrix which defines arcs that are not allowed to be considered in the search process (or in the creation of the initial random network). Similarly, \rr{retain.m} includes arcs which must always be included in any model. It is also possible to specify an explicit starting matrix, \rr{start.m}, and if using a retain matrix then \rr{start.m} should contain at least all those arcs present in \rr{retain.m}. Note that only very rudimentary checking is done to make sure that the ban, retain and start networks - if user supplied - are not contradictory.
-
-To improve the computational performance of \rr{search.hillclimber()}, a cache of all possible node-level goodness-of-fit values must be built in advance using the function \rr{buildscorecache()}. Rather than re-calculating the score for each individual node in the network (on the log scale the overall network score is the sum of the scores of the individual nodes), the score for each unique node structure found during the search is stored in the cache created by \rr{buildscorecache()}.
-
-<<eval=FALSE>>=
-bin.nodes <- c(1,3,4,6,9,10,11,12,15,18,19,20,21,26,27,28,32)
-var33.cat <- var33[,bin.nodes] # categorical nodes only
-dag33 <- matrix(0, 17, 17)
-colnames(dag33) <- rownames(dag33) <- names(var33.cat) # set names
-@
-Create empty ban and retain DAGs:
-<<eval=FALSE>>=
-banned.cat <- matrix(0, 17, 17)
-colnames(banned.cat) <- rownames(banned.cat) <- names(var33.cat)
-retain.cat <- matrix(0, 17, 17)
-colnames(retain.cat) <- rownames(retain.cat) <- names(var33.cat)
-@
-Set up the distribution list for each categorical node:
-<<eval=FALSE>>=
-mydists.cat <- list( v1 = "binomial", v3 = "binomial",
-    v4 = "binomial", v6 = "binomial", v9 = "binomial",
-    v10 = "binomial", v11 = "binomial", v12 = "binomial",
-    v15 = "binomial", v18 = "binomial", v19 = "binomial",
-    v20 = "binomial", v21 = "binomial", v26 = "binomial",
-    v27 = "binomial", v28 = "binomial", v32 = "binomial")
-@
-Build a cache of all the local computations; this information is needed later when running a model search:
-<<eval=FALSE>>=
-mycache.cat <- buildscorecache(data.df=var33.cat,
-    data.dists=mydists.cat, dag.banned=banned.cat,
-    dag.retained=retain.cat, max.parents=1)
-@
-Running a single search heuristic for an additive BN uses \rr{search.hillclimber()}, which uses the parameter prior specifications detailed above.
-Several additional arguments are available which relate to the numerical routines used in the Laplace approximation to calculate the network score. The defaults appear to work reasonably well in practice, and if it is not possible to calculate a robust value for this approximation in any model, for example due to a singular design matrix at one or more nodes, then the model is simply assigned a log network score of $-\infty$, which effectively removes it from the model search.
-
-Run a single search heuristic for an additive BN:
-<<eval=FALSE>>=
-heur.res.cat <- search.hillclimber(score.cache=mycache.cat,
-    num.searches=1, seed=0, verbose=FALSE,
-    trace=FALSE, timing.on=FALSE)
-@
-If \rr{trace=TRUE} is set, the majority consensus network is plotted as the searches progress.
-
-\subsection{Single search for optimal additive BN model for continuous data} \label{sec4a}
-As above, but for a network of Gaussian nodes.
-<<eval=FALSE>>=
-var33.cts <- var33[,-bin.nodes] # drop categorical nodes
-dag33 <- matrix(0, 16, 16)
-colnames(dag33) <- rownames(dag33) <- names(var33.cts) # set names
-banned.cts <- matrix(0, 16, 16)
-colnames(banned.cts) <- rownames(banned.cts) <- names(var33.cts)
-retain.cts <- matrix(0, 16, 16)
-colnames(retain.cts) <- rownames(retain.cts) <- names(var33.cts)
-@
-Set up the distribution list for each continuous node:
-<<eval=FALSE>>=
-mydists.cts <- list( v2 = "gaussian", v5 = "gaussian",
-    v7 = "gaussian", v8 = "gaussian", v13 = "gaussian",
-    v14 = "gaussian", v16 = "gaussian", v17 = "gaussian",
-    v22 = "gaussian", v23 = "gaussian", v24 = "gaussian",
-    v25 = "gaussian", v29 = "gaussian", v30 = "gaussian",
-    v31 = "gaussian", v33 = "gaussian")
-@
-Build a cache of all the local computations; this information is needed later when running a model search:
-<<eval=FALSE>>=
-mycache.cts <- buildscorecache(data.df=var33.cts,
-    data.dists=mydists.cts, dag.banned=banned.cts,
-    dag.retained=retain.cts, max.parents=1)
-@
-Run a single search heuristic for an additive BN:
-<<eval=FALSE>>=
-heur.res.cts <- search.hillclimber(score.cache=mycache.cts,
-    num.searches=1, seed=0, verbose=FALSE,
-    trace=FALSE, timing.on=FALSE)
-@
-If \rr{trace=TRUE} is set, the majority consensus network is plotted as the searches progress.
-
-\subsection{Single search for optimal additive BN model for mixed data}
-
-Model searching for mixed data is again very similar to the previous examples. Note that in this example the parameter priors are specified explicitly (although those given are the same as the defaults). The \rr{+1} in the hyperparameter specification is because a constant term is included in the additive formulation for each node.
-<<eval=FALSE>>=
-dag33 <- matrix(0, 33, 33)
-colnames(dag33) <- rownames(dag33) <- names(var33) # set names
-@
-Create empty ban and retain DAGs:
-<<eval=FALSE>>=
-banned.mix <- matrix(0, 33, 33)
-colnames(banned.mix) <- rownames(banned.mix) <- names(var33)
-retain.mix <- matrix(0, 33, 33)
-colnames(retain.mix) <- rownames(retain.mix) <- names(var33)
-@
-Set up the distribution list for each node:
-<<eval=FALSE>>=
-mydists.mix <- list( v1 = "binomial", v2 = "gaussian",
-    v3 = "binomial", v4 = "binomial", v5 = "gaussian",
-    v6 = "binomial", v7 = "gaussian", v8 = "gaussian",
-    v9 = "binomial", v10 = "binomial", v11 = "binomial",
-    v12 = "binomial", v13 = "gaussian", v14 = "gaussian",
-    v15 = "binomial", v16 = "gaussian", v17 = "gaussian",
-    v18 = "binomial", v19 = "binomial", v20 = "binomial",
-    v21 = "binomial", v22 = "gaussian", v23 = "gaussian",
-    v24 = "gaussian", v25 = "gaussian", v26 = "binomial",
-    v27 = "binomial", v28 = "binomial", v29 = "gaussian",
-    v30 = "gaussian", v31 = "gaussian", v32 = "binomial",
-    v33 = "gaussian")
-@
-Build a cache of all the local computations; this information is needed later when running a model search:
-<<eval=FALSE>>=
-mycache.mix <- buildscorecache(data.df=var33,
-    data.dists=mydists.mix, dag.banned=banned.mix,
-    dag.retained=retain.mix, max.parents=1)
-@
-Run a single search heuristic for an additive BN:
-<<eval=FALSE>>=
-heur.res.mix <- search.hillclimber(score.cache=mycache.mix,
-    num.searches=1, seed=0, verbose=FALSE,
-    trace=FALSE, timing.on=FALSE)
-@
-If \rr{trace=TRUE} is set, the majority consensus network is plotted as the searches progress.
-
-\subsection{Multiple Search Strategies}
-To estimate a robust additive BN for a given dataset it is necessary to run many searches and then summarize their results. The function \rr{search.hillclimber()} with \rr{num.searches>1} runs multiple searches; a single joint node cache, built with \rr{buildscorecache()}, is used over all of them.
-
-Conceptually it may seem more efficient to use one global node cache to allow node information to be shared between different searches; however, in practice, as the search space is so vast for some problems, this can result in extremely \emph{slow} searches. As the cache becomes larger it can take much more time to search it (and it may need to be searched a very large number of times) than to simply perform the appropriate numerical computation. Profiling using the Google performance tool google-pprof suggests that more than 80\% of the computation time may be taken up by lookups. When starting searches from different random places in the model space, the number of individual node structures in common between any two searches, relative to the total number of different node structures searched over, can be very small, meaning a common node cache is inefficient. This may not be the case when the starting networks are relatively similar.
-
-To help with performance monitoring it is possible to turn on timings using \rr{timing.on=TRUE}, which then outputs the number of seconds of CPU time each individual search takes (using standard libc functions declared in time.h).
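-For example, a minimal sketch (reusing \rr{mycache.mix} built above) that prints the per-search CPU timings:
-<<eval=FALSE>>=
-search.hillclimber(score.cache=mycache.mix, num.searches=2,
-    seed=0, verbose=FALSE, trace=FALSE, timing.on=TRUE)
-@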
-
-The full multiple-search example is as follows:
-<<eval=FALSE>>=
-dag33 <- matrix(0, 33, 33)
-colnames(dag33) <- rownames(dag33) <- names(var33) # set names
-@
-Create empty ban and retain DAGs:
-<<eval=FALSE>>=
-banned.mix <- matrix(0, 33, 33)
-colnames(banned.mix) <- rownames(banned.mix) <- names(var33)
-retain.mix <- matrix(0, 33, 33)
-colnames(retain.mix) <- rownames(retain.mix) <- names(var33)
-@
-Set up the distribution list for each node:
-<<eval=FALSE>>=
-mydists.mix <- list( v1 = "binomial", v2 = "gaussian",
-    v3 = "binomial", v4 = "binomial", v5 = "gaussian",
-    v6 = "binomial", v7 = "gaussian", v8 = "gaussian",
-    v9 = "binomial", v10 = "binomial", v11 = "binomial",
-    v12 = "binomial", v13 = "gaussian", v14 = "gaussian",
-    v15 = "binomial", v16 = "gaussian", v17 = "gaussian",
-    v18 = "binomial", v19 = "binomial", v20 = "binomial",
-    v21 = "binomial", v22 = "gaussian", v23 = "gaussian",
-    v24 = "gaussian", v25 = "gaussian", v26 = "binomial",
-    v27 = "binomial", v28 = "binomial", v29 = "gaussian",
-    v30 = "gaussian", v31 = "gaussian", v32 = "binomial",
-    v33 = "gaussian")
-n.searches <- 10
-@
-The number $10$ is for illustration only; in practice it should be much larger.
-Set the parent limit:
-<<eval=FALSE>>=
-max.par <- 1
-@
-We allow only one parent per node because otherwise building the cache with \rr{buildscorecache()} takes some minutes.
-Now we build the cache:
-<<eval=FALSE>>=
-mycache.mix <- buildscorecache(data.df=var33, data.dists=mydists.mix,
-    dag.banned=banned.mix, dag.retained=retain.mix, max.parents=max.par)
-@
-Now run the searches (set \rr{trace=TRUE} to have the majority consensus network plotted as the searches progress):
-<<eval=FALSE>>=
-myres.mlp <- search.hillclimber(score.cache=mycache.mix,
-    num.searches=n.searches, seed=0, verbose=FALSE,
-    trace=FALSE, timing.on=FALSE)
-@
-\subsection{Creating a Summary Network: Majority Consensus}\label{mcn}
-Having run many heuristic searches, the next challenge is to summarise the results so that the joint dependencies most supported by the data can readily be identified. One common and very simple approach is to produce a single robust BN model of the data, mimicking the approach used in phylogenetics to create majority consensus trees. A majority consensus DAG is constructed from all the arcs present in at least 50\% of the locally optimal DAGs found by the search heuristics. This creates a single summary network. Combining results from different runs of \rr{search.hillclimber()} is straightforward, although note that it is necessary to check for duplicate random starting networks; while highly unlikely, this is theoretically possible. The following code provides a simple way to produce a majority consensus network, and Figure~\ref{fig13} shows the resulting network - note that this is an example only, and many thousands of searches may need to be conducted to achieve robust results. One simple ad-hoc method for assessing how many searches are needed is to run a number of searches, split the results into two (random) groups, and calculate the majority consensus network within each group; if these are the same then it suggests that sufficient searches have been run (a sketch of this check is given below).
-To plot the majority consensus network use the result of the function \rr{search.hillclimber()}, as in the example following the sketch.
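-A minimal sketch of this stability check, assuming the locally optimal DAGs from the individual searches are available as a list of binary matrices - here called \rr{dags.list}, an illustrative name, since the exact slot in the \rr{search.hillclimber()} return value may differ between versions of \pkg{abn}:
-<<eval=FALSE>>=
-## majority consensus: arcs present in at least 50% of the DAGs
-maj <- function(dags) (Reduce(`+`, dags)/length(dags)) >= 0.5
-grp1 <- dags.list[seq(1, length(dags.list), by=2)] # first random half
-grp2 <- dags.list[seq(2, length(dags.list), by=2)] # second half
-identical(maj(grp1), maj(grp2)) # TRUE suggests enough searches were run
-@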
-
-\begin{figure}[htb]
-\includegraphics{dagcon}
-\vspace{-1.0cm}
-\caption{Example majority consensus network (from the results of only 10 searches)} \label{fig13}
-\end{figure}
-
-<<eval=FALSE>>=
-tographviz(dag=myres.mlp$consensus, data.df=var33,
-    data.dists=mydists.mix, outfile="dagcon.dot") # create file
-@
-\rr{dagcon.dot} can then be processed with Graphviz, e.g., from a unix shell by typing \rr{dot -Tpdf dagcon.dot -o dagcon.pdf}, or using the GVEdit GUI on Windows.
-
-\subsection{Creating a Summary Network: Pruning}
-Rather than use the majority consensus network as the most appropriate model of the data, an alternative approach is to choose the single best model found during a large number of searches. To determine whether sufficient heuristic searches have been run to provide reasonable coverage of all the features of the model landscape, again checking for a stable majority consensus network, as in Section~\ref{mcn}, seems a sensible approach. Once the best overall DAG has been identified, the next task is to check this model for over-fitting. Unlike the majority consensus network, which effectively ``averages'' over many different competing models and therefore should generally comprise only robust structural features, the DAG from a single model search is far more likely to contain some spurious features. When dealing with smaller data sets, say of several hundred observations, this is extremely likely, as can easily be demonstrated using simulated data. A simple assessment of over-fitting can be made by comparing the number of arcs in the majority consensus network with the number of arcs in the best fitting model. We have found that in larger data sets the majority consensus and best fitting models can be almost identical, while in smaller data sets the best fitting models may have many more arcs - suggesting a degree of over-fitting.
-
-An advantage of choosing a DAG from an individual search is that, unlike averaging over many different structures as in the construction of a majority consensus network, the model chosen has a structure which was actually found during a search across the model landscape. In contrast, the majority consensus network is a derived model which may never have been found during even an exhaustive search; indeed it may even comprise contradictory features, a usual risk when averaging over different explanations (models) of the data. In addition, a majority consensus network need not even be acyclic, although in practice this can easily be corrected by reversing one or more arcs to produce an appropriate DAG.
-
-A simple compromise between the risk of over-fitting in choosing the single highest scoring DAG, and the risk of inappropriately averaging across distinct data generating processes, is to prune the highest scoring DAG using the majority consensus model. In short, take the element-by-element product of the two DAG matrices (in \proglang{R} simply \rr{best.dag * consensus.dag}, where the object names are illustrative), which gives a new DAG containing only the structural features present in \emph{both} models.
-
-\section{Summary}
-The \pkg{abn} library provides a range of Bayesian network models to assist with identifying statistical dependencies in complex data, in particular models which are multidimensional analogues of generalised linear models. This process is typically referred to as structure learning, or structure discovery, and is computationally extremely challenging. Heuristics are the only option for data comprising larger numbers of variables.
-As with all model selection, over-modelling is an ever-present danger, and using either: i) summary models comprising structural features present in many locally optimal models; or ii) parametric bootstrapping to determine the robustness of the features in a single locally optimal model, is likely essential to provide robust results. An alternative presented was exact order-based searches, in particular finding the globally most probable structure. This approach is appealing as it is exact but, despite collapsing DAGs into orderings, it may not be feasible for larger-scale problems.
-For further in-depth material on \pkg{abn} refer to the website: \href{http://www.r-bayesian-networks.org}{www.r-bayesian-networks.org}.
-
-\newpage
-\bibliography{abn}
-
-\end{document}
\ No newline at end of file
diff -Nru abn-1.2/vignettes/abn_v1.3.pdf.asis abn-1.3/vignettes/abn_v1.3.pdf.asis
--- abn-1.2/vignettes/abn_v1.3.pdf.asis 1970-01-01 00:00:00.000000000 +0000
+++ abn-1.3/vignettes/abn_v1.3.pdf.asis 2018-11-19 14:43:19.000000000 +0000
@@ -0,0 +1,2 @@
+%\VignetteIndexEntry{ABN - Vignette}
+%\VignetteEngine{R.rsp::asis}
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/vignettes/Bootstrapping.png and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/vignettes/Bootstrapping.png differ
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/vignettes/ComparisonOfNetworkScore.pdf and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/vignettes/ComparisonOfNetworkScore.pdf differ
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/vignettes/dagcon.pdf and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/vignettes/dagcon.pdf differ
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/vignettes/dagcon.png and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/vignettes/dagcon.png differ
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/vignettes/DAG_cycle.pdf and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/vignettes/DAG_cycle.pdf differ
diff -Nru abn-1.2/vignettes/map1_10var.dot abn-1.3/vignettes/map1_10var.dot
--- abn-1.2/vignettes/map1_10var.dot 2018-08-02 13:04:43.000000000 +0000
+++ abn-1.3/vignettes/map1_10var.dot 1970-01-01 00:00:00.000000000 +0000
@@ -1,25 +0,0 @@
-digraph dag {
-
-"D1"[shape=square];
-"D2"[shape=square];
-"D3"[shape=square];
-"D4"[shape=square];
-"D5"[shape=square];
-"D6"[shape=square];
-"D7"[shape=square];
-"D8"[shape=square];
-"Loc.x"[shape=oval];
-"Loc.y"[shape=oval];
-
-
-
-"D2"->"D1";
-"D3"->"D2";
-"D4"->"D3";
-"D4"->"D6";
-"D6"->"D5";
-"D7"->"Loc.y";
-"Loc.x"->"D4";
-"Loc.x"->"D7";
-
-}
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/vignettes/map1_10var.pdf and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/vignettes/map1_10var.pdf differ
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/vignettes/map1_10var.png and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/vignettes/map1_10var.png differ
diff -Nru abn-1.2/vignettes/map_1par.dot abn-1.3/vignettes/map_1par.dot
--- abn-1.2/vignettes/map_1par.dot 2018-08-02 13:04:43.000000000 +0000
+++ abn-1.3/vignettes/map_1par.dot 1970-01-01 00:00:00.000000000 +0000
@@ -1,32 +0,0 @@
-digraph dag {
-
-"D1"[shape=square];
-"D2"[shape=square];
-"D3"[shape=square];
-"D4"[shape=square];
-"D5"[shape=square];
-"D6"[shape=square];
-"D7"[shape=square];
-"D8"[shape=square];
-"D9"[shape=square];
-"D10"[shape=square];
-"Year"[shape=oval];
-"Loc.x"[shape=oval];
-"Loc.y"[shape=oval];
-
-
-
-"D2"->"D1";
-"D2"->"D9";
-"D3"->"D2";
-"D4"->"D3";
-"D4"->"D6";
-"D6"->"D5";
-"D7"->"Loc.x";
-"D7"->"Loc.y";
-"D9"->"D10";
-"D10"->"D8";
-"Year"->"D7";
-"Loc.x"->"D4";
-
-}
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/vignettes/map_1par.pdf and
/tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/vignettes/map_1par.pdf differ
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/vignettes/MargPlots_PigsData.pdf and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/vignettes/MargPlots_PigsData.pdf differ
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/vignettes/mydag_all.pdf and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/vignettes/mydag_all.pdf differ
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/vignettes/mydagcts.pdf and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/vignettes/mydagcts.pdf differ
diff -Nru abn-1.2/vignettes/mydag.dot abn-1.3/vignettes/mydag.dot
--- abn-1.2/vignettes/mydag.dot 2018-08-02 13:04:43.000000000 +0000
+++ abn-1.3/vignettes/mydag.dot 1970-01-01 00:00:00.000000000 +0000
@@ -1,27 +0,0 @@
-digraph dag {
-
-"v1"[shape=square];
-"v3"[shape=square];
-"v4"[shape=square];
-"v6"[shape=square];
-"v9"[shape=square];
-"v10"[shape=square];
-"v11"[shape=square];
-"v12"[shape=square];
-"v15"[shape=square];
-"v18"[shape=square];
-"v19"[shape=square];
-"v20"[shape=square];
-"v21"[shape=square];
-"v26"[shape=square];
-"v27"[shape=square];
-"v28"[shape=square];
-"v32"[shape=square];
-
-
-
-"v3"->"v4";
-"v10"->"v11";
-"v12"->"v11";
-
-}
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/vignettes/mydag.pdf and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/vignettes/mydag.pdf differ
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/vignettes/PigsArea.png and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/vignettes/PigsArea.png differ
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/vignettes/Pigs_PostBootPlots.pdf and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/vignettes/Pigs_PostBootPlots.pdf differ
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/vignettes/plbinaryNode.png and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/vignettes/plbinaryNode.png differ
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/vignettes/postbootpigs.pdf and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/vignettes/postbootpigs.pdf differ
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/vignettes/Summary.png and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/vignettes/Summary.png differ
Binary files /tmp/tmp0FTslE/Kq6EGrkAYF/abn-1.2/vignettes/var33_MASTER.pdf and /tmp/tmp0FTslE/9Pv3DVN82_/abn-1.3/vignettes/var33_MASTER.pdf differ