## bclust package library("bclust") ######## Fitting toy data related to the section "Clustering toy example" # set seed, load required libraries set.seed(150) library("MASS") # create bivariate toy data x <- rbind(mvrnorm (20, c(0, 0), matrix(c(1, 0.9, 0.9, 1), 2, 2)), mvrnorm(20, c(4, 0), matrix(c(1, -0.9, -0.9, 1), 2, 2))) # scatter plot of the bivariate data cluster.labels <- paste(c(rep(1, 20), rep(2, 20))) plotcol <- rep(c(2, 4), each = 20) plot (x ,cex = 1.5, xlab = "Variable 1", ylab = "Variable 2", pch = cluster.labels, col = plotcol ) # cluster the bivariate data cluster.obj <- bclust(x, transformed.par = c(0, -50, log(16), 0, 0, 0), labels = cluster.labels) plot (cluster.obj) abline (h = cluster.obj$cut, lty = 2, col = "gray", lwd = 3) # add 98 standard Gaussian noise variables and cluster the new high-dimensional data x <- cbind(x, matrix(rnorm(3920), 40, 98)) cluster.obj <- bclust(x, transformed.par = c (0, -50, log(16), 0, 0, 0), labels = cluster.labels) # plot the clustering result plotcol <- rep(c(2, 4), each = 20) ditplot (cluster.obj, plot.width = 12, horizbar.distance = 0, dendrogram.lwd = 2, xlab.cex = 0.6, ylab.cex = 0.6, vertbar = plotcol, vertbar.col = plotcol, ylab.mar = 0) ######## Fitting simulated data related to the section "Clustering toy example" # load simulated Gaussian data (possibly adapt path of simG.rda) load("simG.rda") # define replications of a clustering individual x.id <- rep(1:10, each = 4) # calculate mean and corrected sum of squares of a clustering individual meansumsq <- meancss(x, x.id) # define a function to estimate model hyperparameters optimfunc <- function(phi) { -loglikelihood(x.mean = meansumsq$mean, x.css = meansumsq$css, repno = meansumsq$repno, transformed.par = phi) } # estimate hyperparameters and save them into x.tpar x.tpar <- optim(rep(0, 6), optimfunc, method = "BFGS")$par # cluster with the estimated hyperparameter values bclust.obj <- bclust(x, rep.id = x.id, transformed.par = x.tpar) # show the clustering result dptplot(bclust.obj, scale = 20, horizbar.plot = TRUE, varimp = imp(bclust.obj)$var, horizbar.distance = 0, dendrogram.lwd = 2) # show variable importances viplot(imp(bclust.obj)$var, col = as.numeric(imp(bclust.obj)$var > 0) * 2) ######## Fitting Gaelle data related to the section "Clustering real data" # load bclust library, load gaelle data, save data into x data("gaelle") x <- gaelle # define replications of a clustering individual x.id <- rep(1:14, c(3, rep(4, 13))) # calculate mean and corrected sum of squares of a clustering individual meansumsq <- meancss(x, x.id) # define a function to estimate the model hyperparameters optimfunc <- function(phi) { -loglikelihood(x.mean = meansumsq$mean, x.css = meansumsq$css, repno = meansumsq$repno, transformed.par = phi, var.select = FALSE) } # estimate the hyperparameters and save them into xinit.tpar xinit.tpar <- optim(rep(0, 5), optimfunc, method = "BFGS")$par # define another function to estimate the hyperparameters of the variable selection model optimfunc<-function(phi) { -loglikelihood(x.mean = meansumsq$mean, x.css = meansumsq$css, repno = meansumsq$repno, transformed.par = c(xinit.tpar[1:4], phi)) } # write the hyperparameters into x.tpar x.tpar <- c(xinit.tpar[1:4], optim(rep(0, 2), optimfunc, method = "BFGS")$par) # defines labels for the clustering individuals x.labels <- c ("ColWT", "d172", "d263", "isa2", "sex4", "dpe2", "mex1", "sex3", "pgm", "sex1", "WsWT", "tpt", "RLDWT", "ke103") # discriminate without variable selection bdiscrim(training = x[-(1:3), ], training.id = (x.id[-(1:3)]-1), training.labels = x.labels[-1], predict = x[1:3,], transformed.par = xinit.tpar, var.select = FALSE)$probs * 100 # discriminate with variable selection bdiscrim(training = x[-(1:3), ], training.id = (x.id[-(1:3)]-1), training.labels = x.labels[-1], predict = x[1:3, ], transformed.par = x.tpar)$probs * 100 # cluster with variable selection model bclust.obj <- bclust(x, rep.id = x.id, transformed.par = x.tpar, labels = x.labels) # show clustering result dptplot(bclust.obj, scale = 10, horizbar.plot = TRUE, varimp = imp(bclust.obj)$var, horizbar.distance = 5, dendrogram.lwd = 2)