5 Main Results
The predictive performance of the models is compared on both the simulation dataset and the real dataset. In addition, the simulation dataset allows us to compare the coefficient estimates produced by the three different methods for the logistic regression model.
To generate the simulation data, we use a logistic model with predictor variables drawn from a standard normal distribution, a binary response variable, and randomly generated regression coefficients.
Firstly, we generate a vector of length \(100 \times 9\) from a standard normal distribution and reshape it into a matrix with \(100\) rows and \(9\) columns, yielding \(9\) independent predictor variables, which satisfies the independence assumption required by the naive Bayes method.
Next, we add an intercept column to the design matrix and include standard normal noise in the linear predictor. The regression coefficients are drawn from a uniform distribution \(U(0,5)\) and multiplied with the design matrix; applying the logistic link then yields the true probabilities.
Finally, the response variable \(y\) is generated with the rbinom function (one Bernoulli trial per observation) using the probabilities obtained above.
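A minimal sketch of this generating process is given below. The sample size of 100, the nine standard normal predictors, and the \(U(0,5)\) coefficients follow the description above; the exact intercept value, the placement of the noise term, and the seed are assumptions made only for illustration.
# Sketch of the simulation design described above (intercept value, noise
# placement, and seed are illustrative assumptions).
set.seed(2023)
n <- 100; p <- 9
X <- matrix(rnorm(n * p), nrow = n, ncol = p)   # 9 independent N(0,1) predictors
beta <- runif(p, min = 0, max = 5)              # coefficients drawn from U(0, 5)
eta <- 1 + X %*% beta + rnorm(n)                # intercept + signal + N(0,1) noise
prob <- 1 / (1 + exp(-eta))                     # true probabilities via the logistic link
y <- rbinom(n, size = 1, prob = prob)           # one Bernoulli trial per observation
sim_data <- data.frame(X, Classification = y)
# write.csv(sim_data, "Simulation.csv")         # saved in the form read by the code below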
Firstly, the simulation dataset and the real dataset are each divided into training and validation sets in a 3:1 ratio. The training set for the simulation dataset contains 75 samples, while the training set for the real dataset contains 87 samples. Since the prediction problem is a binary classification task and the datasets are not imbalanced, we select three metrics to evaluate model performance: the time cost from fitting the model to producing predicted probabilities, the prediction accuracy, and the AUC. The formulas for accuracy and AUC are as follows:
\[ \text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN} \]
\[ \text{AUC} = \frac{\sum I\left(P_{\text{positive}} > P_{\text{negative}}\right)}{M \times N} \]
where the sum runs over all positive-negative pairs, \(M\) and \(N\) are the numbers of positive and negative samples, and \(I(\cdot)\) equals one when the predicted probability of the positive sample exceeds that of the negative sample.
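To make this pairwise definition concrete, the following sketch computes the AUC directly from predicted probabilities and true labels. The helper pairwise_auc is illustrative only (the analysis code below uses pROC::roc), and counting ties as one half is our own convention.
# Illustrative implementation of the pairwise AUC definition above
# (pairwise_auc is a hypothetical helper, not part of the analysis code).
pairwise_auc <- function(prob, label) {
  pos <- prob[label == 1]
  neg <- prob[label == 0]
  # compare every positive-negative pair; ties count as 1/2
  comparisons <- outer(pos, neg, FUN = function(p, q) (p > q) + 0.5 * (p == q))
  sum(comparisons) / (length(pos) * length(neg))
}
# Example: should agree with pROC::roc() up to the handling of ties
# pairwise_auc(c(0.9, 0.4, 0.7), c(1, 0, 1))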
Next, we fit the Naive Bayes model, the logistic regression models, and the decision tree model on the training set, make predictions on the validation set, and compare the performance of the models. To make the comparison more reliable, we apply the repeated holdout method with 20 repetitions. To ensure that the different models use identical training and validation sets, we set the random seed to \(i\) in the \(i\)-th iteration, for \(i = 1, \dots, 20\).
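The code in the following subsections assumes that the packages below are attached. This list is inferred from the functions called (it is not stated explicitly in the original code), so it should be adjusted to the actual analysis environment.
# Packages assumed to be attached throughout Section 5 (inferred from the
# function calls below; adjust to the actual environment).
library(caret)     # preProcess() and its predict() method for standardization
library(hmclearn)  # hmc() for Hamiltonian Monte Carlo
library(pROC)      # roc(), auc()
library(rjags)     # jags.model(), coda.samples(); also attaches coda
library(e1071)     # naiveBayes()
library(rpart)     # rpart() for decision trees
library(ggplot2)   # plotting
library(ggsci)     # scale_color_lancet(), scale_fill_lancet()
library(dplyr)     # data manipulation
library(tidyr)     # gather() for reshaping results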
5.1 Simulation Study
5.1.1 Logistic Regression
5.1.1.1 HMC-Logistic Regression
# Log posterior of the logistic regression model with independent
# N(0, sig2beta) priors on the coefficients (passed to hmclearn::hmc below)
logistic_posterior <- function(theta, y, X, sig2beta = 1e8) {
  k <- length(theta)
  beta_param <- as.numeric(theta)
  onev <- rep(1, length(y))
  # Bernoulli log-likelihood in matrix form
  ll_bin <- t(beta_param) %*% t(X) %*% (y - 1) -
    t(onev) %*% log(1 + exp(-X %*% beta_param))
  # log-likelihood plus the Gaussian log-prior (up to a constant)
  result <- ll_bin - 1/2 * t(beta_param) %*% beta_param / sig2beta
  return(result)
}
# Gradient of the log posterior with respect to the coefficient vector
g_logistic_posterior <- function(theta, y, X, sig2beta = 1e8) {
  n <- length(y)
  k <- length(theta)
  beta_param <- as.numeric(theta)
  result <- t(X) %*% (y - 1 + exp(-X %*% beta_param) /
                        (1 + exp(-X %*% beta_param))) - beta_param / sig2beta
  return(result)
}
# HMC logistic regression for the simulation data: standardize with the
# training-set statistics, sample the posterior with hmclearn::hmc, and
# evaluate on the validation set
hmc_logistic_s <- function(Train,Test){
t1.lg <- Sys.time()
stand_range <- preProcess(Train[,-10])
Train_scaled <- cbind(predict(stand_range ,Train[,-10]),Train[,10])
Test_scaled <- cbind(predict(stand_range ,Test[,-10]), Test[,10])
colnames(Train_scaled)[10] <- "Classification"
colnames(Test_scaled)[10] <- "Classification"
X_train <- model.matrix(Classification ~ Age + BMI + Glucose + Insulin +
HOMA + Leptin + Adiponectin + Resistin + MCP.1-1,
data = Train_scaled)
X_train <- cbind(1,X_train)   # prepend the intercept column
y_train <- Train$Classification
N <- 10000
continuous_ind <- c(rep(TRUE, 10))
eps_vals <- ifelse(continuous_ind, 0.1, 1e-3)
fm2_hmc <- hmc(N, theta.init = rep(1, 10),
epsilon = eps_vals, L = 20,
logPOSTERIOR = logistic_posterior,
glogPOSTERIOR = g_logistic_posterior,
param = list(y = y_train, X = X_train),
varnames = colnames(X_train),
chains = 2, parallel = FALSE)
result <- summary(fm2_hmc, burnin=2000)
param1 <- result[,4][1]
param2 <- result[,4][2]
param3 <- result[,4][3]
param4 <- result[,4][4]
param5 <- result[,4][5]
param6 <- result[,4][6]
param7 <- result[,4][7]
param8 <- result[,4][8]
param9 <- result[,4][9]
param10 <- result[,4][10]
sigmoid <- function(x) {
1 / (1 + exp(-x))
}
data1 <- as.data.frame(Test_scaled)
linear_pred <- param1 + param2 * data1$Age + param3 * data1$BMI + param4 * data1$Glucose +
param5 * data1$Insulin + param6 * data1$HOMA + param7 * data1$Leptin +
param8 * data1$Adiponectin + param9 * data1$Resistin + param10 * data1$MCP.1
probabilities <- sigmoid(linear_pred)
t2.lg <- Sys.time()
pred.class <- as.integer(probabilities > 0.5)
cft <- table(pred.class, Test$Classification)
accu <- (cft[1,1]+cft[2,2])/nrow(Test)
time_cost <- t2.lg - t1.lg
AUC <- roc(probabilities,response = Test$Classification)$auc
acpt_rate <- sum(fm2_hmc$accept/N)/2   # mean acceptance rate over the two chains
# flag parameters whose Gelman-Rubin statistic (the last column of the
# summary) suggests a convergence problem
t <- summary(fm2_hmc, burnin=3000, probs=c(0.025, 0.5, 0.975))
for (i in t[,4]) {
if (i >= 1.1){
print('Convergence Problem')
}
}
return(list(prob = probabilities,accuracy = accu,AUC = AUC,acceptance_rate = acpt_rate,time_cost = time_cost))
}
data1 <- read.csv('dataR2.csv')      # real dataset (used here for its variable names)
data2 <- read.csv('Simulation.csv')  # simulated dataset
data2 <- data2[,2:11]                # drop the index column
colnames(data2) <- colnames(data1)   # reuse the real-data variable names
accuracy_list <- list()
auc_list <- list()
time_list <- list()
acpt_rate_list <- list()
pred_prob_df <- data.frame(matrix(ncol = 0, nrow = 25))
for (i in c(1:20)) {
set.seed(i)
train <- sample(1:nrow(data2),nrow(data2)*3/4,replace = FALSE)
Train <- data2[train,]
Test <- data2[-train,]
results_sim <- hmc_logistic_s(Train,Test)
accuracy_list[[i]] <- results_sim$accuracy
auc_list[[i]] <-results_sim$AUC
time_list[[i]] <- results_sim$time_cost
acpt_rate_list[[i]] <- results_sim$acceptance_rate
pred_prob_df <- cbind(pred_prob_df, results_sim$prob)
}
accuracy_df <- t(data.frame(accuracy = unlist(accuracy_list)))
rownames(accuracy_df) <- "hmc_accuracy"
colnames(accuracy_df) <- paste0("iteration_", 1:20)
auc_df <- t(data.frame(AUC = unlist(auc_list)))
rownames(auc_df) <- "hmc_AUC"
colnames(auc_df) <- paste0("iteration_", 1:20)
time_df <- t(data.frame(time = unlist(time_list)))
rownames(time_df) <- "hmc_time"
colnames(time_df) <- paste0("iteration_", 1:20)
acpt_rate_df <- t(data.frame(acpt_rate = unlist(acpt_rate_list)))
rownames(acpt_rate_df) <- "hmc_acpt_rate"
colnames(acpt_rate_df) <- paste0("iteration_", 1:20)
colnames(pred_prob_df) <- paste0("iteration_", 1:20)
write.csv(pred_prob_df,"simu_pred_prob_df.csv")
write.csv(acpt_rate_df,"simu_acpt_rate_df.csv")
write.csv(time_df,"simu_time_df.csv")
write.csv(auc_df,"simu_auc_df.csv")
write.csv(accuracy_df,"simu_accuracy_df.csv")
# Refit on the full simulation dataset to obtain the HMC coefficient
# estimates compared in Section 5.1.1.4
data3 <- read.csv('Simulation.csv')
data3 <- data3[,2:11]
colnames(data3) <- c('X1','X2','X3','X4','X5','X6','X7','X8','X9','Classification')
X_train <- model.matrix(Classification ~ .-1,
data = data3)
X_train <- cbind(1,scale(X_train))
y_train <- data3$Classification
N <- 10000
continuous_ind <- c(rep(TRUE, 10))
eps_vals <- ifelse(continuous_ind, 0.1, 1e-3)
fm2_hmc <- hmc(N, theta.init = rep(1, 10),
epsilon = eps_vals, L = 20,
logPOSTERIOR = logistic_posterior,
glogPOSTERIOR = g_logistic_posterior,
param = list(y = y_train, X = X_train),
varnames = colnames(X_train),
chains = 2, parallel = FALSE)
result <- summary(fm2_hmc, burnin=2000)
t <- summary(fm2_hmc, burnin=3000, probs=c(0.025, 0.5, 0.975))
for (i in t[,4]) {
if (i >= 1.1){
print('Convergence Problem')
}
}
5.1.1.2 Frequentist-Logistic Regression
frequentist_logistic <- function(Train,Test){
t1.lg <- Sys.time()
stand_range <- preProcess(Train[,-10])
Train_scaled <- cbind(predict(stand_range ,Train[,-10]),Train[,10])
Test_scaled <- cbind(predict(stand_range ,Test[,-10]), Test[,10])
colnames(Train_scaled) <- c("Age","BMI","Glucose","Insulin","HOMA" ,"Leptin","Adiponectin","Resistin","MCP.1",'Classification')
colnames(Test_scaled) <- c("Age","BMI","Glucose","Insulin","HOMA" ,"Leptin","Adiponectin","Resistin","MCP.1",'Classification')
X_train <- model.matrix(Classification ~ Age + BMI + Glucose + Insulin +
HOMA + Leptin + Adiponectin + Resistin + MCP.1-1,
data = Train_scaled)
X_train <- cbind(1,X_train)
y_train <- Train$Classification
colnames(X_train) <- c("Intercept","Age","BMI","Glucose","Insulin","HOMA" ,"Leptin","Adiponectin", "Resistin","MCP.1")
freq_model <- glm(y_train ~ Age + BMI + Glucose + Insulin +
HOMA + Leptin + Adiponectin + Resistin +
MCP.1-Intercept, family = binomial(),data = as.data.frame(X_train))
X_test <- model.matrix(Classification ~ Age + BMI + Glucose + Insulin +
HOMA + Leptin + Adiponectin + Resistin + MCP.1-1,
data = Test_scaled)
X_test <- cbind(1,X_test)
X_test <- as.data.frame(X_test)   # keep default row names rather than hard-coding the validation-set size
colnames(X_test) <- c("Intercept","Age","BMI","Glucose","Insulin","HOMA" ,"Leptin","Adiponectin","Resistin","MCP.1")
probabilities <- predict(freq_model, X_test, type="response")
t2.lg <- Sys.time()
pred.class <- as.integer(probabilities > 0.5)
cft <- table(pred.class, Test$Classification)
accu <- (cft[1,1]+cft[2,2])/nrow(Test)
time_cost <- t2.lg - t1.lg
AUC <- roc(probabilities,response = Test$Classification)$auc
return(list(prob = probabilities,accuracy = accu,AUC = AUC,time_cost = time_cost))
}
data1 <- read.csv('dataR2.csv')      # real dataset (used here for its variable names)
data2 <- read.csv('Simulation.csv')
data2 <- data2[,2:11]
colnames(data2) <- colnames(data1)
accuracy_list <- list()
auc_list <- list()
time_list <- list()
# acpt_rate_list <- list()
pred_prob_df <- data.frame(matrix(ncol = 0, nrow = 25))
for (i in c(1:20)) {
set.seed(i)
train <- sample(1:nrow(data2),nrow(data2)*3/4,replace = FALSE)
Train <- data2[train,]
Test <- data2[-train,]
results_sim <- frequentist_logistic(Train,Test)
accuracy_list[[i]] <- results_sim$accuracy
auc_list[[i]] <-results_sim$AUC
time_list[[i]] <- results_sim$time_cost
# acpt_rate_list[[i]] <- results_sim$acceptance_rate
pred_prob_df <- cbind(pred_prob_df, results_sim$prob)
}
accuracy_df <- t(data.frame(accuracy = unlist(accuracy_list)))
rownames(accuracy_df) <- "freq_accuracy"
colnames(accuracy_df) <- paste0("iteration_", 1:20)
auc_df <- t(data.frame(AUC = unlist(auc_list)))
rownames(auc_df) <- "freq_AUC"
colnames(auc_df) <- paste0("iteration_", 1:20)
time_df <- t(data.frame(time = unlist(time_list)))
rownames(time_df) <- "freq_time"
colnames(time_df) <- paste0("iteration_", 1:20)
colnames(pred_prob_df) <- paste0("iteration_", 1:20)
write.csv(pred_prob_df,"f_simu_pred_prob_df.csv")
write.csv(time_df,"f_simu_time_df.csv")
write.csv(auc_df,"f_simu_auc_df.csv")
write.csv(accuracy_df,"f_simu_accuracy_df.csv")
# Refit on the full simulation dataset to obtain the frequentist coefficient
# estimates compared in Section 5.1.1.4
data4 <- read.csv('Simulation.csv')
data4 <- data4[,2:11]
colnames(data4) <- c('X1','X2','X3','X4','X5','X6','X7','X8','X9','Classification')
X_train <- model.matrix(Classification ~ .-1,
data = data4)
X_train <- cbind(1,scale(X_train))
y_train <- data4$Classification
colnames(X_train) <- c("Intercept","Age","BMI","Glucose","Insulin","HOMA" ,"Leptin","Adiponectin", "Resistin","MCP.1")
freq_model <- glm(y_train ~ Age + BMI + Glucose + Insulin +
HOMA + Leptin + Adiponectin + Resistin +
MCP.1-1, family = binomial(),data = as.data.frame(X_train))
summary(freq_model)
Call:
glm(formula = y_train ~ Age + BMI + Glucose + Insulin + HOMA +
Leptin + Adiponectin + Resistin + MCP.1 - 1, family = binomial(),
data = as.data.frame(X_train))
Deviance Residuals:
Min 1Q Median 3Q Max
-1.81279 -0.50523 0.04689 0.80822 2.14238
Coefficients:
Estimate Std. Error z value Pr(>|z|)
Age 1.7411 0.4522 3.851 0.000118 ***
BMI 0.7715 0.3285 2.349 0.018844 *
Glucose 1.1508 0.3549 3.243 0.001183 **
Insulin 0.6606 0.3046 2.169 0.030079 *
HOMA 1.5189 0.4339 3.500 0.000465 ***
Leptin 0.6396 0.3147 2.032 0.042137 *
Adiponectin 1.0555 0.3523 2.996 0.002735 **
Resistin 0.9803 0.3428 2.860 0.004235 **
MCP.1 1.2334 0.3856 3.198 0.001382 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 138.629 on 100 degrees of freedom
Residual deviance: 77.965 on 91 degrees of freedom
AIC: 95.965
Number of Fisher Scoring iterations: 6
5.1.1.3 Hierarchical Logistic Regression
hierarchical_model <- function(Train,Test){
stand_range <- preProcess(Train[,-10])
Train_scaled <- cbind(predict(stand_range ,Train[,-10]),Train[,10])
Test_scaled <- cbind(predict(stand_range ,Test[,-10]), Test[,10])
t1 <- Sys.time()
# JAGS model string for the hierarchical logistic regression
model_string <- textConnection("model{
for( i in 1:N) {
xcent[i,1:9] <- x[i,1:9] - mean(x[,1:9])
}
for(i in 1:N){
# likelihood: y[i] ~ Bernoulli(p[i])
y[i] ~ dbern(p[i])
# the logit of p[i] is a linear function of the
# centered log dose x[i]
logit(p[i]) <- alpha.star + sum(xcent[i,] * beta) # Centered the covariate
}
alpha <- alpha.star - sum(t(mean(x[,1:9])) * beta)
for(i in 1:9){
beta[i] ~ dnorm(mu.beta, tau.beta)
}
alpha.star ~ dnorm(mu.alpha, tau.alpha)
mu.beta ~ dnorm(0,0.000001)
sigma.beta ~ dunif(1,10)
tau.beta <- 1/(sigma.beta*sigma.beta)
mu.alpha ~ dnorm(0,0.000001)
sigma.alpha ~ dunif(20,30)
tau.alpha <- 1/(sigma.alpha*sigma.alpha)
}")
# train and MCMC
data3 <- list(
x = as.matrix(Train_scaled)[,-10],
n = rep(1,75),
y = as.vector(Train_scaled[,10]),
N = 75)
inits1 <- list(alpha.star=0, beta=rep(0.1, 9),mu.beta=0,mu.alpha=0,sigma.alpha=23,sigma.beta=3)
inits2 <- list(alpha.star=0, beta=rep(-0.1, 9),mu.beta=0,mu.alpha=0,sigma.alpha=25,sigma.beta=5)
inits3 <- list(alpha.star=0, beta=rep(0.7, 9),mu.beta=0,mu.alpha=0,sigma.alpha=28,sigma.beta=7)
inits <- list(inits1,inits2,inits3)
j.model <- jags.model(file = model_string,
data = data3,
inits = inits,
n.chains = 3) # Set 3 chains
update(j.model, 1000) # 1000 Update
params <- c("alpha", "beta")
jags.out <- coda.samples(j.model, params, n.iter = 50000, n.burnin = 2000)
t2 <- Sys.time()
# Set 50000 iterations
# Calculate the accuracy in test data set
res_all <- summary(jags.out)
para.estimate <- res_all[1]$statistics[1:10,1]
dim(para.estimate) <- c(10,1)
prob <- as.matrix(cbind(1,Test_scaled[,-10]))%*%para.estimate
prob1 <- exp(prob)/(1+exp(prob))
auc <- roc(as.numeric(Test_scaled[,10]), prob1)$auc
accu <- 1-sum(abs(as.numeric(prob1>0.5)-Test_scaled[,10]))/dim(Test_scaled)[1]
return(list(res=jags.out,time=(t2-t1),accu=accu,auc=auc,pred_prob=prob1))
}
# res=hierarchical_model(Train,Test)
# repeated holdout on the simulation data (data4, prepared in Section 5.1.1.2)
time <- c()
auc <- c()
accu <- c()
pred_prob <- c()
for(i in 1:20){
set.seed(i)
train <- sample(1:nrow(data4),nrow(data4)*3/4,replace = FALSE)
Train <- data4[train,]
Test <- data4[-train,]
res = hierarchical_model(Train,Test)
time <- c(time,res$time)
auc <- c(auc,res$auc)
accu <- c(accu,res$accu)
pred_prob <- cbind(pred_prob,res$pred_prob)
}
write.csv(time,"time.csv")
write.csv(auc,"AUC.csv")
write.csv(accu,"Accuracy.csv")
write.csv(pred_prob,"Pred_prob_res.csv")
5.1.1.4 Comparison of the Coefficient Estimates
# HMC
hmc_data <- data.frame(
Parameter = c("(Intercept)", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9"),
Estimate = c(0.6657143, 2.2124113, 1.0828732, 1.4895894, 0.7867817, 1.9797608, 0.8342914, 1.3053750, 1.3605785, 1.6804235),
Q2.5 = c(0.04693995, 1.27424260, 0.38370859, 0.76878863, 0.16638055, 1.05167363, 0.15361816, 0.60937579, 0.61210360, 0.83311135),
Q97.5 = c(1.368385, 3.432530, 1.951974, 2.374605, 1.496524, 3.159774, 1.596650, 2.180394, 2.235148, 2.753156)
)
hmc_data$Estimate <- round(hmc_data$Estimate, 2)
hmc_data$Q2.5 <- round(hmc_data$Q2.5, 2)
hmc_data$Q97.5 <- round(hmc_data$Q97.5, 2)
frequentist_data <- data.frame(
Parameter = c("(Intercept)", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9"),
Estimate = c(0.5556, 1.8834, 0.9241, 1.2582, 0.6471, 1.6877, 0.7123, 1.1182, 1.1606, 1.4240),
Q2.5 = c(0.43,1.69,0.78,1.11,0.53,1.5,0.58,0.98,1.01,1.26),
Q97.5 = c(0.68,2.07,1.06,1.4,0.77,1.87,0.84,1.26,1.31,1.59))
frequentist_data$Estimate <- round(frequentist_data$Estimate, 2)
frequentist_data$Q2.5 <- round(frequentist_data$Q2.5, 2)
frequentist_data$Q97.5 <- round(frequentist_data$Q97.5, 2)
hierarchical_data <- data.frame(
Parameter = c("(Intercept)", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9"),
Estimate = c(0.6178, 2.0428, 1.0352, 1.3992, 0.7773, 1.8139, 0.8406, 1.2486, 1.2931, 1.5632),
Q2.5 = c(-0.01464, 1.17703, 0.35897, 0.69758, 0.17493, 0.97632, 0.20250, 0.57294, 0.59375, 0.77190),
Q97.5 = c(1.313, 3.099, 1.812, 2.218, 1.475, 2.841, 1.556, 2.055, 2.119, 2.491)
)
hierarchical_data$Q2.5 <- round(hierarchical_data$Q2.5, 2)
hierarchical_data$Estimate <- round(hierarchical_data$Estimate, 2)
hierarchical_data$Q97.5 <- round(hierarchical_data$Q97.5, 2)
true_data <- data.frame(
Parameter = c("(Intercept)", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9"),
Estimate = c(0.62,2.17, 1.09, 1.39, 0.84, 1.82, 0.83, 1.31, 1.43, 1.55),
Q2.5 = c(0.62,2.17, 1.09, 1.39, 0.84, 1.82, 0.83, 1.31, 1.43, 1.55),
Q97.5 = c(0.62,2.17, 1.09, 1.39, 0.84, 1.82, 0.83, 1.31, 1.43, 1.55)
)
true_data$Q2.5 <- round(true_data$Q2.5, 2)
true_data$Estimate <- round(true_data$Estimate, 2)
true_data$Q97.5 <- round(true_data$Q97.5, 2)
hmc_data$Method <- "HMC"
frequentist_data$Method <- "Frequentist"
hierarchical_data$Method <- "Hierarchical"
true_data$Method <- "True"
combined_data <- rbind(hmc_data, frequentist_data, hierarchical_data,true_data)
library(dplyr)
library(ggplot2)
combined_data$Parameter <- as.factor(combined_data$Parameter)
ggplot(combined_data, aes(x = Estimate, y = Parameter)) +
geom_point(aes(color = Method), size = 2, position = position_dodge(width = 0.5)) +
geom_errorbarh(aes(color = Method, xmin = Q2.5, xmax = Q97.5), height = 0.2,
position = position_dodge(width = 0.5)) +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
scale_color_manual(values = c("#0072B2", "#D55E00", "#009E73",'red')) +
labs(x = "Estimate", y = "Parameter", color = "Method") +
theme_bw()
The three estimation methods exhibit certain differences in their performance. In the figure, the red color represents the ground truth. The Hierarchical Method demonstrates the highest level of estimation accuracy, with the mean of the parameter estimates produced by this method being closest to the ground truth. The HMC Method follows closely behind, while the MLE Method shows a slightly greater deviation from the ground truth in terms of the estimated mean. However, the MLE Method exhibits the smallest variance among the estimation methods, whereas the HMC Method has the highest variance, leading to less stable results. The Hierarchical Method, on the other hand, shows relatively smaller variance. Therefore, the Hierarchical Method possesses a considerable advantage in this regard.
Comparison of the coefficient estimates from the three methods with the ground truth
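The visual comparison can also be summarized numerically. The sketch below is our own addition: it reuses the combined_data frame built above and reports, for each method, the mean absolute deviation of the point estimates from the true coefficients and the mean width of the 95% interval, a rough proxy for estimation variance.
# Numeric companion to the figure above: mean absolute deviation from the
# true coefficients and mean 95% interval width, by estimation method.
truth <- true_data$Estimate
combined_data %>%
  dplyr::filter(Method != "True") %>%
  dplyr::group_by(Method) %>%
  dplyr::summarise(mean_abs_dev  = mean(abs(Estimate - truth)),
                   mean_ci_width = mean(Q97.5 - Q2.5))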
5.1.2 Naive Bayes
#iterations=20
sim_data <- read.csv("Simulation.csv")
sim_data <- sim_data[,-1]
colnames(sim_data)[10] <- "Classification"
qual_vars <- c("Classification")
sim_data[qual_vars] <- lapply(sim_data[qual_vars], as.factor)
sim_accuracy_list <- list()
sim_auc_list <- list()
sim_time_list <- list()
sim_pred_prob_df_bn <- data.frame(matrix(ncol = 0, nrow = 25))
for (i in c(1:20)) {
set.seed(i)
train <- sample(1:nrow(sim_data),nrow(sim_data)*3/4,replace = FALSE)
Train <- sim_data[train,]
Test <- sim_data[-train,]
stand_range <- preProcess(Train[,-10])
Train_scaled <- cbind(predict(stand_range ,Train[,-10]),Train[,10])
colnames(Train_scaled)[10] <- "Classification"
Test_scaled <- cbind(predict(stand_range ,Test[,-10]), Test[,10])
colnames(Test_scaled)[10] <- "Classification"
nb <- naivebayes(Train_scaled,Test_scaled)   # naivebayes() helper defined in Section 5.2.2
sim_accuracy_list[[i]] <- nb$accuracy
sim_auc_list[[i]] <- nb$AUC
sim_time_list[[i]] <- nb$time_cost
#new_col <- t(nb$prob)
sim_pred_prob_df_bn <- cbind(sim_pred_prob_df_bn, nb$prob)
}
sim_accuracy_df_bn <- t(data.frame(accuracy = unlist(sim_accuracy_list)))
rownames(sim_accuracy_df_bn) <- "nb_accuracy"
colnames(sim_accuracy_df_bn) <- paste0("iteration_", 1:20)
write.csv(sim_accuracy_df_bn,"sim_accuracy_df_bn.csv")
sim_auc_df_bn <- t(data.frame(AUC = unlist(sim_auc_list)))
rownames(sim_auc_df_bn) <- "nb_AUC"
colnames(sim_auc_df_bn) <- paste0("iteration_", 1:20)
write.csv(sim_auc_df_bn,"sim_auc_df_bn.csv")
sim_time_df_bn <- t(data.frame(time = unlist(sim_time_list)))
rownames(sim_time_df_bn) <- "nb_time"
colnames(sim_time_df_bn) <- paste0("iteration_", 1:20)
write.csv(sim_time_df_bn,"sim_time_df_bn.csv")
colnames(sim_pred_prob_df_bn) <- paste0("iteration_", 1:20)
write.csv(sim_pred_prob_df_bn,"sim_pred_prob_df_bn.csv")
5.1.3 Decision Trees
accuracy_list <- list()
auc_list <- list()
time_list <- list()
pred_prob_sim_trees <- data.frame(matrix(ncol = 0, nrow = 25))
sim_data <- read.csv("Simulation.csv")
sim_data <- sim_data[,-1]                    # drop the index column, as in Section 5.1.2
colnames(sim_data)[10] <- "Classification"
for(i in 1:20){
set.seed(i)                                  # seeds 1-20, matching the other models
train_idx <- sample(1:nrow(sim_data), 0.75 * nrow(sim_data))
train_data <- sim_data[train_idx, ]
test_data <- sim_data[-train_idx, ]
stand_range <- preProcess(train_data[,-10])
Train_scaled <- cbind(predict(stand_range ,train_data[,-10]),train_data[,10])
colnames(Train_scaled)[10] <- "Classification"
Test_scaled <- cbind(predict(stand_range ,test_data[,-10]), test_data[,10])
colnames(Test_scaled)[10] <- "Classification"
l1 <- tree(Train_scaled,Test_scaled)         # tree() helper defined in Section 5.2.3
accuracy_list[[i]] <- l1$accu
auc_list[[i]] <- l1$auc
time_list[[i]] <- l1$time
#new_col <- t(nb$prob)
pred_prob_sim_trees <- cbind(pred_prob_sim_trees, l1$pred)
}
accuracy_sim_trees <- t(data.frame(accuracy = unlist(accuracy_list)))
rownames(accuracy_sim_trees) <- "trees_accuracy_sim"
colnames(accuracy_sim_trees) <- paste0("iteration_", 1:20)
write.csv(accuracy_sim_trees,"accuracy_sim_trees.csv")
auc_sim_trees <- t(data.frame(AUC = unlist(auc_list)))
rownames(auc_sim_trees) <- "trees_AUC_sim"
colnames(auc_sim_trees) <- paste0("iteration_", 1:20)
write.csv(auc_sim_trees,"auc_sim_trees.csv")
time_sim_trees <- t(data.frame(time = unlist(time_list)))
rownames(time_sim_trees) <- "trees_time"
colnames(time_sim_trees) <- paste0("iteration_", 1:20)
write.csv(time_sim_trees,"time_sim_trees.csv")
colnames(pred_prob_sim_trees) <- paste0("iteration_", 1:20)
write.csv(pred_prob_sim_trees,"pred_prob_sim_trees.csv")
5.2 Real Data Analysis
5.2.1 Logistic Regression Model
5.2.1.1 HMC-Logistic Regression
hmc_logistic <- function(Train,Test){
t1.lg <- Sys.time()
X_train <- model.matrix(Classification ~ .-1,
data = Train)
X_train <- cbind(1,scale(X_train))
y_train <- Train$Classification
N <- 10000
continuous_ind <- c(rep(TRUE, 10))
eps_vals <- ifelse(continuous_ind, 0.1, 1e-3)
fm2_hmc <- hmc(N, theta.init = rep(1, 10),
epsilon = eps_vals, L = 20,
logPOSTERIOR = logistic_posterior,
glogPOSTERIOR = g_logistic_posterior,
param = list(y = y_train, X = X_train),
varnames = colnames(X_train),
chains = 2, parallel = FALSE)
result <- summary(fm2_hmc, burnin=2000)
param1 <- result[,4][1]
param2 <- result[,4][2]
param3 <- result[,4][3]
param4 <- result[,4][4]
param5 <- result[,4][5]
param6 <- result[,4][6]
param7 <- result[,4][7]
param8 <- result[,4][8]
param9 <- result[,4][9]
param10 <- result[,4][10]
sigmoid <- function(x) {
1 / (1 + exp(-x))
}
data1 <- as.data.frame(scale(Test))   # note: the validation set is standardized with its own statistics here
linear_pred <- param1 + param2 * data1$Age + param3 * data1$BMI + param4 * data1$Glucose +
param5 * data1$Insulin + param6 * data1$HOMA + param7 * data1$Leptin +
param8 * data1$Adiponectin + param9 * data1$Resistin + param10 * data1$MCP.1
probabilities <- sigmoid(linear_pred)
t2.lg <- Sys.time()
pred.class <- as.integer(probabilities > 0.5)
cft <- table(pred.class, Test$Classification)
accu <- (cft[1,1]+cft[2,2])/nrow(Test)
time_cost <- t2.lg - t1.lg
AUC <- roc(probabilities,response = Test$Classification)$auc
acpt_rate <- sum(fm2_hmc$accept/N)/2
t <- summary(fm2_hmc, burnin=3000, probs=c(0.025, 0.5, 0.975))
for (i in t[,4]) {
if (i >= 1.1){
print('Convergence Problem')
}
}
return(list(prob = probabilities,accuracy = accu,AUC = AUC,acceptance_rate = acpt_rate,time_cost = time_cost))
}
data <- read.csv('dataR2.csv')
data$Classification <- ifelse(data$Classification == "1",0,1)   # recode: 1 (healthy controls) -> 0, 2 (patients) -> 1
accuracy_list <- list()
auc_list <- list()
time_list <- list()
acpt_rate_list <- list()
pred_prob_df <- data.frame(matrix(ncol = 0, nrow = 29))
for (i in c(1:20)) {
set.seed(i)
train <- sample(1:nrow(data),nrow(data)*3/4,replace = FALSE)
Train <- data[train,]
Test <- data[-train,]
results_sim <- hmc_logistic(Train,Test)
accuracy_list[[i]] <- results_sim$accuracy
auc_list[[i]] <-results_sim$AUC
time_list[[i]] <- results_sim$time_cost
acpt_rate_list[[i]] <- results_sim$acceptance_rate
pred_prob_df <- cbind(pred_prob_df, results_sim$prob)
}
accuracy_df <- t(data.frame(accuracy = unlist(accuracy_list)))
rownames(accuracy_df) <- "hmc_accuracy"
colnames(accuracy_df) <- paste0("iteration_", 1:20)
auc_df <- t(data.frame(AUC = unlist(auc_list)))
rownames(auc_df) <- "hmc_AUC"
colnames(auc_df) <- paste0("iteration_", 1:20)
time_df <- t(data.frame(time = unlist(time_list)))
rownames(time_df) <- "hmc_time"
colnames(time_df) <- paste0("iteration_", 1:20)
acpt_rate_df <- t(data.frame(acpt_rate = unlist(acpt_rate_list)))
rownames(acpt_rate_df) <- "hmc_acpt_rate"
colnames(acpt_rate_df) <- paste0("iteration_", 1:20)
colnames(pred_prob_df) <- paste0("iteration_", 1:20)
write.csv(pred_prob_df,"true_pred_prob_df.csv")
write.csv(acpt_rate_df,"true_acpt_rate_df.csv")
write.csv(time_df,"true_time_df.csv")
write.csv(auc_df,"true_auc_df.csv")
write.csv(accuracy_df,"true_accuracy_df.csv")
Trace plots provide a visual indication of stationarity. These plots indicate that the MCMC chains are reasonably stationary.
Trace Plot of HMC
Histograms of the posterior distributions show that the posteriors of certain coefficients, such as that of HOMA, are not normally distributed.
Density plot of HMC
The posterior quantiles are summarized after removing an initial burn-in period. The \(\hat R\) statistics provide an indication of convergence: values close to one indicate that the multiple MCMC chains converged to the same distribution, while values above 1.1 indicate possible convergence problems. All of the \(\hat R\) statistics in our study are close to one.
5.2.1.2 Frequentist-Logistic Regression
data <- read.csv('dataR2.csv')
data$Classification <- ifelse(data$Classification == "1",0,1)   # recode: 1 (healthy controls) -> 0, 2 (patients) -> 1
accuracy_list <- list()
auc_list <- list()
time_list <- list()
pred_prob_df <- data.frame(matrix(ncol = 0, nrow = 29))
for (i in c(1:20)) {
set.seed(i)
train <- sample(1:nrow(data),nrow(data)*3/4,replace = FALSE)
Train <- data[train,]
Test <- data[-train,]
results_sim <- frequentist_logistic(Train,Test)
accuracy_list[[i]] <- results_sim$accuracy
auc_list[[i]] <-results_sim$AUC
time_list[[i]] <- results_sim$time_cost
pred_prob_df <- cbind(pred_prob_df, results_sim$prob)
}
accuracy_df <- t(data.frame(accuracy = unlist(accuracy_list)))
rownames(accuracy_df) <- "freq_accuracy"
colnames(accuracy_df) <- paste0("iteration_", 1:20)
auc_df <- t(data.frame(AUC = unlist(auc_list)))
rownames(auc_df) <- "freq_AUC"
colnames(auc_df) <- paste0("iteration_", 1:20)
time_df <- t(data.frame(time = unlist(time_list)))
rownames(time_df) <- "freq_time"
colnames(time_df) <- paste0("iteration_", 1:20)
colnames(pred_prob_df) <- paste0("iteration_", 1:20)
write.csv(pred_prob_df,"f_true_pred_prob_df.csv")
write.csv(time_df,"f_true_time_df.csv")
write.csv(auc_df,"f_true_auc_df.csv")
write.csv(accuracy_df,"f_true_accuracy_df.csv")
5.2.1.3 Hierarchical Logistic Regression
hierarchical_model <- function(Train,Test){
stand_range <- preProcess(Train[,-10])
Train_scaled <- cbind(predict(stand_range ,Train[,-10]),Train[,10]-1)   # recode classes 1/2 to 0/1
Test_scaled <- cbind(predict(stand_range ,Test[,-10]), Test[,10]-1)
t1 <- Sys.time()
# JAGS model string for the hierarchical logistic regression
model_string <- textConnection("model{
for( i in 1:N) {
xcent[i,1:9] <- x[i,1:9] - mean(x[,1:9])
}
for(i in 1:N){
# likelihood: y[i] ~ Bernoulli(p[i])
y[i] ~ dbern(p[i])
# the logit of p[i] is a linear function of the
# centered log dose x[i]
logit(p[i]) <- alpha.star + sum(xcent[i,] * beta) # Centered the covariate
}
alpha <- alpha.star - sum(t(mean(x[,1:9])) * beta)
for(i in 1:9){
beta[i] ~ dnorm(mu.beta, tau.beta)
}
alpha.star ~ dnorm(mu.alpha, tau.alpha)
mu.beta ~ dnorm(0,0.000001)
sigma.beta ~ dunif(1,10)
tau.beta <- 1/(sigma.beta*sigma.beta)
mu.alpha ~ dnorm(0,0.000001)
sigma.alpha ~ dunif(20,30)
tau.alpha <- 1/(sigma.alpha*sigma.alpha)
}")
# train and MCMC
data3 <- list(
x = as.matrix(Train_scaled)[,-10],
n = rep(1,dim(Train_scaled)[1]),
y = as.vector(Train_scaled[,10]),
N = dim(Train_scaled)[1])
inits1 <- list(alpha.star=0, beta=rep(0.1, 9),mu.beta=0,mu.alpha=0,sigma.alpha=23,sigma.beta=3)
inits2 <- list(alpha.star=0, beta=rep(-0.1, 9),mu.beta=0,mu.alpha=0,sigma.alpha=25,sigma.beta=5)
inits3 <- list(alpha.star=0, beta=rep(0.7, 9),mu.beta=0,mu.alpha=0,sigma.alpha=28,sigma.beta=7)
inits <- list(inits1,inits2,inits3)
j.model <- jags.model(file = model_string,
data = data3,
inits = inits,
n.chains = 3) # Set 3 chains
update(j.model, 1000) # 1000 Update
params <- c("alpha", "beta")
jags.out <- coda.samples(j.model, params, n.iter = 50000, n.burnin = 2000)
t2 <- Sys.time()
# Set 50000 iterations
# Calculate the accuracy in test data set
res_all <- summary(jags.out)
para.estimate <- res_all[1]$statistics[1:10,1]
dim(para.estimate) <- c(10,1)
prob <- as.matrix(cbind(1,Test_scaled[,-10]))%*%para.estimate
prob1 <- exp(prob)/(1+exp(prob))
auc <- roc(as.numeric(Test_scaled[,10]), prob1)$auc
accu <- 1-sum(abs(as.numeric(prob1>0.5)-Test_scaled[,10]))/dim(Test_scaled)[1]
return(list(res=jags.out,time=(t2-t1),accu=accu,auc=auc,pred_prob=prob1))
}
# res=hierarchical_model(Train,Test)
time_true <- c()
auc_true <- c()
accu_true <- c()
pred_prob_true=c()
for(i in 1:20){
set.seed(i)
train <- sample(1:nrow(data1),nrow(data1)*3/4,replace = FALSE)
Train <- apply(data1[train,],2,as.numeric)
Test <- apply(data1[-train,],2,as.numeric)
res = hierarchical_model(Train,Test)
time_true <- c(time_true,res$time)
auc_true <- c(auc_true,res$auc)
accu_true <- c(accu_true,res$accu)
pred_prob_true <- cbind(pred_prob_true,res$pred_prob)
cat("This is iteration",i)
}
write.csv(time_true,"time_true.csv")
write.csv(auc_true,"AUC_true.csv")
write.csv(accu_true,"Accuracy_true.csv")
write.csv(pred_prob_true,"Pred_prob_res_true.csv")
Trace plots provide a visual indication of stationarity. These plots indicate that the MCMC chains are reasonably stationary.
Trace plot of Hierarchical Method
Density Plot of Hierarchical Method
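The figures above can be reproduced directly from the coda output of the hierarchical model. A minimal sketch, assuming res still holds the result of hierarchical_model() from the last holdout iteration, is given below; plot() and gelman.diag() are standard coda functions, attached together with rjags.
# res$res is the mcmc.list returned by coda.samples() inside hierarchical_model();
# plot() on an mcmc.list draws a trace plot and a density plot for each parameter.
plot(res$res)
# Gelman-Rubin diagnostic as an additional convergence check across the 3 chains
gelman.diag(res$res)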
5.2.2 Naive Bayes
# Naive Bayes helper: fits e1071::naiveBayes on the training set and returns
# predicted probabilities, accuracy, AUC, and the time cost
naivebayes <- function(train_data,test_data){
# train the model and time the fit and prediction steps
t1.lg <- Sys.time()
nb_model <- naiveBayes(Classification ~ ., data = train_data)
# predict
predicted_probs <- predict(nb_model, newdata = test_data, type = "raw")
predicted_class <- predict(nb_model, newdata = test_data, type = "class")
t2.lg <- Sys.time()
time_cost <- t2.lg - t1.lg
# calculate auc
roc_obj <- roc(test_data$Classification, predicted_probs[, "1"])
auc <- auc(roc_obj)
# calculate accuracy
cft <- table(predicted_class,test_data$Classification)
accu <- (cft[1,1]+cft[2,2])/nrow(test_data)
return(list(prob = predicted_probs[,2] , accuracy = accu , AUC = auc , time_cost = time_cost , summary = nb_model))
}
#iterations=20
data <- read.csv("dataR2.csv")
qual_vars <- c("Classification")
data[qual_vars] <- lapply(data[qual_vars], as.factor)
accuracy_list <- list()
auc_list <- list()
time_list <- list()
pred_prob_df_bn <- data.frame(matrix(ncol = 0, nrow = 29))
for (i in c(1:20)) {
set.seed(i)
train <- sample(1:nrow(data),nrow(data)*3/4,replace = FALSE)
Train <- data[train,]
Test <- data[-train,]
stand_range <- preProcess(Train[,-10])
Train_scaled <- cbind(predict(stand_range ,Train[,-10]),Train[,10])
colnames(Train_scaled)[10] <- "Classification"
Test_scaled <- cbind(predict(stand_range ,Test[,-10]), Test[,10])
colnames(Test_scaled)[10] <- "Classification"
nb <- naivebayes(Train_scaled,Test_scaled)
accuracy_list[[i]] <- nb$accuracy
auc_list[[i]] <- nb$AUC
time_list[[i]] <- nb$time_cost
#new_col <- t(nb$prob)
pred_prob_df_bn <- cbind(pred_prob_df_bn, nb$prob)
}
accuracy_df_bn <- t(data.frame(accuracy = unlist(accuracy_list)))
rownames(accuracy_df_bn) <- "nb_accuracy"
colnames(accuracy_df_bn) <- paste0("iteration_", 1:20)
write.csv(accuracy_df_bn,"accuracy_df_bn.csv")
auc_df_bn <- t(data.frame(AUC = unlist(auc_list)))
rownames(auc_df_bn) <- "nb_AUC"
colnames(auc_df_bn) <- paste0("iteration_", 1:20)
write.csv(auc_df_bn,"auc_df_bn.csv")
time_df_bn <- t(data.frame(time = unlist(time_list)))
rownames(time_df_bn) <- "nb_time"
colnames(time_df_bn) <- paste0("iteration_", 1:20)
write.csv(time_df_bn,"time_df_bn.csv")
colnames(pred_prob_df_bn) <- paste0("iteration_", 1:20)
write.csv(pred_prob_df_bn,"pred_prob_df_bn.csv")
5.2.3 Decision Trees
# Decision tree helper: fits an rpart classification tree on the training set
# passed in and returns predicted probabilities, AUC, accuracy, and time cost
tree <- function(train, test){
a <- Sys.time()
model <- rpart(Classification ~ ., data = train, method = "class")
res <- summary(model)
# predicted probability of the second (positive) class level
pred <- predict(model, newdata = test, type = "prob")
pred_1 <- as.vector(pred[, 2])
auc <- roc(response = test$Classification, predictor = pred_1)$auc
# accuracy at the 0.5 threshold, mapping predictions back to the class labels
pred_class <- colnames(pred)[(pred_1 > 0.5) + 1]
accu <- mean(pred_class == as.character(test$Classification))
b <- Sys.time()
return(list(res = res, auc = auc, accu = accu, pred = pred_1, time = (b - a), model = model))
}
accuracy_list <- list()
auc_list <- list()
time_list <- list()
pred_prob_df_trees <- data.frame(matrix(ncol = 0, nrow = 29))
for(i in 1:20){
set.seed(i)                                  # seeds 1-20, matching the other models
train_idx <- sample(1:nrow(data1), 0.75 * nrow(data1))
train_data <- data1[train_idx, ]
test_data <- data1[-train_idx, ]
stand_range <- preProcess(train_data[,-10])
Train_scaled <- cbind(predict(stand_range ,train_data[,-10]),train_data[,10])
colnames(Train_scaled)[10] <- "Classification"
Test_scaled <- cbind(predict(stand_range ,test_data[,-10]), test_data[,10])
colnames(Test_scaled)[10] <- "Classification"
l1=tree(Train_scaled,Test_scaled)
accuracy_list[[i]] <- l1$accu
auc_list[[i]] <- l1$auc
time_list[[i]] <- l1$time
#new_col <- t(nb$prob)
pred_prob_df_trees <- cbind(pred_prob_df_trees, l1$pred)
}
accuracy_df_trees <- t(data.frame(accuracy = unlist(accuracy_list)))
rownames(accuracy_df_trees) <- "trees_accuracy"
colnames(accuracy_df_trees) <- paste0("iteration_", 1:20)
write.csv(accuracy_df_trees,"accuracy_df_trees.csv")
auc_df_trees <- t(data.frame(AUC = unlist(auc_list)))
rownames(auc_df_trees) <- "trees_AUC"
colnames(auc_df_trees) <- paste0("iteration_", 1:20)
write.csv(auc_df_trees,"auc_df_trees.csv")
time_df_trees <- t(data.frame(time = unlist(time_list)))
rownames(time_df_trees) <- "trees_time"
colnames(time_df_trees) <- paste0("iteration_", 1:20)
write.csv(time_df_trees,"time_df_trees.csv")
colnames(pred_prob_df_trees) <- paste0("iteration_", 1:20)
write.csv(pred_prob_df_trees,"pred_prob_df_trees.csv")
5.3 Results
To identify the top-performing models, we run 20 repeated-holdout iterations on both the simulation data and the real data, recording the accuracy, AUC, and runtime of each model. We visualize the results with box plots, which allows a more intuitive comparison.
From the simulation results, we observe that Hierarchical Logistic Regression consistently delivers the best predictive performance. However, because its parameters are estimated by MCMC simulation, its runtime is considerably longer than that of the other models. Naive Bayes achieves high accuracy and a fast runtime, but its AUC is lower than that of the three logistic regression models. This discrepancy may be attributed to the Naive Bayes assumption that all features are independent, which does not always hold; when features are correlated, Naive Bayes cannot capture those relationships, which can lower its AUC.
On the real data, Hierarchical Logistic Regression again exhibits the best predictive performance, although, as with the simulation data, its runtime is considerably longer because of the MCMC estimation. Considering accuracy, AUC, and runtime together, Frequentist Logistic Regression also performs favorably.
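The independence assumption mentioned above can be probed directly by inspecting the pairwise correlations among the predictors of the real dataset. The sketch below is our own addition; the 0.5 threshold is arbitrary and only serves to flag strongly related feature pairs.
# Check the Naive Bayes independence assumption on the real dataset:
# list predictor pairs with absolute Pearson correlation above 0.5.
real_data <- read.csv('dataR2.csv')
cor_mat <- cor(real_data[, 1:9])
high_cor <- which(abs(cor_mat) > 0.5 & upper.tri(cor_mat), arr.ind = TRUE)
data.frame(var1 = rownames(cor_mat)[high_cor[, 1]],
           var2 = colnames(cor_mat)[high_cor[, 2]],
           correlation = round(cor_mat[high_cor], 2))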
The per-model results saved above are first collected into summary tables with one column per model and one row per iteration; the code for generating the box plots is as follows:
# combined results: real data
accuracy_true <- read.csv("accuracy_true.csv")
auc_true <- read.csv("auc_true.csv")
time_true <- read.csv("time_true.csv")
# combined results: simulation data
accuracy_sim <- read.csv("accuracy_sim.csv")
auc_sim <- read.csv("auc_sim.csv")
time_sim <- read.csv("time_sim.csv")
# real data
accuracy_long <- tidyr::gather(accuracy_true, key = "Model", value = "Value")
ggplot(accuracy_long, aes(x = Model, y = Value)) +
geom_boxplot() +
scale_color_lancet() + scale_fill_lancet() +
labs(x = "Model", y = "Value") +
ggtitle("Accuracy of Real Data")
auc_long <- tidyr::gather(auc_true, key = "Model", value = "Value")
ggplot(auc_long, aes(x = Model, y = Value)) +
geom_boxplot() +
scale_color_lancet() + scale_fill_lancet() +
labs(x = "Model", y = "Value") +
ggtitle("AUC of Real Data")
time_long <- tidyr::gather(time_true, key = "Model", value = "Value")
ggplot(time_long, aes(x = Model, y = Value)) +
geom_boxplot() +
scale_color_lancet() + scale_fill_lancet() +
labs(x = "Model", y = "Value") +
ggtitle("Time of Real Data")
accuracy_long_sim <- tidyr::gather(accuracy_sim, key = "Model", value = "Value")
ggplot(accuracy_long_sim, aes(x = Model, y = Value)) +
geom_boxplot() +
scale_color_lancet() + scale_fill_lancet() +
labs(x = "Model", y = "Value") +
ggtitle("Accuracy of Simulation Data")
auc_long_sim <- tidyr::gather(auc_sim, key = "Model", value = "Value")
ggplot(auc_long_sim, aes(x = Model, y = Value)) +
geom_boxplot() +
scale_color_lancet() + scale_fill_lancet() +
labs(x = "Model", y = "Value") +
ggtitle("AUC of Simulation Data")
time_long_sim <- tidyr::gather(time_sim, key = "Model", value = "Value")
ggplot(time_long_sim, aes(x = Model, y = Value)) +
geom_boxplot() +
scale_color_lancet() + scale_fill_lancet() +
labs(x = "Model", y = "Value") +
ggtitle("Time of Simulation Data")