## Define the data generation function
# input: M, pi00,pi01,pi10,pi11, alpha1,alpha2
# output: P1,P2, Z00,Z01,Z10,Z11
data_generate1 <- function(M,pi00,pi01,pi10,pi11,alpha1,alpha2){
  P1 <- numeric(M)
  P2 <- numeric(M)
  Z00 <- numeric(M)
  Z01 <- numeric(M)
  Z10 <- numeric(M)
  Z11 <- numeric(M)
  C <- c(1:M)
  index00 <- sample(C,M*pi00)
  C1 <- C[-index00]
  index01 <- sample(C1,M*pi01)
  real_index01 <-c()
  k=1
  for (i in index01){
    real_index01[k]=which(C1==i)
    k=k+1
  }
  C2 <- C1[-real_index01]
  index10 <- sample(C2,M*pi10)
  real_index10 <-c()
  k=1
  for (i in index10){
    real_index10[k]=which(C2==i)
    k=k+1
  }
  C3 <- C2[-real_index10]
  index11 <- sample(C3,M*pi11)
  Z00[index00] <- 1
  Z00[-index00] <- 0
  Z01[index01] <- 1
  Z01[-index01] <- 0
  Z10[index10] <- 1
  Z10[-index10] <- 0
  Z11[index11] <- 1
  Z11[-index11] <- 0
  P1[index00] <- runif(M*pi00)
  P1[index01] <- runif(M*pi01)
  P1[index10] <- rbeta(M*pi10,alpha1,1)
  P1[index11] <- rbeta(M*pi11,alpha1,1)
  P2[index00] <- runif(M*pi00)
  P2[index01] <- rbeta(M*pi01,alpha2,1)
  P2[index10] <- runif(M*pi10)
  P2[index11] <- rbeta(M*pi11,alpha2,1)
  return(list(P1=P1,P2=P2,Z00=Z00,Z01=Z01,Z10=Z10,Z11=Z11))
}
## Define the function that computes the log-likelihood
# input: P1,P2,pi00,pi01,pi10,pi11,alpha1,alpha2
log_p1 <- function(P1,P2,pi00,pi01,pi10,pi11,alpha1,alpha2){
  return(sum(log(pi00+pi01*alpha2*P2^(alpha2-1)+pi10*alpha1*P1^(alpha1-1)+pi11*alpha1*alpha2*P2^(alpha2-1)*P1^(alpha1-1))))
}
## iteration
# max_iter=10000
# Set the maximum number of iterations to max_iter=10000
## In each iteration, E steps and M steps are carried out to calculate the log-likelihood and judge whether the algorithm converges
EM1 <- function(P1,P2,pi00_ini, pi01_ini,pi10_ini,pi11_ini,alpha1_ini,alpha2_ini, max_iter=1000){
  L_ini <- log_p1(P1,P2,pi00_ini, pi01_ini,pi10_ini,pi11_ini,alpha1_ini,alpha2_ini)
  for (iter in 1:max_iter){
  if(iter==1){
    pi00_old <- pi00_ini
    pi01_old <- pi01_ini
    pi10_old <- pi10_ini
    pi11_old <- pi11_ini
    alpha1_old <- alpha1_ini
    alpha2_old <- alpha2_ini
    L_old <- L_ini
  }
  ## E step
  gamma <- pi00_old+pi01_old*alpha2_old*P2^(alpha2_old-1)+alpha1_old*P1^(alpha1_old-1)*pi10_old+pi11_old*alpha1_old*alpha2_old*P1^(alpha1_old-1)*P2^(alpha2_old-1)
  gamma_z00 <- pi00_old/gamma
  gamma_z01 <- pi01_old*alpha2_old*P2^(alpha2_old-1)/gamma
  gamma_z10 <- pi10_old*alpha1_old*P1^(alpha1_old-1)/gamma
  gamma_z11 <- pi11_old*alpha1_old*alpha2_old*P1^(alpha1_old-1)*P2^(alpha2_old-1)/gamma
  ## M step
  pi00_new <- mean(gamma_z00)
  pi01_new <- mean(gamma_z01)
  pi10_new <- mean(gamma_z10)
  pi11_new <- mean(gamma_z11)
  alpha1_new <- -sum(gamma_z11+gamma_z10)/sum(gamma_z11*log(P1)+gamma_z10*log(P1))
  alpha2_new <- -sum(gamma_z11+gamma_z01)/sum(gamma_z11*log(P2)+gamma_z01*log(P2))
  ## compute the log-likelihood
  L_new <- log_p1(P1,P2,pi00_new,pi01_new,pi10_new,pi11_new,alpha1_new,alpha2_new)
  ## whether the algorithm accepts or not
  if(L_new<L_old){
    print("Error: log likelihood is not increasing!")
    break
  }
  if((L_new-L_old)/abs(L_new)<1e-5){
    pi00_est <- pi00_new
    pi01_est <- pi01_new
    pi10_est <- pi10_new
    pi11_est <- pi11_new
    alpha1_est <- alpha1_new
    alpha2_est <- alpha2_new
    break
  }else{
    pi00_old <- pi00_new
    pi01_old <- pi01_new
    pi10_old <- pi10_new
    pi11_old <- pi11_new
    alpha1_old <- alpha1_new
    alpha2_old <- alpha2_new
    L_old <- L_new
  }
  }
  return(list(pi00 = pi00_new, pi01 = pi01_new, pi10 = pi10_new, pi11 = pi11_new ,alpha1 = alpha1_new, alpha2 = alpha2_new))
}
# set random seed
set.seed(1)
# generate simulation data
M <- 100000
data <- data_generate1(M,0.7,0.1,0.15,0.05,0.2,0.2)
# Parameter estimates were obtained using the EM algorithm to check for agreement with the previous results
theta_est <- EM1(data$P1,data$P2,pi00_ini=0.6,pi01_ini=0.05,pi10_ini=0.15,pi11_ini=0.01,alpha1_ini=0.2,alpha2_ini=0.1)
print(theta_est)
## $pi00
## [1] 0.6986258
## 
## $pi01
## [1] 0.0995448
## 
## $pi10
## [1] 0.1532531
## 
## $pi11
## [1] 0.0485763
## 
## $alpha1
## [1] 0.2014528
## 
## $alpha2
## [1] 0.1994058
# The estimated value is close to the true value.
# Calculate the posterior probability
gamma_post <- theta_est$pi00+theta_est$pi01*theta_est$alpha2*data$P2^(theta_est$alpha2-1)+theta_est$alpha1*data$P1^(theta_est$alpha1-1)*theta_est$pi10+theta_est$pi11*theta_est$alpha1*theta_est$alpha2*data$P1^(theta_est$alpha1-1)*data$P2^(theta_est$alpha2-1)
z11_post <- theta_est$pi11*theta_est$alpha1*theta_est$alpha2*data$P1^(theta_est$alpha1-1)*data$P2^(theta_est$alpha2-1)/gamma_post
z01_post <- theta_est$pi01*theta_est$alpha2*data$P2^(theta_est$alpha2-1)/gamma_post
z10_post <- theta_est$alpha1*data$P1^(theta_est$alpha1-1)*theta_est$pi10/gamma_post
z00_post <- theta_est$pi00/gamma_post
z1_post <- z11_post+z10_post
z2_post <- z11_post+z01_post
# Given a level alpha that controls the FDR, use the assoc function to identify SNPS associated with the disease based on the posterior probability of the latent variable.
# The inputs are posterior and alpha.
# posterior is an m-dimensional vector, where each element represents the posterior probability of the latent variable z_i=1 corresponding to the ith SNP
# alpha is the level that controls FDR
# The output of the  function is Z_est, an m-dimensional vector, where each element represents whether the ith SNP is identified as related to the current disease, with 1 representing related and 0 representing unrelated
assoc <- function(posterior, alpha){
  M          <- length(posterior)
  fdr        <- 1 - posterior
  rank.fdr   <- rank(fdr)
  sort.fdr   <- sort(fdr)
  cumsum.fdr <- cumsum(sort.fdr)
  sort.FDR   <- cumsum.fdr/seq(1, M, 1)
  FDR        <- sort.FDR[rank.fdr]
  
  Z_est <- rep(0, M)
  Z_est[which(FDR <= alpha)] <- 1
  
  return(Z_est)
}

# Use assoc function to identify SNPs associated with disease on simulated data (controlling FDR at level 0.1) i.e., input the posterior probability of z=1 calculated by parameter-based EM estimation and use assoc function to obtain Z_est

Z_est_11 <- assoc(z11_post, 0.1)
sum(Z_est_11)
## [1] 817
## Calculate FDP and power
# Compare the real Z with the Z_est obtained in the previous step, by using table(Z_est, Z)
# V is the number of (Z_est=1, Z=0) in the table, S is the number of (Z_est=1, Z=1), R is the number of (Z_est=1), M0 is the number of (Z=0), and M-M0 is the number of (Z=1)
# Calculate FDP using V/R and power using S/(M-M0)
# FDP should be around 0.1, otherwise the code is broken

t<-table(Z_est_11, data$Z11)
t
##         
## Z_est_11     0     1
##        0 94932  4251
##        1    68   749
FDP <- t[2, 1]/(t[2, 1] + t[2, 2])
FDP
## [1] 0.08323133
power<-t[2,2]/(t[1,2]+t[2,2])
power
## [1] 0.1498
#Related to disease 1
Z_est_1 <- assoc(z1_post, 0.1)
sum(Z_est_1)
## [1] 9159
t<-table(Z_est_1, data$Z11+data$Z10)
t
##        
## Z_est_1     0     1
##       0 79057 11784
##       1   943  8216
FDP <- t[2, 1]/(t[2, 1] + t[2, 2])
FDP
## [1] 0.1029588
power<-t[2,2]/(t[1,2]+t[2,2])
power
## [1] 0.4108
#Related to disease 2
Z_est_2 <- assoc(z2_post, 0.1)
sum(Z_est_2)
## [1] 6256
t<-table(Z_est_2, data$Z11+data$Z01)
t
##        
## Z_est_2     0     1
##       0 84395  9349
##       1   605  5651
FDP <- t[2, 1]/(t[2, 1] + t[2, 2])
FDP
## [1] 0.09670716
power<-t[2,2]/(t[1,2]+t[2,2])
power
## [1] 0.3767333
# Repeat 20 times and record pi00_est,pi01_est,pi10_est,pi11_est,alpha1_est, alpha2_est, FDP, power and plot boxplot
rep <- 20
pi00_est <- pi01_est <-pi10_est <- pi11_est <- alpha1_est <- alpha2_est <- Z11_FDP <- Z1_FDP <- Z2_FDP <- Z11_power <- Z1_power <- Z2_power <- numeric(rep)
for (i in 1:rep) {
  set.seed(i)
  data <- data_generate1(M,0.7,0.1,0.15,0.05,0.2,0.2)
  theta_est <- EM1(data$P1,data$P2,pi00_ini=0.6,pi01_ini=0.05,pi10_ini=0.15,pi11_ini=0.01,alpha1_ini=0.2,alpha2_ini=0.1)
  pi00_est[i] <- theta_est$pi00
  pi01_est[i] <- theta_est$pi01
  pi10_est[i] <- theta_est$pi10
  pi11_est[i] <- theta_est$pi11
  alpha1_est[i] <- theta_est$alpha1
  alpha2_est[i] <- theta_est$alpha2
  gamma_post <- theta_est$pi00+theta_est$pi01*theta_est$alpha2*data$P2^(theta_est$alpha2-1)+theta_est$alpha1*data$P1^(theta_est$alpha1-1)*theta_est$pi10+theta_est$pi11*theta_est$alpha1*theta_est$alpha2*data$P1^(theta_est$alpha1-1)*data$P2^(theta_est$alpha2-1)
  z11_post <- theta_est$pi11*theta_est$alpha1*theta_est$alpha2*data$P1^(theta_est$alpha1-1)*data$P2^(theta_est$alpha2-1)/gamma_post
  z01_post <- theta_est$pi01*theta_est$alpha2*data$P2^(theta_est$alpha2-1)/gamma_post
  z10_post <- theta_est$alpha1*data$P1^(theta_est$alpha1-1)*theta_est$pi10/gamma_post
  z00_post <- theta_est$pi00/gamma_post
  z1_post <- z11_post+z10_post
  z2_post <- z11_post+z01_post
  Z_est_11 <- assoc(z11_post, 0.1)
  t11<-table(Z_est_11, data$Z11)
  Z11_FDP[i] <- t11[2, 1]/(t11[2, 1] + t11[2, 2])
  Z11_power[i] <-t11[2,2]/(t11[1,2]+t11[2,2])
  Z_est_1 <- assoc(z1_post, 0.1)
  t1<-table(Z_est_1, data$Z11+data$Z10)
  Z1_FDP[i] <- t1[2, 1]/(t1[2, 1] + t1[2, 2])
  Z1_power[i] <-t1[2,2]/(t1[1,2]+t1[2,2])
  Z_est_2 <- assoc(z2_post, 0.1)
  t2<-table(Z_est_2, data$Z11+data$Z01)
  Z2_FDP[i] <- t2[2, 1]/(t2[2, 1] + t2[2, 2])
  Z2_power[i] <-t2[2,2]/(t2[1,2]+t2[2,2])
}
boxplot(pi00_est,ylim=c(0.693,0.706))
title("pi00_est")
abline(h=0.7,col="red")

boxplot(pi01_est)
title("pi01_est")
abline(h=0.1,col="red")

boxplot(pi10_est)
title("pi10_est")
abline(h=0.15,col="red")

boxplot(pi11_est)
title("pi11_est")
abline(h=0.05,col="red")

boxplot(alpha1_est)
title("alpha1_est")
abline(h=0.2,col="red")

boxplot(alpha2_est)
title("alpha2_est")
abline(h=0.2,col="red")

boxplot(Z11_FDP)
title("Z11_FDP")
abline(h=0.1,col="red")

boxplot(Z1_FDP)
title("Z1_FDP")
abline(h=0.1,col="red")

boxplot(Z2_FDP)
title("Z2_FDP")
abline(h=0.1,col="red")

boxplot(Z11_power)
title("Z11_power")

boxplot(Z1_power)
title("Z1_power")

boxplot(Z2_power)
title("Z2_power")

## Define the function that calculates the log-likelihood
# input: p and pi1, alpha

log_X <- function(P, pi1, alpha){
  return(sum(log(pi1*alpha*(P^(alpha-1))+1-pi1)))
}

## Define the function that implements the EM algorithm
# The EM function takes P, pi1_ini, alpha_ini, max_iter, tol and outputs pi1_est, alpha_est
EM <- function(P, pi1_ini, alpha_ini, max_iter=10000, tol=1e-6){

  L_ini <- log_X(P, pi1_ini, alpha_ini) # 初始对数似然

## Iteration
# Set maximum number of iterations max_iter=10000
## Perform E and M steps in each iteration, compute the log-likelihood, and see if the algorithm converges
  max_iter <- 10000 # Set the maximum number of iterations to prevent non-convergence
  for (iter in 1:max_iter){
    if (iter == 1){
      pi1_old <- pi1_ini
      alpha_old <- alpha_ini
      L_old <- L_ini
    }
## E step
    comp_gamma <- pi1_old*dbeta(P, alpha_old, 1)
    gamma <- comp_gamma/(comp_gamma + (1 - pi1_old)*dunif(P, 0, 1))
## M step
    pi1_new <- mean(gamma)
    alpha_new <- -sum(gamma)/sum(gamma*log(P))
## compute log likelihood
    L_new <- log_X(P, pi1_new, alpha_new)
## whether the algorithm converges
    if (L_new < L_old){
      print("Error: log likelihoood is not increasing!")
      break
    }
    if ((L_new - L_old)/abs(L_new) < tol){
      pi1_est <- pi1_new
      alpha_est <- alpha_new
      break
    } 
    else {
      pi1_old <- pi1_new
      alpha_old <- alpha_new
      L_old <- L_new
    }
  }
   return(list(pi1 = pi1_new, alpha = alpha_new))
}

rep <- 20
FDP1 <- numeric(rep)
power1 <- numeric(rep)
FDP2 <- numeric(rep)
power2 <- numeric(rep)
pi1_est <- numeric(rep)
pi2_est <- numeric(rep)
alpha1_est <- numeric(rep)
alpha2_est <- numeric(rep)

rep=20
for (i in 1:rep){
 set.seed(i)
 data <- data_generate1(M,0.7,0.1,0.15,0.05,0.2,0.2)
 est1 <- EM(data$P1, 0.1, 0.1)
 est2 <- EM(data$P2, 0.1, 0.1)
 
 pi1_est[i] <- est1$pi1
 alpha1_est[i] <- est1$alpha
 pi2_est[i] <- est2$pi1
 alpha2_est[i] <- est2$alpha
 
 posterior1 <- (est1$pi1*est1$alpha*data$P1^(est1$alpha - 1))/(est1$pi1*est1$alpha*data$P1^(est1$alpha - 1) + 1 - est1$pi1)
 posterior2 <- (est2$pi1*est2$alpha*data$P2^(est2$alpha - 1))/(est2$pi1*est2$alpha*data$P2^(est2$alpha - 1) + 1 - est2$pi1)
 
 Z_est1 <- assoc(posterior1, 0.1)
 Z_est2 <- assoc(posterior2, 0.1)
 t1 <- table(Z_est1, data$Z11+data$Z10)
 t2 <- table(Z_est2, data$Z11+data$Z01)
 
 FDP1[i] <- t1[2, 1]/(t1[2, 1] + t1[2, 2])
 power1[i] <- t1[2, 2]/(t1[1, 2] + t1[2, 2])
 FDP2[i] <- t2[2, 1]/(t2[2, 1] + t2[2, 2])
 power2[i] <- t2[2, 2]/(t2[1, 2] + t2[2, 2])
}
boxplot(pi1_est)
title("pi1_est")
abline(h=0.2,col="red")

boxplot(pi2_est)
title("pi2_est")
abline(h=0.15,col="red")

boxplot(alpha1_est)
title("alpha1_est")
abline(h=0.2,col="red")

boxplot(alpha2_est)
title("alpha2_est")
abline(h=0.2,col="red")

FDP_SUM <- cbind(FDP1,FDP2)
boxplot(FDP_SUM)
title("FDP")
abline(h=0.1,col="red")

power_SUM <- cbind(Z1_power,Z2_power,power1,power2)
boxplot(power_SUM)
title("power")

# Keep pi11 and pi01 unchanged, change pi00 and pi10
pi00 <- c(0.5,0.55,0.6)
pi10 <- rep(c(0.15,0.1,0.05),each=5 )
rep=5
pi00_est <- pi01_est <- pi10_est <- pi11_est <- alpha1_est <- alpha2_est <- Z11_FDP <-Z1_FDP <- Z2_FDP <- Z11_power <- Z1_power <- Z2_power <- data.frame(pi00 = rep(pi00,each=rep),value=numeric(length(pi00)*rep))
for (i in 1:(length(pi00)*rep)){
  set.seed(i)
  data <- data_generate1(100000,pi00_est$pi00[i],0.2,pi10[i],0.15,0.2,0.2)
  theta_est <- EM1(data$P1,data$P2,pi00_ini=0.5,pi01_ini=0.1,pi10_ini=0.1,pi11_ini=0.1,alpha1_ini=0.1,alpha2_ini=0.1)
  pi00_est$value[i] <- theta_est$pi00
  pi01_est$value[i] <- theta_est$pi01
  pi10_est$value[i] <- theta_est$pi10
  pi11_est$value[i] <- theta_est$pi11
  alpha1_est$value[i] <- theta_est$alpha1  
  alpha2_est$value[i] <- theta_est$alpha2
   gamma_post <- theta_est$pi00+theta_est$pi01*theta_est$alpha2*data$P2^(theta_est$alpha2-1)+theta_est$alpha1*data$P1^(theta_est$alpha1-1)*theta_est$pi10+theta_est$pi11*theta_est$alpha1*theta_est$alpha2*data$P1^(theta_est$alpha1-1)*data$P2^(theta_est$alpha2-1)
  z11_post <- theta_est$pi11*theta_est$alpha1*theta_est$alpha2*data$P1^(theta_est$alpha1-1)*data$P2^(theta_est$alpha2-1)/gamma_post
  z01_post <- theta_est$pi01*theta_est$alpha2*data$P2^(theta_est$alpha2-1)/gamma_post
  z10_post <- theta_est$alpha1*data$P1^(theta_est$alpha1-1)*theta_est$pi10/gamma_post
  z00_post <- theta_est$pi00/gamma_post
  z1_post <- z11_post+z10_post
  z2_post <- z11_post+z01_post
  Z_est_11 <- assoc(z11_post, 0.1)
  t11<-table(Z_est_11, data$Z11)
  Z11_FDP$value[i] <- t11[2, 1]/(t11[2, 1] + t11[2, 2])
  Z11_power$value[i] <-t11[2,2]/(t11[1,2]+t11[2,2])
  Z_est_1 <- assoc(z1_post, 0.1)
  t1<-table(Z_est_1, data$Z11+data$Z10)
  Z1_FDP$value[i] <- t1[2, 1]/(t1[2, 1] + t1[2, 2])
  Z1_power$value[i] <-t1[2,2]/(t1[1,2]+t1[2,2])
  Z_est_2 <- assoc(z2_post, 0.1)
  t2<-table(Z_est_2, data$Z11+data$Z01)
  Z2_FDP$value[i] <- t2[2, 1]/(t2[2, 1] + t2[2, 2])
  Z2_power$value[i] <-t2[2,2]/(t2[1,2]+t2[2,2])
}
library(ggplot2)
pi00_est$pi00 <- as.factor(pi00_est$pi00)
ggplot(pi00_est,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+geom_hline(yintercept = pi00,linetype="dashed",color="red")+ggtitle("pi00_est")

library(ggplot2)
pi01_est$pi00 <- as.factor(pi01_est$pi00)
ggplot(pi01_est,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+geom_hline(yintercept = 0.2,linetype="dashed",color="red")+ggtitle("pi01_est")

library(ggplot2)
pi10_est$pi00 <- as.factor(pi10_est$pi00)
ggplot(pi10_est,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+geom_hline(yintercept = c(0.15,0.1,0.05),linetype="dashed",color="red")+ggtitle("pi10_est")

library(ggplot2)
pi11_est$pi00 <- as.factor(pi11_est$pi00)
ggplot(pi11_est,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+geom_hline(yintercept = 0.15,linetype="dashed",color="red")+ggtitle("pi11_est")

library(ggplot2)
alpha1_est$pi00 <- as.factor(alpha1_est$pi00)
ggplot(alpha1_est,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+geom_hline(yintercept = 0.2,linetype="dashed",color="red")+ggtitle("alpha1_est")

library(ggplot2)
alpha2_est$pi00 <- as.factor(alpha2_est$pi00)
ggplot(alpha2_est,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+geom_hline(yintercept = 0.2,linetype="dashed",color="red")+ggtitle("alpha2_est")

library(ggplot2)
Z1_power$pi00 <- as.factor(Z1_power$pi00)
ggplot(Z1_power,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+ggtitle("Z1_power")

library(ggplot2)
Z2_power$pi00 <- as.factor(Z2_power$pi00)
ggplot(Z2_power,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+ggtitle("Z2_power")

library(ggplot2)
Z11_power$pi00 <- as.factor(Z11_power$pi00)
ggplot(Z11_power,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+ggtitle("Z11_power")

real data

BIP <- read.table("E:\\qjy\\ecnu\\大三第二学期\\生物医疗\\生物医疗小组作业2\\tw2_data\\pgc.bip.full.2012-04.txt", header = T)
SCZ <- read.table("E:\\qjy\\ecnu\\大三第二学期\\生物医疗\\生物医疗小组作业2\\tw2_data\\pgc.scz.full.2012-04.txt", header = T)
library(dplyr)
## 
## 载入程序包:'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#Take the intersection
snp_result=intersect(BIP$snpid,SCZ$snpid)
pvalue_bip <- BIP[BIP$snpid %in% snp_result,]$pval
pvalue_scz <- SCZ[SCZ$snpid %in% snp_result,]$pval
BIP_jiaoji <- BIP[BIP$snpid %in% snp_result,]
SCZ_jiaoji <- SCZ[SCZ$snpid %in% snp_result,]
BIP_SCZ <- inner_join(BIP[,c('snpid','pval')],SCZ[,c('snpid','pval')],by="snpid")
# The parameter estimates are obtained using the EM algorithm
theta_est <- EM1(pvalue_bip, pvalue_scz,pi00_ini = 0.8, pi01_ini=0.01,pi10_ini=0.01,pi11_ini=0.1,alpha1_ini=0.2,alpha2_ini=0.2)
theta_est 
## $pi00
## [1] 0.8173783
## 
## $pi01
## [1] 0.006304867
## 
## $pi10
## [1] 0.001133071
## 
## $pi11
## [1] 0.1751837
## 
## $alpha1
## [1] 0.5985608
## 
## $alpha2
## [1] 0.5039367
gamma_post <- theta_est$pi00+theta_est$pi01*theta_est$alpha2*pvalue_scz^(theta_est$alpha2-1)+theta_est$alpha1*pvalue_bip^(theta_est$alpha1-1)*theta_est$pi10+theta_est$pi11*theta_est$alpha1*theta_est$alpha2*pvalue_bip^(theta_est$alpha1-1)*pvalue_scz^(theta_est$alpha2-1)
z11_post <- theta_est$pi11*theta_est$alpha1*theta_est$alpha2*pvalue_bip^(theta_est$alpha1-1)*pvalue_scz^(theta_est$alpha2-1)/gamma_post
z01_post <- theta_est$pi01*theta_est$alpha2*pvalue_scz^(theta_est$alpha2-1)/gamma_post
z10_post <- theta_est$alpha1*pvalue_bip^(theta_est$alpha1-1)*theta_est$pi10/gamma_post
z00_post <- theta_est$pi00/gamma_post
z1_post <- z11_post+z10_post
z2_post <- z11_post+z01_post
# SNPS associated with both diseases
Z_est_11 <- assoc(z11_post, 0.1)
sum(Z_est_11)
## [1] 6956
# SNPS associated with BIP
Z_est_10 <- assoc(z1_post, 0.1)
sum(Z_est_10)
## [1] 7018
# SNPS associated with SCZ
Z_est_01 <- assoc(z2_post, 0.1)
sum(Z_est_01)
## [1] 8210

The coincidence of the ids of the three classes of SNPS was judged

snpid_bip <- BIP[BIP$snpid %in% snp_result,]$snpid
snp_bip = data.frame(snpid_bip = snpid_bip,bip_11 = Z_est_11,bip_10 = Z_est_10)
snpid_bip_11 <- snp_bip[which(snp_bip$bip_11 == 1),'snpid_bip']
snpid_bip_10 <- snp_bip[which(snp_bip$bip_10 == 1),'snpid_bip']
length(intersect(snpid_bip_11,snpid_bip_10))
## [1] 6956
# The high degree of overlap between those associated with both diseases and those associated with BIP
setdiff(snpid_bip_10,snpid_bip_11)
##  [1] "rs4908147"  "rs387176"   "rs263908"   "rs10921107" "rs2270543" 
##  [6] "rs17017120" "rs3845684"  "rs906867"   "rs13386455" "rs1526645" 
## [11] "rs7583278"  "rs4430889"  "rs1435845"  "rs1161474"  "rs6729882" 
## [16] "rs7641289"  "rs17718783" "rs9288851"  "rs16825602" "rs810471"  
## [21] "rs6829771"  "rs2646269"  "rs370831"   "rs391529"   "rs13128519"
## [26] "rs7733288"  "rs256014"   "rs9393672"  "rs9275523"  "rs7767277" 
## [31] "rs2842625"  "rs2138707"  "rs1934124"  "rs4357169"  "rs9387090" 
## [36] "rs13227417" "rs11761050" "rs6958911"  "rs2196"     "rs12342040"
## [41] "rs10760846" "rs7895653"  "rs3858136"  "rs10791838" "rs11174379"
## [46] "rs7300029"  "rs11174386" "rs11109376" "rs1262775"  "rs17107699"
## [51] "rs10518779" "rs16957422" "rs7190307"  "rs12943984" "rs7209032" 
## [56] "rs156430"   "rs4277599"  "rs2378249"  "rs6030341"  "rs11906231"
## [61] "rs2836779"  "rs926761"
# The only SNP associated with BIP but not with SCZ
snpid_scz <- SCZ[SCZ$snpid %in% snp_result,]$snpid
snp_scz = data.frame(snpid_scz = snpid_scz,scz_11 = Z_est_11,scz_10 = Z_est_01)
snpid_scz_11 <- snp_scz[which(snp_scz$scz_11 == 1),'snpid_scz']
snpid_scz_01 <- snp_scz[which(snp_scz$scz_10 == 1),'snpid_scz']
length(intersect(snpid_scz_11,snpid_scz_01))
## [1] 6956

Only the bipolar GWAS data was used for analysis

## Define the function that calculates the log-likelihood
# input: p, pi1, alpha
log_P_single <- function(P, pi1, alpha){
  return(sum(log(pi1*dbeta(P, alpha, 1) + (1 - pi1))))# 最终要优化的函数
}
## Define functions that implement the EM algorithm for the analysis of a single dataset
# input: P, pi1_ini, alpha_ini, max_iter, tol
# ouput: pi1_est, alpha_est
EM_single <- function(P, pi1_ini, alpha_ini, max_iter = 1e4, tol = 1e-6){
 L_ini <- log_P_single(P, pi1_ini, alpha_ini)
 for (iter in 1:max_iter){
   if (iter == 1){
     pi1_old <- pi1_ini
     alpha_old <- alpha_ini
     L_old <- L_ini
   }
 
   # E step
   comp_gamma <- pi1_old*dbeta(P, alpha_old, 1)
   gamma <- comp_gamma/(comp_gamma + 1 - pi1_old)
   
   # M step
   pi1_new <- mean(gamma)
   alpha_new <- -sum(gamma)/sum(gamma*log(P))
   L_new <- log_P_single(P, pi1_new, alpha_new)
   # whether the algorithm converges
   if (L_new < L_old){
     print("Error: log likelihoood is not increasing!")
     break
   }
   if ((L_new - L_old)/abs(L_new) < tol){
     break
   } else {
     pi1_old <- pi1_new
     alpha_old <- alpha_new
     L_old <- L_new
   }
 }
 
 return(list(pi1 = pi1_new, alpha = alpha_new))
}
theta_est_BIP <- EM_single(pvalue_bip, pi1_ini = 0.1, alpha_ini = 0.2)
theta_est_BIP
## $pi1
## [1] 0.2194204
## 
## $alpha
## [1] 0.6628928
posterior_bip <- (theta_est_BIP$pi1*theta_est_BIP$alpha*pvalue_bip^(theta_est_BIP$alpha - 1))/(theta_est_BIP$pi1*theta_est_BIP$alpha*pvalue_bip^(theta_est_BIP$alpha - 1) + 1 - theta_est_BIP$pi1)
Z_est_bip <- assoc(posterior_bip, 0.1)
sum(Z_est_bip)
## [1] 403
snp_bip_single = data.frame(snpid_bip = snpid_bip,bip_1 = Z_est_bip)
snpid_bip_1 <- snp_bip_single[which(snp_bip_single$bip_1 == 1),'snpid_bip']
length(intersect(snpid_bip_1,snpid_bip_10))
## [1] 387

Only the schizophrenia GWAS data was used for analysis

theta_est_SCZ <- EM_single(pvalue_scz, pi1_ini = 0.1, alpha_ini = 0.2)
theta_est_SCZ
## $pi1
## [1] 0.2419776
## 
## $alpha
## [1] 0.5668053
posterior_scz <- (theta_est_SCZ$pi1*theta_est_SCZ$alpha*pvalue_scz^(theta_est_SCZ$alpha - 1))/(theta_est_SCZ$pi1*theta_est_SCZ$alpha*pvalue_scz^(theta_est_SCZ$alpha - 1) + 1 - theta_est_SCZ$pi1)
Z_est_scz <- assoc(posterior_scz, 0.1)
sum(Z_est_scz)
## [1] 4012
snp_scz_single = data.frame(snpid_scz = snpid_scz,scz_1 = Z_est_scz)
snpid_scz_1 <- snp_scz_single[which(snp_scz_single$scz_1 == 1),'snpid_scz']
length(intersect(snpid_scz_1,snpid_scz_01))
## [1] 3692

Plot the Manhattan

FDR <- function(posterior, alpha){
  M          <- length(posterior)
  fdr        <- 1 - posterior
  rank.fdr   <- rank(fdr)
  sort.fdr   <- sort(fdr)
  cumsum.fdr <- cumsum(sort.fdr)
  sort.FDR   <- cumsum.fdr/seq(1, M, 1)
  FDR        <- sort.FDR[rank.fdr]

  return(FDR)
}

BIP

library(qqman)
## 
## For example usage please run: vignette('qqman')
## 
## Citation appreciated but not required:
## Turner, (2018). qqman: an R package for visualizing GWAS results using Q-Q and manhattan plots. Journal of Open Source Software, 3(25), 731, https://doi.org/10.21105/joss.00731.
## 
SNP_BIP_snp <- as.vector(BIP$snpid[which(Z_est_bip == 1)])
BIP_jiaoji$fdr <- -log10(FDR(posterior_bip, 0.1))
manhattan(BIP_jiaoji, chr = "hg18chr", bp = "bp", p = "fdr", snp = "snpid", ylim = c(0, 3), col = c("blue4", "orange3"), cex = 0.6,logp = F,ylab = "-log_10(fdr)",
          main = "single BIP",
          suggestiveline = T, 
          genomewideline = F)

SCZ

SNP_SCZ_snp <- as.vector(BIP$snpid[which(Z_est_scz == 1)])
# SCZ_jiaoji$fdr <- -log10(assoc(posterior_scz, 0.1)$fdr)
SCZ_jiaoji$fdr <- -log10(FDR(posterior_scz, 0.1))
manhattan(SCZ_jiaoji, chr = "hg18chr", bp = "bp", p = "fdr", snp = "snpid", ylim = c(0, 3), col = c("blue4", "orange3"), cex = 0.6,logp = F,ylab = "-log_10(fdr)",
          main = "single SCZ",
          suggestiveline = T, 
          genomewideline = F)

Integrated analysis: BIP

SNP_BIP_with_SCZ <- as.vector(BIP_jiaoji$snpid[which(Z_est_10 == 1)])
# BIP_jiaoji$fdr1 <- -log10(assoc(z1_post, 0.1)$fdr)
BIP_jiaoji$fdr1 <- -log10(FDR(z1_post, 0.1))
manhattan(BIP_jiaoji, chr = "hg18chr", bp = "bp", p = "fdr1", snp = "snpid", ylim = c(0, 4), col = c("blue4", "orange3"), cex = 0.6,logp = F,ylab = "-log_10(fdr)",
          main = "BIP with SCZ",
          suggestiveline = T, 
          genomewideline = F)

Integrated analysis: SCZ

SNP_SCZ_with_BIP <- as.vector(SCZ_jiaoji$snpid[which(Z_est_01 == 1)])
# SCZ_jiaoji$fdr1 <- -log10(assoc(z2_post, 0.1)$fdr)
SCZ_jiaoji$fdr1 <- -log10(FDR(z2_post, 0.1))
manhattan(SCZ_jiaoji, chr = "hg18chr", bp = "bp", p = "fdr1", snp = "snpid", ylim = c(0, 4), col = c("blue4", "orange3"), cex = 0.6,logp = F,ylab = "-log_10(fdr)",
          main = "SCZ with BIP",
          suggestiveline = T, 
          genomewideline = F)

Extension: GPA http://dongjunchung.github.io/GPA/

# install.packages("devtools")
# library(devtools)
# install_github("dongjunchung/GPA")
library(GPA)
## 载入需要的程序包:Rcpp
## 载入需要的程序包:parallel
## 载入需要的程序包:ggrepel
## 载入需要的程序包:plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## 载入程序包:'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 载入需要的程序包:vegan
## 载入需要的程序包:permute
## 载入需要的程序包:lattice
## 
## 载入程序包:'lattice'
## The following object is masked from 'package:qqman':
## 
##     qq
## This is vegan 2.6-8
## 载入需要的程序包:DT
## 载入需要的程序包:shiny
## 
## 载入程序包:'shiny'
## The following objects are masked from 'package:DT':
## 
##     dataTableOutput, renderDataTable
## 载入需要的程序包:shinyBS
## 
## 载入程序包:'GPA'
## The following object is masked _by_ '.GlobalEnv':
## 
##     assoc
## The following object is masked from 'package:stats':
## 
##     cov
fit.GPA.noAnn <- GPA( BIP_SCZ[ , c(2,3) ], NULL )
## Info: Number of GWAS data: 2
## Info: Theoretically null distribution is used.
## Info: No annotation data is provided.
## Info: Lower bound for pi estimates is set to 1 / [number of SNPs].
## Info: EM algorithm stops because it reaches the specified maximum number of iterations (maxIter = 2000).
fit.GPA.noAnn
## Summary: GPA model fitting results (class: GPA)
## --------------------------------------------------
## Data summary:
##  Number of GWAS data: 2
##  Number of SNPs: 1064236
##  Number of annotation data: (not provided)
## Model setting:
##  Theoretical null distribution is assumed.
## Parameter estimates (standard errors):
##  alpha: 0.603 0.503
##       ( 0.005 0.003 )
##  GWAS combination: 00 10 01 11
##  pi: 0.82 0 0 0.18
##    ( 0.003 0.003 0.004 0.005 )
## --------------------------------------------------
estimates( fit.GPA.noAnn )
## $pis
##           00           10           01           11 
## 8.197874e-01 9.396412e-07 6.278356e-06 1.802054e-01 
## 
## $betaAlpha
## [1] 0.6028026 0.5027528
assoc.GPA.noAnn <- GPA::assoc( fit.GPA.noAnn, FDR=0.10, fdrControl="global" )
## Info: Association mapping based on the global FDR control at level 0.1.
table(assoc.GPA.noAnn[,1])#BIP
## 
##       0       1 
## 1055956    8280
table(assoc.GPA.noAnn[,2])#SCZ
## 
##       0       1 
## 1055954    8282
#table(assoc.GPA.noAnn)
# association with SCZ and BIP
assoc11.GPA.noAnn <- GPA::assoc( fit.GPA.noAnn, FDR=0.10, fdrControl="global", pattern="11" )
## Info: Association mapping based on the global FDR control at level 0.1.
table(assoc11.GPA.noAnn)
## assoc11.GPA.noAnn
##       0       1 
## 1055956    8280
# Test for pleiotropy
 fit.GPA.pleiotropy.H0 <- GPA( BIP_SCZ[ , c(2,3) ], NULL, pleiotropyH0=TRUE )
## Info: Number of GWAS data: 2
## Info: Theoretically null distribution is used.
## Info: No annotation data is provided.
## Info: Fit the GPA model under H0 of the pleitropy test.
## Info: Lower bound for pi estimates is set to 1 / [number of SNPs].
## Info: EM algorithm stops in 1612-th iteration because there is no improvements in log likelihood.
fit.GPA.pleiotropy.H0
## Summary: GPA model fitting results (class: GPA)
## --------------------------------------------------
## Data summary:
##  Number of GWAS data: 2
##  Number of SNPs: 1064236
##  Number of annotation data: (not provided)
## Model setting:
##  Theoretical null distribution is assumed.
##  GPA is fitted under H0 of pleiotropy LRT.
## Parameter estimates (standard errors):
##  alpha: 0.669 0.571
##       ( 0.006 0.003 )
##  GWAS combination: 00 10 01 11
##  pi: 0.582 0.171 0.191 0.056
##    ( 0.005 0.005 0.004 0.004 )
## --------------------------------------------------
test.GPA.pleiotropy <- pTest( fit.GPA.noAnn, fit.GPA.pleiotropy.H0 )
## Hypothesis testing for pleiotropy
## --------------------------------------------------
## GWAS combination:  00 10 01 11 
## pi:  0.82 0 0 0.18 
##   (  0.003 0.003 0.004 0.005  )
## 
## test statistics:  8587.519 
## p-value:  0 
##   --------------------------------------------------
test.GPA.pleiotropy
## $pi
##           00           10           01           11 
## 8.197874e-01 9.396412e-07 6.278356e-06 1.802054e-01 
## 
## $piSE
##                   pi_10       pi_01       pi_11 
## 0.002961348 0.003494226 0.004344911 0.004672846 
## 
## $statistics
## iteration_1612 
##       8587.519 
## 
## $pvalue
## iteration_1612 
##              0