## Define the data generation function
# input: M, pi00,pi01,pi10,pi11, alpha1,alpha2
# output: P1,P2, Z00,Z01,Z10,Z11
data_generate1 <- function(M,pi00,pi01,pi10,pi11,alpha1,alpha2){
  P1 <- numeric(M)
  P2 <- numeric(M)
  Z00 <- numeric(M)
  Z01 <- numeric(M)
  Z10 <- numeric(M)
  Z11 <- numeric(M)
  C <- c(1:M)
  index00 <- sample(C,M*pi00)
  C1 <- C[-index00]
  index01 <- sample(C1,M*pi01)
  real_index01 <-c()
  k=1
  for (i in index01){
    real_index01[k]=which(C1==i)
    k=k+1
  }
  C2 <- C1[-real_index01]
  index10 <- sample(C2,M*pi10)
  real_index10 <-c()
  k=1
  for (i in index10){
    real_index10[k]=which(C2==i)
    k=k+1
  }
  C3 <- C2[-real_index10]
  index11 <- sample(C3,M*pi11)
  Z00[index00] <- 1
  Z00[-index00] <- 0
  Z01[index01] <- 1
  Z01[-index01] <- 0
  Z10[index10] <- 1
  Z10[-index10] <- 0
  Z11[index11] <- 1
  Z11[-index11] <- 0
  P1[index00] <- runif(M*pi00)
  P1[index01] <- runif(M*pi01)
  P1[index10] <- rbeta(M*pi10,alpha1,1)
  P1[index11] <- rbeta(M*pi11,alpha1,1)
  P2[index00] <- runif(M*pi00)
  P2[index01] <- rbeta(M*pi01,alpha2,1)
  P2[index10] <- runif(M*pi10)
  P2[index11] <- rbeta(M*pi11,alpha2,1)
  return(list(P1=P1,P2=P2,Z00=Z00,Z01=Z01,Z10=Z10,Z11=Z11))
}

## Define the function that computes the log-likelihood
# input: P1,P2,pi00,pi01,pi10,pi11,alpha1,alpha2
log_p1 <- function(P1,P2,pi00,pi01,pi10,pi11,alpha1,alpha2){
  return(sum(log(pi00+pi01*alpha2*P2^(alpha2-1)+pi10*alpha1*P1^(alpha1-1)+pi11*alpha1*alpha2*P2^(alpha2-1)*P1^(alpha1-1))))
}

## iteration
# max_iter=10000
# Set the maximum number of iterations to max_iter=10000
## In each iteration, E steps and M steps are carried out to calculate the log-likelihood and judge whether the algorithm converges
EM1 <- function(P1,P2,pi00_ini, pi01_ini,pi10_ini,pi11_ini,alpha1_ini,alpha2_ini, max_iter=1000){
  L_ini <- log_p1(P1,P2,pi00_ini, pi01_ini,pi10_ini,pi11_ini,alpha1_ini,alpha2_ini)
  for (iter in 1:max_iter){
  if(iter==1){
    pi00_old <- pi00_ini
    pi01_old <- pi01_ini
    pi10_old <- pi10_ini
    pi11_old <- pi11_ini
    alpha1_old <- alpha1_ini
    alpha2_old <- alpha2_ini
    L_old <- L_ini
  }
  ## E step
  gamma <- pi00_old+pi01_old*alpha2_old*P2^(alpha2_old-1)+alpha1_old*P1^(alpha1_old-1)*pi10_old+pi11_old*alpha1_old*alpha2_old*P1^(alpha1_old-1)*P2^(alpha2_old-1)
  gamma_z00 <- pi00_old/gamma
  gamma_z01 <- pi01_old*alpha2_old*P2^(alpha2_old-1)/gamma
  gamma_z10 <- pi10_old*alpha1_old*P1^(alpha1_old-1)/gamma
  gamma_z11 <- pi11_old*alpha1_old*alpha2_old*P1^(alpha1_old-1)*P2^(alpha2_old-1)/gamma
  ## M step
  pi00_new <- mean(gamma_z00)
  pi01_new <- mean(gamma_z01)
  pi10_new <- mean(gamma_z10)
  pi11_new <- mean(gamma_z11)
  alpha1_new <- -sum(gamma_z11+gamma_z10)/sum(gamma_z11*log(P1)+gamma_z10*log(P1))
  alpha2_new <- -sum(gamma_z11+gamma_z01)/sum(gamma_z11*log(P2)+gamma_z01*log(P2))
  ## compute the log-likelihood
  L_new <- log_p1(P1,P2,pi00_new,pi01_new,pi10_new,pi11_new,alpha1_new,alpha2_new)
  ## whether the algorithm accepts or not
  if(L_new<L_old){
    print("Error: log likelihood is not increasing!")
    break
  }
  if((L_new-L_old)/abs(L_new)<1e-5){
    pi00_est <- pi00_new
    pi01_est <- pi01_new
    pi10_est <- pi10_new
    pi11_est <- pi11_new
    alpha1_est <- alpha1_new
    alpha2_est <- alpha2_new
    break
  }else{
    pi00_old <- pi00_new
    pi01_old <- pi01_new
    pi10_old <- pi10_new
    pi11_old <- pi11_new
    alpha1_old <- alpha1_new
    alpha2_old <- alpha2_new
    L_old <- L_new
  }
  }
  return(list(pi00 = pi00_new, pi01 = pi01_new, pi10 = pi10_new, pi11 = pi11_new ,alpha1 = alpha1_new, alpha2 = alpha2_new))
}

# set random seed
set.seed(1)
# generate simulation data
M <- 100000
data <- data_generate1(M,0.7,0.1,0.15,0.05,0.2,0.2)
# Parameter estimates were obtained using the EM algorithm to check for agreement with the previous results
theta_est <- EM1(data$P1,data$P2,pi00_ini=0.6,pi01_ini=0.05,pi10_ini=0.15,pi11_ini=0.01,alpha1_ini=0.2,alpha2_ini=0.1)
print(theta_est)

## $pi00
## [1] 0.6986258
## 
## $pi01
## [1] 0.0995448
## 
## $pi10
## [1] 0.1532531
## 
## $pi11
## [1] 0.0485763
## 
## $alpha1
## [1] 0.2014528
## 
## $alpha2
## [1] 0.1994058

# The estimated value is close to the true value.

# Calculate the posterior probability
gamma_post <- theta_est$pi00+theta_est$pi01*theta_est$alpha2*data$P2^(theta_est$alpha2-1)+theta_est$alpha1*data$P1^(theta_est$alpha1-1)*theta_est$pi10+theta_est$pi11*theta_est$alpha1*theta_est$alpha2*data$P1^(theta_est$alpha1-1)*data$P2^(theta_est$alpha2-1)
z11_post <- theta_est$pi11*theta_est$alpha1*theta_est$alpha2*data$P1^(theta_est$alpha1-1)*data$P2^(theta_est$alpha2-1)/gamma_post
z01_post <- theta_est$pi01*theta_est$alpha2*data$P2^(theta_est$alpha2-1)/gamma_post
z10_post <- theta_est$alpha1*data$P1^(theta_est$alpha1-1)*theta_est$pi10/gamma_post
z00_post <- theta_est$pi00/gamma_post
z1_post <- z11_post+z10_post
z2_post <- z11_post+z01_post

# Given a level alpha that controls the FDR, use the assoc function to identify SNPS associated with the disease based on the posterior probability of the latent variable.
# The inputs are posterior and alpha.
# posterior is an m-dimensional vector, where each element represents the posterior probability of the latent variable z_i=1 corresponding to the ith SNP
# alpha is the level that controls FDR
# The output of the  function is Z_est, an m-dimensional vector, where each element represents whether the ith SNP is identified as related to the current disease, with 1 representing related and 0 representing unrelated
assoc <- function(posterior, alpha){
  M          <- length(posterior)
  fdr        <- 1 - posterior
  rank.fdr   <- rank(fdr)
  sort.fdr   <- sort(fdr)
  cumsum.fdr <- cumsum(sort.fdr)
  sort.FDR   <- cumsum.fdr/seq(1, M, 1)
  FDR        <- sort.FDR[rank.fdr]
  
  Z_est <- rep(0, M)
  Z_est[which(FDR <= alpha)] <- 1
  
  return(Z_est)
}

# Use assoc function to identify SNPs associated with disease on simulated data (controlling FDR at level 0.1) i.e., input the posterior probability of z=1 calculated by parameter-based EM estimation and use assoc function to obtain Z_est

Z_est_11 <- assoc(z11_post, 0.1)
sum(Z_est_11)

## [1] 817

## Calculate FDP and power
# Compare the real Z with the Z_est obtained in the previous step, by using table(Z_est, Z)
# V is the number of (Z_est=1, Z=0) in the table, S is the number of (Z_est=1, Z=1), R is the number of (Z_est=1), M0 is the number of (Z=0), and M-M0 is the number of (Z=1)
# Calculate FDP using V/R and power using S/(M-M0)
# FDP should be around 0.1, otherwise the code is broken

t<-table(Z_est_11, data$Z11)
t

##         
## Z_est_11     0     1
##        0 94932  4251
##        1    68   749

FDP <- t[2, 1]/(t[2, 1] + t[2, 2])
FDP

## [1] 0.08323133

power<-t[2,2]/(t[1,2]+t[2,2])
power

## [1] 0.1498

#Related to disease 1
Z_est_1 <- assoc(z1_post, 0.1)
sum(Z_est_1)

## [1] 9159

t<-table(Z_est_1, data$Z11+data$Z10)
t

##        
## Z_est_1     0     1
##       0 79057 11784
##       1   943  8216

FDP <- t[2, 1]/(t[2, 1] + t[2, 2])
FDP

## [1] 0.1029588

power<-t[2,2]/(t[1,2]+t[2,2])
power

## [1] 0.4108

#Related to disease 2
Z_est_2 <- assoc(z2_post, 0.1)
sum(Z_est_2)

## [1] 6256

t<-table(Z_est_2, data$Z11+data$Z01)
t

##        
## Z_est_2     0     1
##       0 84395  9349
##       1   605  5651

FDP <- t[2, 1]/(t[2, 1] + t[2, 2])
FDP

## [1] 0.09670716

power<-t[2,2]/(t[1,2]+t[2,2])
power

## [1] 0.3767333

# Repeat 20 times and record pi00_est,pi01_est,pi10_est,pi11_est,alpha1_est, alpha2_est, FDP, power and plot boxplot
rep <- 20
pi00_est <- pi01_est <-pi10_est <- pi11_est <- alpha1_est <- alpha2_est <- Z11_FDP <- Z1_FDP <- Z2_FDP <- Z11_power <- Z1_power <- Z2_power <- numeric(rep)
for (i in 1:rep) {
  set.seed(i)
  data <- data_generate1(M,0.7,0.1,0.15,0.05,0.2,0.2)
  theta_est <- EM1(data$P1,data$P2,pi00_ini=0.6,pi01_ini=0.05,pi10_ini=0.15,pi11_ini=0.01,alpha1_ini=0.2,alpha2_ini=0.1)
  pi00_est[i] <- theta_est$pi00
  pi01_est[i] <- theta_est$pi01
  pi10_est[i] <- theta_est$pi10
  pi11_est[i] <- theta_est$pi11
  alpha1_est[i] <- theta_est$alpha1
  alpha2_est[i] <- theta_est$alpha2
  gamma_post <- theta_est$pi00+theta_est$pi01*theta_est$alpha2*data$P2^(theta_est$alpha2-1)+theta_est$alpha1*data$P1^(theta_est$alpha1-1)*theta_est$pi10+theta_est$pi11*theta_est$alpha1*theta_est$alpha2*data$P1^(theta_est$alpha1-1)*data$P2^(theta_est$alpha2-1)
  z11_post <- theta_est$pi11*theta_est$alpha1*theta_est$alpha2*data$P1^(theta_est$alpha1-1)*data$P2^(theta_est$alpha2-1)/gamma_post
  z01_post <- theta_est$pi01*theta_est$alpha2*data$P2^(theta_est$alpha2-1)/gamma_post
  z10_post <- theta_est$alpha1*data$P1^(theta_est$alpha1-1)*theta_est$pi10/gamma_post
  z00_post <- theta_est$pi00/gamma_post
  z1_post <- z11_post+z10_post
  z2_post <- z11_post+z01_post
  Z_est_11 <- assoc(z11_post, 0.1)
  t11<-table(Z_est_11, data$Z11)
  Z11_FDP[i] <- t11[2, 1]/(t11[2, 1] + t11[2, 2])
  Z11_power[i] <-t11[2,2]/(t11[1,2]+t11[2,2])
  Z_est_1 <- assoc(z1_post, 0.1)
  t1<-table(Z_est_1, data$Z11+data$Z10)
  Z1_FDP[i] <- t1[2, 1]/(t1[2, 1] + t1[2, 2])
  Z1_power[i] <-t1[2,2]/(t1[1,2]+t1[2,2])
  Z_est_2 <- assoc(z2_post, 0.1)
  t2<-table(Z_est_2, data$Z11+data$Z01)
  Z2_FDP[i] <- t2[2, 1]/(t2[2, 1] + t2[2, 2])
  Z2_power[i] <-t2[2,2]/(t2[1,2]+t2[2,2])
}

boxplot(pi00_est,ylim=c(0.693,0.706))
title("pi00_est")
abline(h=0.7,col="red")

boxplot(pi01_est)
title("pi01_est")
abline(h=0.1,col="red")

boxplot(pi10_est)
title("pi10_est")
abline(h=0.15,col="red")

boxplot(pi11_est)
title("pi11_est")
abline(h=0.05,col="red")

boxplot(alpha1_est)
title("alpha1_est")
abline(h=0.2,col="red")

boxplot(alpha2_est)
title("alpha2_est")
abline(h=0.2,col="red")

boxplot(Z11_FDP)
title("Z11_FDP")
abline(h=0.1,col="red")

boxplot(Z1_FDP)
title("Z1_FDP")
abline(h=0.1,col="red")

boxplot(Z2_FDP)
title("Z2_FDP")
abline(h=0.1,col="red")

boxplot(Z11_power)
title("Z11_power")

boxplot(Z1_power)
title("Z1_power")

boxplot(Z2_power)
title("Z2_power")

## Define the function that calculates the log-likelihood
# input: p and pi1, alpha

log_X <- function(P, pi1, alpha){
  return(sum(log(pi1*alpha*(P^(alpha-1))+1-pi1)))
}

## Define the function that implements the EM algorithm
# The EM function takes P, pi1_ini, alpha_ini, max_iter, tol and outputs pi1_est, alpha_est
EM <- function(P, pi1_ini, alpha_ini, max_iter=10000, tol=1e-6){

  L_ini <- log_X(P, pi1_ini, alpha_ini) # 初始对数似然

## Iteration
# Set maximum number of iterations max_iter=10000
## Perform E and M steps in each iteration, compute the log-likelihood, and see if the algorithm converges
  max_iter <- 10000 # Set the maximum number of iterations to prevent non-convergence
  for (iter in 1:max_iter){
    if (iter == 1){
      pi1_old <- pi1_ini
      alpha_old <- alpha_ini
      L_old <- L_ini
    }
## E step
    comp_gamma <- pi1_old*dbeta(P, alpha_old, 1)
    gamma <- comp_gamma/(comp_gamma + (1 - pi1_old)*dunif(P, 0, 1))
## M step
    pi1_new <- mean(gamma)
    alpha_new <- -sum(gamma)/sum(gamma*log(P))
## compute log likelihood
    L_new <- log_X(P, pi1_new, alpha_new)
## whether the algorithm converges
    if (L_new < L_old){
      print("Error: log likelihoood is not increasing!")
      break
    }
    if ((L_new - L_old)/abs(L_new) < tol){
      pi1_est <- pi1_new
      alpha_est <- alpha_new
      break
    } 
    else {
      pi1_old <- pi1_new
      alpha_old <- alpha_new
      L_old <- L_new
    }
  }
   return(list(pi1 = pi1_new, alpha = alpha_new))
}

rep <- 20
FDP1 <- numeric(rep)
power1 <- numeric(rep)
FDP2 <- numeric(rep)
power2 <- numeric(rep)
pi1_est <- numeric(rep)
pi2_est <- numeric(rep)
alpha1_est <- numeric(rep)
alpha2_est <- numeric(rep)

rep=20
for (i in 1:rep){
 set.seed(i)
 data <- data_generate1(M,0.7,0.1,0.15,0.05,0.2,0.2)
 est1 <- EM(data$P1, 0.1, 0.1)
 est2 <- EM(data$P2, 0.1, 0.1)
 
 pi1_est[i] <- est1$pi1
 alpha1_est[i] <- est1$alpha
 pi2_est[i] <- est2$pi1
 alpha2_est[i] <- est2$alpha
 
 posterior1 <- (est1$pi1*est1$alpha*data$P1^(est1$alpha - 1))/(est1$pi1*est1$alpha*data$P1^(est1$alpha - 1) + 1 - est1$pi1)
 posterior2 <- (est2$pi1*est2$alpha*data$P2^(est2$alpha - 1))/(est2$pi1*est2$alpha*data$P2^(est2$alpha - 1) + 1 - est2$pi1)
 
 Z_est1 <- assoc(posterior1, 0.1)
 Z_est2 <- assoc(posterior2, 0.1)
 t1 <- table(Z_est1, data$Z11+data$Z10)
 t2 <- table(Z_est2, data$Z11+data$Z01)
 
 FDP1[i] <- t1[2, 1]/(t1[2, 1] + t1[2, 2])
 power1[i] <- t1[2, 2]/(t1[1, 2] + t1[2, 2])
 FDP2[i] <- t2[2, 1]/(t2[2, 1] + t2[2, 2])
 power2[i] <- t2[2, 2]/(t2[1, 2] + t2[2, 2])
}

boxplot(pi1_est)
title("pi1_est")
abline(h=0.2,col="red")

boxplot(pi2_est)
title("pi2_est")
abline(h=0.15,col="red")

boxplot(alpha1_est)
title("alpha1_est")
abline(h=0.2,col="red")

boxplot(alpha2_est)
title("alpha2_est")
abline(h=0.2,col="red")

FDP_SUM <- cbind(FDP1,FDP2)
boxplot(FDP_SUM)
title("FDP")
abline(h=0.1,col="red")

power_SUM <- cbind(Z1_power,Z2_power,power1,power2)
boxplot(power_SUM)
title("power")

# Keep pi11 and pi01 unchanged, change pi00 and pi10
pi00 <- c(0.5,0.55,0.6)
pi10 <- rep(c(0.15,0.1,0.05),each=5 )
rep=5
pi00_est <- pi01_est <- pi10_est <- pi11_est <- alpha1_est <- alpha2_est <- Z11_FDP <-Z1_FDP <- Z2_FDP <- Z11_power <- Z1_power <- Z2_power <- data.frame(pi00 = rep(pi00,each=rep),value=numeric(length(pi00)*rep))
for (i in 1:(length(pi00)*rep)){
  set.seed(i)
  data <- data_generate1(100000,pi00_est$pi00[i],0.2,pi10[i],0.15,0.2,0.2)
  theta_est <- EM1(data$P1,data$P2,pi00_ini=0.5,pi01_ini=0.1,pi10_ini=0.1,pi11_ini=0.1,alpha1_ini=0.1,alpha2_ini=0.1)
  pi00_est$value[i] <- theta_est$pi00
  pi01_est$value[i] <- theta_est$pi01
  pi10_est$value[i] <- theta_est$pi10
  pi11_est$value[i] <- theta_est$pi11
  alpha1_est$value[i] <- theta_est$alpha1  
  alpha2_est$value[i] <- theta_est$alpha2
   gamma_post <- theta_est$pi00+theta_est$pi01*theta_est$alpha2*data$P2^(theta_est$alpha2-1)+theta_est$alpha1*data$P1^(theta_est$alpha1-1)*theta_est$pi10+theta_est$pi11*theta_est$alpha1*theta_est$alpha2*data$P1^(theta_est$alpha1-1)*data$P2^(theta_est$alpha2-1)
  z11_post <- theta_est$pi11*theta_est$alpha1*theta_est$alpha2*data$P1^(theta_est$alpha1-1)*data$P2^(theta_est$alpha2-1)/gamma_post
  z01_post <- theta_est$pi01*theta_est$alpha2*data$P2^(theta_est$alpha2-1)/gamma_post
  z10_post <- theta_est$alpha1*data$P1^(theta_est$alpha1-1)*theta_est$pi10/gamma_post
  z00_post <- theta_est$pi00/gamma_post
  z1_post <- z11_post+z10_post
  z2_post <- z11_post+z01_post
  Z_est_11 <- assoc(z11_post, 0.1)
  t11<-table(Z_est_11, data$Z11)
  Z11_FDP$value[i] <- t11[2, 1]/(t11[2, 1] + t11[2, 2])
  Z11_power$value[i] <-t11[2,2]/(t11[1,2]+t11[2,2])
  Z_est_1 <- assoc(z1_post, 0.1)
  t1<-table(Z_est_1, data$Z11+data$Z10)
  Z1_FDP$value[i] <- t1[2, 1]/(t1[2, 1] + t1[2, 2])
  Z1_power$value[i] <-t1[2,2]/(t1[1,2]+t1[2,2])
  Z_est_2 <- assoc(z2_post, 0.1)
  t2<-table(Z_est_2, data$Z11+data$Z01)
  Z2_FDP$value[i] <- t2[2, 1]/(t2[2, 1] + t2[2, 2])
  Z2_power$value[i] <-t2[2,2]/(t2[1,2]+t2[2,2])
}

library(ggplot2)
pi00_est$pi00 <- as.factor(pi00_est$pi00)
ggplot(pi00_est,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+geom_hline(yintercept = pi00,linetype="dashed",color="red")+ggtitle("pi00_est")

library(ggplot2)
pi01_est$pi00 <- as.factor(pi01_est$pi00)
ggplot(pi01_est,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+geom_hline(yintercept = 0.2,linetype="dashed",color="red")+ggtitle("pi01_est")

library(ggplot2)
pi10_est$pi00 <- as.factor(pi10_est$pi00)
ggplot(pi10_est,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+geom_hline(yintercept = c(0.15,0.1,0.05),linetype="dashed",color="red")+ggtitle("pi10_est")

library(ggplot2)
pi11_est$pi00 <- as.factor(pi11_est$pi00)
ggplot(pi11_est,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+geom_hline(yintercept = 0.15,linetype="dashed",color="red")+ggtitle("pi11_est")

library(ggplot2)
alpha1_est$pi00 <- as.factor(alpha1_est$pi00)
ggplot(alpha1_est,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+geom_hline(yintercept = 0.2,linetype="dashed",color="red")+ggtitle("alpha1_est")

library(ggplot2)
alpha2_est$pi00 <- as.factor(alpha2_est$pi00)
ggplot(alpha2_est,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+geom_hline(yintercept = 0.2,linetype="dashed",color="red")+ggtitle("alpha2_est")

library(ggplot2)
Z1_power$pi00 <- as.factor(Z1_power$pi00)
ggplot(Z1_power,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+ggtitle("Z1_power")

library(ggplot2)
Z2_power$pi00 <- as.factor(Z2_power$pi00)
ggplot(Z2_power,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+ggtitle("Z2_power")

library(ggplot2)
Z11_power$pi00 <- as.factor(Z11_power$pi00)
ggplot(Z11_power,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+ggtitle("Z11_power")

real data

BIP <- read.table("E:\\qjy\\ecnu\\大三第二学期\\生物医疗\\生物医疗小组作业2\\tw2_data\\pgc.bip.full.2012-04.txt", header = T)

SCZ <- read.table("E:\\qjy\\ecnu\\大三第二学期\\生物医疗\\生物医疗小组作业2\\tw2_data\\pgc.scz.full.2012-04.txt", header = T)

library(dplyr)

## 
## 载入程序包：'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#Take the intersection
snp_result=intersect(BIP$snpid,SCZ$snpid)
pvalue_bip <- BIP[BIP$snpid %in% snp_result,]$pval
pvalue_scz <- SCZ[SCZ$snpid %in% snp_result,]$pval
BIP_jiaoji <- BIP[BIP$snpid %in% snp_result,]
SCZ_jiaoji <- SCZ[SCZ$snpid %in% snp_result,]
BIP_SCZ <- inner_join(BIP[,c('snpid','pval')],SCZ[,c('snpid','pval')],by="snpid")

# The parameter estimates are obtained using the EM algorithm
theta_est <- EM1(pvalue_bip, pvalue_scz,pi00_ini = 0.8, pi01_ini=0.01,pi10_ini=0.01,pi11_ini=0.1,alpha1_ini=0.2,alpha2_ini=0.2)
theta_est

## $pi00
## [1] 0.8173783
## 
## $pi01
## [1] 0.006304867
## 
## $pi10
## [1] 0.001133071
## 
## $pi11
## [1] 0.1751837
## 
## $alpha1
## [1] 0.5985608
## 
## $alpha2
## [1] 0.5039367

gamma_post <- theta_est$pi00+theta_est$pi01*theta_est$alpha2*pvalue_scz^(theta_est$alpha2-1)+theta_est$alpha1*pvalue_bip^(theta_est$alpha1-1)*theta_est$pi10+theta_est$pi11*theta_est$alpha1*theta_est$alpha2*pvalue_bip^(theta_est$alpha1-1)*pvalue_scz^(theta_est$alpha2-1)

z11_post <- theta_est$pi11*theta_est$alpha1*theta_est$alpha2*pvalue_bip^(theta_est$alpha1-1)*pvalue_scz^(theta_est$alpha2-1)/gamma_post
z01_post <- theta_est$pi01*theta_est$alpha2*pvalue_scz^(theta_est$alpha2-1)/gamma_post
z10_post <- theta_est$alpha1*pvalue_bip^(theta_est$alpha1-1)*theta_est$pi10/gamma_post
z00_post <- theta_est$pi00/gamma_post
z1_post <- z11_post+z10_post
z2_post <- z11_post+z01_post

# SNPS associated with both diseases
Z_est_11 <- assoc(z11_post, 0.1)
sum(Z_est_11)

## [1] 6956

# SNPS associated with BIP
Z_est_10 <- assoc(z1_post, 0.1)
sum(Z_est_10)

## [1] 7018

# SNPS associated with SCZ
Z_est_01 <- assoc(z2_post, 0.1)
sum(Z_est_01)

## [1] 8210

The coincidence of the ids of the three classes of SNPS was judged

snpid_bip <- BIP[BIP$snpid %in% snp_result,]$snpid
snp_bip = data.frame(snpid_bip = snpid_bip,bip_11 = Z_est_11,bip_10 = Z_est_10)
snpid_bip_11 <- snp_bip[which(snp_bip$bip_11 == 1),'snpid_bip']
snpid_bip_10 <- snp_bip[which(snp_bip$bip_10 == 1),'snpid_bip']

length(intersect(snpid_bip_11,snpid_bip_10))

## [1] 6956

# The high degree of overlap between those associated with both diseases and those associated with BIP

setdiff(snpid_bip_10,snpid_bip_11)

##  [1] "rs4908147"  "rs387176"   "rs263908"   "rs10921107" "rs2270543" 
##  [6] "rs17017120" "rs3845684"  "rs906867"   "rs13386455" "rs1526645" 
## [11] "rs7583278"  "rs4430889"  "rs1435845"  "rs1161474"  "rs6729882" 
## [16] "rs7641289"  "rs17718783" "rs9288851"  "rs16825602" "rs810471"  
## [21] "rs6829771"  "rs2646269"  "rs370831"   "rs391529"   "rs13128519"
## [26] "rs7733288"  "rs256014"   "rs9393672"  "rs9275523"  "rs7767277" 
## [31] "rs2842625"  "rs2138707"  "rs1934124"  "rs4357169"  "rs9387090" 
## [36] "rs13227417" "rs11761050" "rs6958911"  "rs2196"     "rs12342040"
## [41] "rs10760846" "rs7895653"  "rs3858136"  "rs10791838" "rs11174379"
## [46] "rs7300029"  "rs11174386" "rs11109376" "rs1262775"  "rs17107699"
## [51] "rs10518779" "rs16957422" "rs7190307"  "rs12943984" "rs7209032" 
## [56] "rs156430"   "rs4277599"  "rs2378249"  "rs6030341"  "rs11906231"
## [61] "rs2836779"  "rs926761"

# The only SNP associated with BIP but not with SCZ

snpid_scz <- SCZ[SCZ$snpid %in% snp_result,]$snpid
snp_scz = data.frame(snpid_scz = snpid_scz,scz_11 = Z_est_11,scz_10 = Z_est_01)
snpid_scz_11 <- snp_scz[which(snp_scz$scz_11 == 1),'snpid_scz']
snpid_scz_01 <- snp_scz[which(snp_scz$scz_10 == 1),'snpid_scz']

length(intersect(snpid_scz_11,snpid_scz_01))

## [1] 6956

Only the bipolar GWAS data was used for analysis

## Define the function that calculates the log-likelihood
# input: p, pi1, alpha
log_P_single <- function(P, pi1, alpha){
  return(sum(log(pi1*dbeta(P, alpha, 1) + (1 - pi1))))# 最终要优化的函数
}
## Define functions that implement the EM algorithm for the analysis of a single dataset
# input: P, pi1_ini, alpha_ini, max_iter, tol
# ouput: pi1_est, alpha_est
EM_single <- function(P, pi1_ini, alpha_ini, max_iter = 1e4, tol = 1e-6){
 L_ini <- log_P_single(P, pi1_ini, alpha_ini)
 for (iter in 1:max_iter){
   if (iter == 1){
     pi1_old <- pi1_ini
     alpha_old <- alpha_ini
     L_old <- L_ini
   }
 
   # E step
   comp_gamma <- pi1_old*dbeta(P, alpha_old, 1)
   gamma <- comp_gamma/(comp_gamma + 1 - pi1_old)
   
   # M step
   pi1_new <- mean(gamma)
   alpha_new <- -sum(gamma)/sum(gamma*log(P))
   L_new <- log_P_single(P, pi1_new, alpha_new)
   # whether the algorithm converges
   if (L_new < L_old){
     print("Error: log likelihoood is not increasing!")
     break
   }
   if ((L_new - L_old)/abs(L_new) < tol){
     break
   } else {
     pi1_old <- pi1_new
     alpha_old <- alpha_new
     L_old <- L_new
   }
 }
 
 return(list(pi1 = pi1_new, alpha = alpha_new))
}

theta_est_BIP <- EM_single(pvalue_bip, pi1_ini = 0.1, alpha_ini = 0.2)
theta_est_BIP

## $pi1
## [1] 0.2194204
## 
## $alpha
## [1] 0.6628928

posterior_bip <- (theta_est_BIP$pi1*theta_est_BIP$alpha*pvalue_bip^(theta_est_BIP$alpha - 1))/(theta_est_BIP$pi1*theta_est_BIP$alpha*pvalue_bip^(theta_est_BIP$alpha - 1) + 1 - theta_est_BIP$pi1)
Z_est_bip <- assoc(posterior_bip, 0.1)
sum(Z_est_bip)

## [1] 403

snp_bip_single = data.frame(snpid_bip = snpid_bip,bip_1 = Z_est_bip)
snpid_bip_1 <- snp_bip_single[which(snp_bip_single$bip_1 == 1),'snpid_bip']

length(intersect(snpid_bip_1,snpid_bip_10))

## [1] 387

Only the schizophrenia GWAS data was used for analysis

theta_est_SCZ <- EM_single(pvalue_scz, pi1_ini = 0.1, alpha_ini = 0.2)
theta_est_SCZ

## $pi1
## [1] 0.2419776
## 
## $alpha
## [1] 0.5668053

posterior_scz <- (theta_est_SCZ$pi1*theta_est_SCZ$alpha*pvalue_scz^(theta_est_SCZ$alpha - 1))/(theta_est_SCZ$pi1*theta_est_SCZ$alpha*pvalue_scz^(theta_est_SCZ$alpha - 1) + 1 - theta_est_SCZ$pi1)
Z_est_scz <- assoc(posterior_scz, 0.1)
sum(Z_est_scz)

## [1] 4012

snp_scz_single = data.frame(snpid_scz = snpid_scz,scz_1 = Z_est_scz)
snpid_scz_1 <- snp_scz_single[which(snp_scz_single$scz_1 == 1),'snpid_scz']

length(intersect(snpid_scz_1,snpid_scz_01))

## [1] 3692

Plot the Manhattan

FDR <- function(posterior, alpha){
  M          <- length(posterior)
  fdr        <- 1 - posterior
  rank.fdr   <- rank(fdr)
  sort.fdr   <- sort(fdr)
  cumsum.fdr <- cumsum(sort.fdr)
  sort.FDR   <- cumsum.fdr/seq(1, M, 1)
  FDR        <- sort.FDR[rank.fdr]

  return(FDR)
}

BIP

library(qqman)

##

## For example usage please run: vignette('qqman')

##

## Citation appreciated but not required:

## Turner, (2018). qqman: an R package for visualizing GWAS results using Q-Q and manhattan plots. Journal of Open Source Software, 3(25), 731, https://doi.org/10.21105/joss.00731.

##

SNP_BIP_snp <- as.vector(BIP$snpid[which(Z_est_bip == 1)])
BIP_jiaoji$fdr <- -log10(FDR(posterior_bip, 0.1))
manhattan(BIP_jiaoji, chr = "hg18chr", bp = "bp", p = "fdr", snp = "snpid", ylim = c(0, 3), col = c("blue4", "orange3"), cex = 0.6,logp = F,ylab = "-log_10(fdr)",
          main = "single BIP",
          suggestiveline = T, 
          genomewideline = F)

SCZ

SNP_SCZ_snp <- as.vector(BIP$snpid[which(Z_est_scz == 1)])
# SCZ_jiaoji$fdr <- -log10(assoc(posterior_scz, 0.1)$fdr)
SCZ_jiaoji$fdr <- -log10(FDR(posterior_scz, 0.1))
manhattan(SCZ_jiaoji, chr = "hg18chr", bp = "bp", p = "fdr", snp = "snpid", ylim = c(0, 3), col = c("blue4", "orange3"), cex = 0.6,logp = F,ylab = "-log_10(fdr)",
          main = "single SCZ",
          suggestiveline = T, 
          genomewideline = F)

Integrated analysis: BIP

SNP_BIP_with_SCZ <- as.vector(BIP_jiaoji$snpid[which(Z_est_10 == 1)])
# BIP_jiaoji$fdr1 <- -log10(assoc(z1_post, 0.1)$fdr)
BIP_jiaoji$fdr1 <- -log10(FDR(z1_post, 0.1))
manhattan(BIP_jiaoji, chr = "hg18chr", bp = "bp", p = "fdr1", snp = "snpid", ylim = c(0, 4), col = c("blue4", "orange3"), cex = 0.6,logp = F,ylab = "-log_10(fdr)",
          main = "BIP with SCZ",
          suggestiveline = T, 
          genomewideline = F)

Integrated analysis: SCZ

SNP_SCZ_with_BIP <- as.vector(SCZ_jiaoji$snpid[which(Z_est_01 == 1)])
# SCZ_jiaoji$fdr1 <- -log10(assoc(z2_post, 0.1)$fdr)
SCZ_jiaoji$fdr1 <- -log10(FDR(z2_post, 0.1))
manhattan(SCZ_jiaoji, chr = "hg18chr", bp = "bp", p = "fdr1", snp = "snpid", ylim = c(0, 4), col = c("blue4", "orange3"), cex = 0.6,logp = F,ylab = "-log_10(fdr)",
          main = "SCZ with BIP",
          suggestiveline = T, 
          genomewideline = F)

Extension: GPA http://dongjunchung.github.io/GPA/

# install.packages("devtools")
# library(devtools)
# install_github("dongjunchung/GPA")

library(GPA)

## 载入需要的程序包：Rcpp

## 载入需要的程序包：parallel

## 载入需要的程序包：ggrepel

## 载入需要的程序包：plyr

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

## 
## 载入程序包：'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## 载入需要的程序包：vegan

## 载入需要的程序包：permute

## 载入需要的程序包：lattice

## 
## 载入程序包：'lattice'

## The following object is masked from 'package:qqman':
## 
##     qq

## This is vegan 2.6-8

## 载入需要的程序包：DT

## 载入需要的程序包：shiny

## 
## 载入程序包：'shiny'

## The following objects are masked from 'package:DT':
## 
##     dataTableOutput, renderDataTable

## 载入需要的程序包：shinyBS

## 
## 载入程序包：'GPA'

## The following object is masked _by_ '.GlobalEnv':
## 
##     assoc

## The following object is masked from 'package:stats':
## 
##     cov

fit.GPA.noAnn <- GPA( BIP_SCZ[ , c(2,3) ], NULL )

## Info: Number of GWAS data: 2

## Info: Theoretically null distribution is used.

## Info: No annotation data is provided.

## Info: Lower bound for pi estimates is set to 1 / [number of SNPs].

## Info: EM algorithm stops because it reaches the specified maximum number of iterations (maxIter = 2000).

fit.GPA.noAnn

## Summary: GPA model fitting results (class: GPA)
## --------------------------------------------------
## Data summary:
##  Number of GWAS data: 2
##  Number of SNPs: 1064236
##  Number of annotation data: (not provided)
## Model setting:
##  Theoretical null distribution is assumed.
## Parameter estimates (standard errors):
##  alpha: 0.603 0.503
##       ( 0.005 0.003 )
##  GWAS combination: 00 10 01 11
##  pi: 0.82 0 0 0.18
##    ( 0.003 0.003 0.004 0.005 )
## --------------------------------------------------

estimates( fit.GPA.noAnn )

## $pis
##           00           10           01           11 
## 8.197874e-01 9.396412e-07 6.278356e-06 1.802054e-01 
## 
## $betaAlpha
## [1] 0.6028026 0.5027528

assoc.GPA.noAnn <- GPA::assoc( fit.GPA.noAnn, FDR=0.10, fdrControl="global" )

## Info: Association mapping based on the global FDR control at level 0.1.

table(assoc.GPA.noAnn[,1])#BIP

## 
##       0       1 
## 1055956    8280

table(assoc.GPA.noAnn[,2])#SCZ

## 
##       0       1 
## 1055954    8282

#table(assoc.GPA.noAnn)

# association with SCZ and BIP
assoc11.GPA.noAnn <- GPA::assoc( fit.GPA.noAnn, FDR=0.10, fdrControl="global", pattern="11" )

## Info: Association mapping based on the global FDR control at level 0.1.

table(assoc11.GPA.noAnn)

## assoc11.GPA.noAnn
##       0       1 
## 1055956    8280

# Test for pleiotropy
 fit.GPA.pleiotropy.H0 <- GPA( BIP_SCZ[ , c(2,3) ], NULL, pleiotropyH0=TRUE )

## Info: Number of GWAS data: 2

## Info: Theoretically null distribution is used.

## Info: No annotation data is provided.

## Info: Fit the GPA model under H0 of the pleitropy test.

## Info: Lower bound for pi estimates is set to 1 / [number of SNPs].

## Info: EM algorithm stops in 1612-th iteration because there is no improvements in log likelihood.

fit.GPA.pleiotropy.H0

## Summary: GPA model fitting results (class: GPA)
## --------------------------------------------------
## Data summary:
##  Number of GWAS data: 2
##  Number of SNPs: 1064236
##  Number of annotation data: (not provided)
## Model setting:
##  Theoretical null distribution is assumed.
##  GPA is fitted under H0 of pleiotropy LRT.
## Parameter estimates (standard errors):
##  alpha: 0.669 0.571
##       ( 0.006 0.003 )
##  GWAS combination: 00 10 01 11
##  pi: 0.582 0.171 0.191 0.056
##    ( 0.005 0.005 0.004 0.004 )
## --------------------------------------------------

test.GPA.pleiotropy <- pTest( fit.GPA.noAnn, fit.GPA.pleiotropy.H0 )

## Hypothesis testing for pleiotropy
## --------------------------------------------------
## GWAS combination:  00 10 01 11 
## pi:  0.82 0 0 0.18 
##   (  0.003 0.003 0.004 0.005  )
## 
## test statistics:  8587.519 
## p-value:  0 
##   --------------------------------------------------

test.GPA.pleiotropy

## $pi
##           00           10           01           11 
## 8.197874e-01 9.396412e-07 6.278356e-06 1.802054e-01 
## 
## $piSE
##                   pi_10       pi_01       pi_11 
## 0.002961348 0.003494226 0.004344911 0.004672846 
## 
## $statistics
## iteration_1612 
##       8587.519 
## 
## $pvalue
## iteration_1612 
##              0

Integration of GWAS Data for Related Diseases

Wang Yingxuan; Qin Jiayue; Wuxin

2024-10-18

real data

The coincidence of the ids of the three classes of SNPS was judged

Only the bipolar GWAS data was used for analysis

Only the schizophrenia GWAS data was used for analysis