## Define the data generation function
# input: M, pi00, pi01, pi10, pi11, alpha1, alpha2
# output: P1, P2, Z00, Z01, Z10, Z11
data_generate1 <- function(M, pi00, pi01, pi10, pi11, alpha1, alpha2){
  P1 <- numeric(M)
  P2 <- numeric(M)
  Z00 <- numeric(M)
  Z01 <- numeric(M)
  Z10 <- numeric(M)
  Z11 <- numeric(M)
  # Randomly partition the M SNPs into the four association classes,
  # drawing each class without replacement from the indices that remain.
  C <- 1:M
  index00 <- sample(C, M*pi00)
  C1 <- C[-index00]
  index01 <- sample(C1, M*pi01)
  C2 <- C1[-match(index01, C1)]
  index10 <- sample(C2, M*pi10)
  C3 <- C2[-match(index10, C2)]
  index11 <- sample(C3, M*pi11)
  # latent class indicators (all other entries stay 0)
  Z00[index00] <- 1
  Z01[index01] <- 1
  Z10[index10] <- 1
  Z11[index11] <- 1
  # P1 is Uniform(0,1) when disease 1 is not involved and Beta(alpha1, 1) when it is;
  # P2 is defined analogously with alpha2.
  P1[index00] <- runif(M*pi00)
  P1[index01] <- runif(M*pi01)
  P1[index10] <- rbeta(M*pi10, alpha1, 1)
  P1[index11] <- rbeta(M*pi11, alpha1, 1)
  P2[index00] <- runif(M*pi00)
  P2[index01] <- rbeta(M*pi01, alpha2, 1)
  P2[index10] <- runif(M*pi10)
  P2[index11] <- rbeta(M*pi11, alpha2, 1)
  return(list(P1 = P1, P2 = P2, Z00 = Z00, Z01 = Z01, Z10 = Z10, Z11 = Z11))
}
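# Optional sanity check of data_generate1 (an illustrative sketch; the smaller M and the object
# name "check" below are assumptions for speed, not settings used in the analysis). Because the
# classes are sampled without replacement in exact proportions, the class frequencies should
# match the requested pi's exactly.
check <- data_generate1(M = 10000, pi00 = 0.7, pi01 = 0.1, pi10 = 0.15, pi11 = 0.05,
                        alpha1 = 0.2, alpha2 = 0.2)
c(mean(check$Z00), mean(check$Z01), mean(check$Z10), mean(check$Z11))  # 0.70 0.10 0.15 0.05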
## Define the function that computes the observed-data log-likelihood
# input: P1, P2, pi00, pi01, pi10, pi11, alpha1, alpha2
# output: the log-likelihood of the four-component mixture at these parameter values
log_p1 <- function(P1, P2, pi00, pi01, pi10, pi11, alpha1, alpha2){
  return(sum(log(pi00 +
                 pi01*alpha2*P2^(alpha2 - 1) +
                 pi10*alpha1*P1^(alpha1 - 1) +
                 pi11*alpha1*alpha2*P2^(alpha2 - 1)*P1^(alpha1 - 1))))
}
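For reference, log_p1 sums the observed-data log-likelihood of the four-group mixture, in which a Uniform(0,1) component contributes density 1 and a Beta(alpha, 1) component contributes alpha*p^(alpha-1):
$$
\ell(\boldsymbol{\pi},\alpha_1,\alpha_2)=\sum_{i=1}^{M}\log\!\Big(\pi_{00}+\pi_{01}\,\alpha_2 p_{2i}^{\alpha_2-1}+\pi_{10}\,\alpha_1 p_{1i}^{\alpha_1-1}+\pi_{11}\,\alpha_1\alpha_2\,p_{1i}^{\alpha_1-1}p_{2i}^{\alpha_2-1}\Big).
$$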
## Define the EM iteration for the joint (two-GWAS) model
# The maximum number of iterations defaults to max_iter = 1000 to guard against non-convergence.
# Each iteration performs an E step and an M step, recomputes the log-likelihood, and checks convergence.
EM1 <- function(P1, P2, pi00_ini, pi01_ini, pi10_ini, pi11_ini, alpha1_ini, alpha2_ini, max_iter = 1000){
  # initialize parameters and log-likelihood
  pi00_old <- pi00_ini
  pi01_old <- pi01_ini
  pi10_old <- pi10_ini
  pi11_old <- pi11_ini
  alpha1_old <- alpha1_ini
  alpha2_old <- alpha2_ini
  L_old <- log_p1(P1, P2, pi00_ini, pi01_ini, pi10_ini, pi11_ini, alpha1_ini, alpha2_ini)
  for (iter in 1:max_iter){
    ## E step: posterior responsibilities of the four latent classes
    gamma <- pi00_old +
             pi01_old*alpha2_old*P2^(alpha2_old - 1) +
             pi10_old*alpha1_old*P1^(alpha1_old - 1) +
             pi11_old*alpha1_old*alpha2_old*P1^(alpha1_old - 1)*P2^(alpha2_old - 1)
    gamma_z00 <- pi00_old/gamma
    gamma_z01 <- pi01_old*alpha2_old*P2^(alpha2_old - 1)/gamma
    gamma_z10 <- pi10_old*alpha1_old*P1^(alpha1_old - 1)/gamma
    gamma_z11 <- pi11_old*alpha1_old*alpha2_old*P1^(alpha1_old - 1)*P2^(alpha2_old - 1)/gamma
    ## M step: closed-form updates of the mixing proportions and Beta shape parameters
    pi00_new <- mean(gamma_z00)
    pi01_new <- mean(gamma_z01)
    pi10_new <- mean(gamma_z10)
    pi11_new <- mean(gamma_z11)
    alpha1_new <- -sum(gamma_z11 + gamma_z10)/sum((gamma_z11 + gamma_z10)*log(P1))
    alpha2_new <- -sum(gamma_z11 + gamma_z01)/sum((gamma_z11 + gamma_z01)*log(P2))
    ## recompute the log-likelihood
    L_new <- log_p1(P1, P2, pi00_new, pi01_new, pi10_new, pi11_new, alpha1_new, alpha2_new)
    ## the log-likelihood must not decrease; stop when the relative increase falls below tolerance
    if (L_new < L_old){
      print("Error: log likelihood is not increasing!")
      break
    }
    if ((L_new - L_old)/abs(L_new) < 1e-5){
      break
    } else {
      pi00_old <- pi00_new
      pi01_old <- pi01_new
      pi10_old <- pi10_new
      pi11_old <- pi11_new
      alpha1_old <- alpha1_new
      alpha2_old <- alpha2_new
      L_old <- L_new
    }
  }
  return(list(pi00 = pi00_new, pi01 = pi01_new, pi10 = pi10_new, pi11 = pi11_new,
              alpha1 = alpha1_new, alpha2 = alpha2_new))
}
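Written out, the EM1 updates are the standard quantities for this mixture: the E step computes the responsibilities of the four latent classes and the M step has closed forms,
$$
\gamma_{kl,i}=\frac{\pi_{kl}\,\big(\alpha_1 p_{1i}^{\alpha_1-1}\big)^{k}\big(\alpha_2 p_{2i}^{\alpha_2-1}\big)^{l}}{\sum_{k',l'}\pi_{k'l'}\,\big(\alpha_1 p_{1i}^{\alpha_1-1}\big)^{k'}\big(\alpha_2 p_{2i}^{\alpha_2-1}\big)^{l'}},\qquad
\hat\pi_{kl}=\frac{1}{M}\sum_{i}\gamma_{kl,i},
$$
$$
\hat\alpha_1=-\frac{\sum_i(\gamma_{10,i}+\gamma_{11,i})}{\sum_i(\gamma_{10,i}+\gamma_{11,i})\log p_{1i}},\qquad
\hat\alpha_2=-\frac{\sum_i(\gamma_{01,i}+\gamma_{11,i})}{\sum_i(\gamma_{01,i}+\gamma_{11,i})\log p_{2i}}.
$$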
# set random seed
set.seed(1)
# generate simulation data
M <- 100000
data <- data_generate1(M,0.7,0.1,0.15,0.05,0.2,0.2)
# Obtain parameter estimates with the EM algorithm and check that they agree with the true values used to generate the data
theta_est <- EM1(data$P1,data$P2,pi00_ini=0.6,pi01_ini=0.05,pi10_ini=0.15,pi11_ini=0.01,alpha1_ini=0.2,alpha2_ini=0.1)
print(theta_est)
## $pi00
## [1] 0.6986258
##
## $pi01
## [1] 0.0995448
##
## $pi10
## [1] 0.1532531
##
## $pi11
## [1] 0.0485763
##
## $alpha1
## [1] 0.2014528
##
## $alpha2
## [1] 0.1994058
# The estimates are close to the true values.
# Calculate the posterior probabilities of the latent classes at the estimated parameters
gamma_post <- theta_est$pi00 +
  theta_est$pi01*theta_est$alpha2*data$P2^(theta_est$alpha2 - 1) +
  theta_est$pi10*theta_est$alpha1*data$P1^(theta_est$alpha1 - 1) +
  theta_est$pi11*theta_est$alpha1*theta_est$alpha2*data$P1^(theta_est$alpha1 - 1)*data$P2^(theta_est$alpha2 - 1)
z11_post <- theta_est$pi11*theta_est$alpha1*theta_est$alpha2*data$P1^(theta_est$alpha1 - 1)*data$P2^(theta_est$alpha2 - 1)/gamma_post
z01_post <- theta_est$pi01*theta_est$alpha2*data$P2^(theta_est$alpha2 - 1)/gamma_post
z10_post <- theta_est$pi10*theta_est$alpha1*data$P1^(theta_est$alpha1 - 1)/gamma_post
z00_post <- theta_est$pi00/gamma_post
z1_post <- z11_post + z10_post  # posterior probability of association with disease 1
z2_post <- z11_post + z01_post  # posterior probability of association with disease 2
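These are the same responsibilities evaluated at the EM estimates, and the marginal posterior probabilities of association are obtained by summing over the relevant classes:
$$
P(Z_{1i}=1\mid p_{1i},p_{2i})=\gamma_{10,i}+\gamma_{11,i},\qquad
P(Z_{2i}=1\mid p_{1i},p_{2i})=\gamma_{01,i}+\gamma_{11,i},
$$
which is why z1_post and z2_post are formed by adding the corresponding components.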
# Given a level alpha that controls the FDR, the assoc function identifies SNPs associated with the
# disease based on the posterior probabilities of the latent variables.
# input: posterior, alpha
#   posterior is an M-dimensional vector whose ith element is the posterior probability that z_i = 1 for the ith SNP
#   alpha is the target FDR level
# output: Z_est, an M-dimensional vector whose ith element indicates whether the ith SNP is declared
#   associated with the disease (1 = associated, 0 = not associated)
assoc <- function(posterior, alpha){
  M <- length(posterior)
  fdr <- 1 - posterior                   # local fdr of each SNP
  rank.fdr <- rank(fdr)
  sort.fdr <- sort(fdr)
  cumsum.fdr <- cumsum(sort.fdr)
  sort.FDR <- cumsum.fdr/seq(1, M, 1)    # running mean of the sorted local fdr = estimated FDR
  FDR <- sort.FDR[rank.fdr]
  Z_est <- rep(0, M)
  Z_est[which(FDR <= alpha)] <- 1
  return(Z_est)
}
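# A toy illustration of what assoc does (the posterior values below are made up): the local fdr is
# 1 - posterior, and a SNP is declared associated when the running mean of the sorted local fdr
# (the estimated FDR) is at most alpha.
toy_post <- c(0.99, 0.97, 0.60, 0.10)  # hypothetical posteriors of z = 1
assoc(toy_post, 0.1)                   # returns 1 1 0 0: only the first two pass the FDR threshold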
# Use the assoc function on the simulated data (controlling FDR at level 0.1): feed it the posterior
# probability of z = 1 computed from the EM parameter estimates and obtain Z_est.
Z_est_11 <- assoc(z11_post, 0.1)
sum(Z_est_11)
## [1] 817
## Calculate FDP and power
# Compare the true Z with the Z_est obtained in the previous step using table(Z_est, Z).
# V is the count of (Z_est = 1, Z = 0), S is the count of (Z_est = 1, Z = 1), R is the count of (Z_est = 1),
# M0 is the count of (Z = 0), and M - M0 is the count of (Z = 1).
# FDP = V/R and power = S/(M - M0); the FDP should be controlled near 0.1 if the procedure works.
t<-table(Z_est_11, data$Z11)
t
##
## Z_est_11 0 1
## 0 94932 4251
## 1 68 749
FDP <- t[2, 1]/(t[2, 1] + t[2, 2])
FDP
## [1] 0.08323133
power<-t[2,2]/(t[1,2]+t[2,2])
power
## [1] 0.1498
# SNPs related to disease 1
Z_est_1 <- assoc(z1_post, 0.1)
sum(Z_est_1)
## [1] 9159
t<-table(Z_est_1, data$Z11+data$Z10)
t
##
## Z_est_1 0 1
## 0 79057 11784
## 1 943 8216
FDP <- t[2, 1]/(t[2, 1] + t[2, 2])
FDP
## [1] 0.1029588
power<-t[2,2]/(t[1,2]+t[2,2])
power
## [1] 0.4108
# SNPs related to disease 2
Z_est_2 <- assoc(z2_post, 0.1)
sum(Z_est_2)
## [1] 6256
t<-table(Z_est_2, data$Z11+data$Z01)
t
##
## Z_est_2 0 1
## 0 84395 9349
## 1 605 5651
FDP <- t[2, 1]/(t[2, 1] + t[2, 2])
FDP
## [1] 0.09670716
power<-t[2,2]/(t[1,2]+t[2,2])
power
## [1] 0.3767333
# Repeat the simulation 20 times; record pi00_est, pi01_est, pi10_est, pi11_est, alpha1_est, alpha2_est, FDP and power, and plot boxplots.
rep <- 20
pi00_est <- pi01_est <-pi10_est <- pi11_est <- alpha1_est <- alpha2_est <- Z11_FDP <- Z1_FDP <- Z2_FDP <- Z11_power <- Z1_power <- Z2_power <- numeric(rep)
for (i in 1:rep) {
  set.seed(i)
  data <- data_generate1(M, 0.7, 0.1, 0.15, 0.05, 0.2, 0.2)
  theta_est <- EM1(data$P1, data$P2, pi00_ini = 0.6, pi01_ini = 0.05, pi10_ini = 0.15,
                   pi11_ini = 0.01, alpha1_ini = 0.2, alpha2_ini = 0.1)
  pi00_est[i] <- theta_est$pi00
  pi01_est[i] <- theta_est$pi01
  pi10_est[i] <- theta_est$pi10
  pi11_est[i] <- theta_est$pi11
  alpha1_est[i] <- theta_est$alpha1
  alpha2_est[i] <- theta_est$alpha2
  # posterior probabilities at the estimated parameters
  gamma_post <- theta_est$pi00 +
    theta_est$pi01*theta_est$alpha2*data$P2^(theta_est$alpha2 - 1) +
    theta_est$pi10*theta_est$alpha1*data$P1^(theta_est$alpha1 - 1) +
    theta_est$pi11*theta_est$alpha1*theta_est$alpha2*data$P1^(theta_est$alpha1 - 1)*data$P2^(theta_est$alpha2 - 1)
  z11_post <- theta_est$pi11*theta_est$alpha1*theta_est$alpha2*data$P1^(theta_est$alpha1 - 1)*data$P2^(theta_est$alpha2 - 1)/gamma_post
  z01_post <- theta_est$pi01*theta_est$alpha2*data$P2^(theta_est$alpha2 - 1)/gamma_post
  z10_post <- theta_est$pi10*theta_est$alpha1*data$P1^(theta_est$alpha1 - 1)/gamma_post
  z00_post <- theta_est$pi00/gamma_post
  z1_post <- z11_post + z10_post
  z2_post <- z11_post + z01_post
  # FDP and power for the three association patterns
  Z_est_11 <- assoc(z11_post, 0.1)
  t11 <- table(Z_est_11, data$Z11)
  Z11_FDP[i] <- t11[2, 1]/(t11[2, 1] + t11[2, 2])
  Z11_power[i] <- t11[2, 2]/(t11[1, 2] + t11[2, 2])
  Z_est_1 <- assoc(z1_post, 0.1)
  t1 <- table(Z_est_1, data$Z11 + data$Z10)
  Z1_FDP[i] <- t1[2, 1]/(t1[2, 1] + t1[2, 2])
  Z1_power[i] <- t1[2, 2]/(t1[1, 2] + t1[2, 2])
  Z_est_2 <- assoc(z2_post, 0.1)
  t2 <- table(Z_est_2, data$Z11 + data$Z01)
  Z2_FDP[i] <- t2[2, 1]/(t2[2, 1] + t2[2, 2])
  Z2_power[i] <- t2[2, 2]/(t2[1, 2] + t2[2, 2])
}
boxplot(pi00_est,ylim=c(0.693,0.706))
title("pi00_est")
abline(h=0.7,col="red")
boxplot(pi01_est)
title("pi01_est")
abline(h=0.1,col="red")
boxplot(pi10_est)
title("pi10_est")
abline(h=0.15,col="red")
boxplot(pi11_est)
title("pi11_est")
abline(h=0.05,col="red")
boxplot(alpha1_est)
title("alpha1_est")
abline(h=0.2,col="red")
boxplot(alpha2_est)
title("alpha2_est")
abline(h=0.2,col="red")
boxplot(Z11_FDP)
title("Z11_FDP")
abline(h=0.1,col="red")
boxplot(Z1_FDP)
title("Z1_FDP")
abline(h=0.1,col="red")
boxplot(Z2_FDP)
title("Z2_FDP")
abline(h=0.1,col="red")
boxplot(Z11_power)
title("Z11_power")
boxplot(Z1_power)
title("Z1_power")
boxplot(Z2_power)
title("Z2_power")
## Define the function that calculates the log-likelihood for a single GWAS
# input: P, pi1, alpha
log_X <- function(P, pi1, alpha){
  return(sum(log(pi1*alpha*P^(alpha - 1) + 1 - pi1)))
}
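log_X is the log-likelihood of the two-group model for a single GWAS, in which each p-value is Beta(alpha, 1) with probability pi1 and Uniform(0,1) otherwise:
$$
\ell(\pi_1,\alpha)=\sum_{i=1}^{M}\log\!\big(\pi_1\,\alpha\,p_i^{\alpha-1}+(1-\pi_1)\big).
$$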
## Define the function that implements the EM algorithm for a single GWAS
# input: P, pi1_ini, alpha_ini, max_iter, tol
# output: pi1_est, alpha_est
EM <- function(P, pi1_ini, alpha_ini, max_iter = 10000, tol = 1e-6){
  # initialize parameters and log-likelihood; max_iter guards against non-convergence
  pi1_old <- pi1_ini
  alpha_old <- alpha_ini
  L_old <- log_X(P, pi1_ini, alpha_ini)
  for (iter in 1:max_iter){
    ## E step: posterior probability that each p-value comes from the Beta component
    comp_gamma <- pi1_old*dbeta(P, alpha_old, 1)
    gamma <- comp_gamma/(comp_gamma + (1 - pi1_old)*dunif(P, 0, 1))
    ## M step: closed-form updates
    pi1_new <- mean(gamma)
    alpha_new <- -sum(gamma)/sum(gamma*log(P))
    ## recompute the log-likelihood
    L_new <- log_X(P, pi1_new, alpha_new)
    ## the log-likelihood must not decrease; stop when the relative increase falls below tol
    if (L_new < L_old){
      print("Error: log likelihood is not increasing!")
      break
    }
    if ((L_new - L_old)/abs(L_new) < tol){
      break
    } else {
      pi1_old <- pi1_new
      alpha_old <- alpha_new
      L_old <- L_new
    }
  }
  return(list(pi1 = pi1_new, alpha = alpha_new))
}
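# Quick check of EM on data where the truth is known (an illustrative sketch; the seed, sample size
# and parameter values below are assumptions, not results used elsewhere in this report).
set.seed(2023)
p_sim <- ifelse(runif(50000) < 0.2, rbeta(50000, 0.2, 1), runif(50000))
EM(p_sim, pi1_ini = 0.1, alpha_ini = 0.5)  # pi1 should come out near 0.2 and alpha near 0.2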
rep <- 20
FDP1 <- numeric(rep)
power1 <- numeric(rep)
FDP2 <- numeric(rep)
power2 <- numeric(rep)
pi1_est <- numeric(rep)
pi2_est <- numeric(rep)
alpha1_est <- numeric(rep)
alpha2_est <- numeric(rep)
for (i in 1:rep){
  set.seed(i)
  data <- data_generate1(M, 0.7, 0.1, 0.15, 0.05, 0.2, 0.2)
  est1 <- EM(data$P1, 0.1, 0.1)
  est2 <- EM(data$P2, 0.1, 0.1)
  pi1_est[i] <- est1$pi1
  alpha1_est[i] <- est1$alpha
  pi2_est[i] <- est2$pi1
  alpha2_est[i] <- est2$alpha
  posterior1 <- (est1$pi1*est1$alpha*data$P1^(est1$alpha - 1))/(est1$pi1*est1$alpha*data$P1^(est1$alpha - 1) + 1 - est1$pi1)
  posterior2 <- (est2$pi1*est2$alpha*data$P2^(est2$alpha - 1))/(est2$pi1*est2$alpha*data$P2^(est2$alpha - 1) + 1 - est2$pi1)
  Z_est1 <- assoc(posterior1, 0.1)
  Z_est2 <- assoc(posterior2, 0.1)
  t1 <- table(Z_est1, data$Z11 + data$Z10)
  t2 <- table(Z_est2, data$Z11 + data$Z01)
  FDP1[i] <- t1[2, 1]/(t1[2, 1] + t1[2, 2])
  power1[i] <- t1[2, 2]/(t1[1, 2] + t1[2, 2])
  FDP2[i] <- t2[2, 1]/(t2[2, 1] + t2[2, 2])
  power2[i] <- t2[2, 2]/(t2[1, 2] + t2[2, 2])
}
boxplot(pi1_est)
title("pi1_est")
abline(h=0.2,col="red")
boxplot(pi2_est)
title("pi2_est")
abline(h=0.15,col="red")
boxplot(alpha1_est)
title("alpha1_est")
abline(h=0.2,col="red")
boxplot(alpha2_est)
title("alpha2_est")
abline(h=0.2,col="red")
FDP_SUM <- cbind(FDP1,FDP2)
boxplot(FDP_SUM)
title("FDP")
abline(h=0.1,col="red")
power_SUM <- cbind(Z1_power,Z2_power,power1,power2)
boxplot(power_SUM)
title("power")
# Keep pi11 and pi01 fixed while varying pi00 and pi10 (pi00 + pi10 stays constant, so with
# pi01 = 0.2 and pi11 = 0.15 the four proportions still sum to 1)
rep <- 5  # number of replicates per setting
pi00 <- c(0.5, 0.55, 0.6)
pi10 <- rep(c(0.15, 0.1, 0.05), each = rep)
pi00_est <- pi01_est <- pi10_est <- pi11_est <- alpha1_est <- alpha2_est <-
  Z11_FDP <- Z1_FDP <- Z2_FDP <- Z11_power <- Z1_power <- Z2_power <-
  data.frame(pi00 = rep(pi00, each = rep), value = numeric(length(pi00)*rep))
for (i in 1:(length(pi00)*rep)){
  set.seed(i)
  data <- data_generate1(100000, pi00_est$pi00[i], 0.2, pi10[i], 0.15, 0.2, 0.2)
  theta_est <- EM1(data$P1, data$P2, pi00_ini = 0.5, pi01_ini = 0.1, pi10_ini = 0.1,
                   pi11_ini = 0.1, alpha1_ini = 0.1, alpha2_ini = 0.1)
  pi00_est$value[i] <- theta_est$pi00
  pi01_est$value[i] <- theta_est$pi01
  pi10_est$value[i] <- theta_est$pi10
  pi11_est$value[i] <- theta_est$pi11
  alpha1_est$value[i] <- theta_est$alpha1
  alpha2_est$value[i] <- theta_est$alpha2
  gamma_post <- theta_est$pi00 +
    theta_est$pi01*theta_est$alpha2*data$P2^(theta_est$alpha2 - 1) +
    theta_est$pi10*theta_est$alpha1*data$P1^(theta_est$alpha1 - 1) +
    theta_est$pi11*theta_est$alpha1*theta_est$alpha2*data$P1^(theta_est$alpha1 - 1)*data$P2^(theta_est$alpha2 - 1)
  z11_post <- theta_est$pi11*theta_est$alpha1*theta_est$alpha2*data$P1^(theta_est$alpha1 - 1)*data$P2^(theta_est$alpha2 - 1)/gamma_post
  z01_post <- theta_est$pi01*theta_est$alpha2*data$P2^(theta_est$alpha2 - 1)/gamma_post
  z10_post <- theta_est$pi10*theta_est$alpha1*data$P1^(theta_est$alpha1 - 1)/gamma_post
  z00_post <- theta_est$pi00/gamma_post
  z1_post <- z11_post + z10_post
  z2_post <- z11_post + z01_post
  Z_est_11 <- assoc(z11_post, 0.1)
  t11 <- table(Z_est_11, data$Z11)
  Z11_FDP$value[i] <- t11[2, 1]/(t11[2, 1] + t11[2, 2])
  Z11_power$value[i] <- t11[2, 2]/(t11[1, 2] + t11[2, 2])
  Z_est_1 <- assoc(z1_post, 0.1)
  t1 <- table(Z_est_1, data$Z11 + data$Z10)
  Z1_FDP$value[i] <- t1[2, 1]/(t1[2, 1] + t1[2, 2])
  Z1_power$value[i] <- t1[2, 2]/(t1[1, 2] + t1[2, 2])
  Z_est_2 <- assoc(z2_post, 0.1)
  t2 <- table(Z_est_2, data$Z11 + data$Z01)
  Z2_FDP$value[i] <- t2[2, 1]/(t2[2, 1] + t2[2, 2])
  Z2_power$value[i] <- t2[2, 2]/(t2[1, 2] + t2[2, 2])
}
library(ggplot2)
pi00_est$pi00 <- as.factor(pi00_est$pi00)
ggplot(pi00_est,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+geom_hline(yintercept = pi00,linetype="dashed",color="red")+ggtitle("pi00_est")
pi01_est$pi00 <- as.factor(pi01_est$pi00)
ggplot(pi01_est,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+geom_hline(yintercept = 0.2,linetype="dashed",color="red")+ggtitle("pi01_est")
pi10_est$pi00 <- as.factor(pi10_est$pi00)
ggplot(pi10_est,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+geom_hline(yintercept = c(0.15,0.1,0.05),linetype="dashed",color="red")+ggtitle("pi10_est")
pi11_est$pi00 <- as.factor(pi11_est$pi00)
ggplot(pi11_est,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+geom_hline(yintercept = 0.15,linetype="dashed",color="red")+ggtitle("pi11_est")
alpha1_est$pi00 <- as.factor(alpha1_est$pi00)
ggplot(alpha1_est,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+geom_hline(yintercept = 0.2,linetype="dashed",color="red")+ggtitle("alpha1_est")
alpha2_est$pi00 <- as.factor(alpha2_est$pi00)
ggplot(alpha2_est,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+geom_hline(yintercept = 0.2,linetype="dashed",color="red")+ggtitle("alpha2_est")
Z1_power$pi00 <- as.factor(Z1_power$pi00)
ggplot(Z1_power,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+ggtitle("Z1_power")
Z2_power$pi00 <- as.factor(Z2_power$pi00)
ggplot(Z2_power,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+ggtitle("Z2_power")
Z11_power$pi00 <- as.factor(Z11_power$pi00)
ggplot(Z11_power,aes(y=value,x=pi00,color=pi00))+geom_boxplot()+ggtitle("Z11_power")
BIP <- read.table("E:\\qjy\\ecnu\\大三第二学期\\生物医疗\\生物医疗小组作业2\\tw2_data\\pgc.bip.full.2012-04.txt", header = T)
SCZ <- read.table("E:\\qjy\\ecnu\\大三第二学期\\生物医疗\\生物医疗小组作业2\\tw2_data\\pgc.scz.full.2012-04.txt", header = T)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Take the intersection of SNPs present in both studies
snp_result <- intersect(BIP$snpid, SCZ$snpid)
pvalue_bip <- BIP[BIP$snpid %in% snp_result, ]$pval
pvalue_scz <- SCZ[SCZ$snpid %in% snp_result, ]$pval
BIP_jiaoji <- BIP[BIP$snpid %in% snp_result, ]
SCZ_jiaoji <- SCZ[SCZ$snpid %in% snp_result, ]
BIP_SCZ <- inner_join(BIP[, c('snpid', 'pval')], SCZ[, c('snpid', 'pval')], by = "snpid")
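# The two p-value vectors above are aligned by row order after the %in% filtering, which assumes the
# shared SNPs appear in the same order in both files; a cheap check (illustrative only):
all(as.character(BIP_jiaoji$snpid) == as.character(SCZ_jiaoji$snpid))  # should be TRUE; otherwise use
# BIP_SCZ, whose pval.x / pval.y columns are matched by snpid via inner_join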
# The parameter estimates are obtained using the EM algorithm
theta_est <- EM1(pvalue_bip, pvalue_scz,pi00_ini = 0.8, pi01_ini=0.01,pi10_ini=0.01,pi11_ini=0.1,alpha1_ini=0.2,alpha2_ini=0.2)
theta_est
## $pi00
## [1] 0.8173783
##
## $pi01
## [1] 0.006304867
##
## $pi10
## [1] 0.001133071
##
## $pi11
## [1] 0.1751837
##
## $alpha1
## [1] 0.5985608
##
## $alpha2
## [1] 0.5039367
gamma_post <- theta_est$pi00 +
  theta_est$pi01*theta_est$alpha2*pvalue_scz^(theta_est$alpha2 - 1) +
  theta_est$pi10*theta_est$alpha1*pvalue_bip^(theta_est$alpha1 - 1) +
  theta_est$pi11*theta_est$alpha1*theta_est$alpha2*pvalue_bip^(theta_est$alpha1 - 1)*pvalue_scz^(theta_est$alpha2 - 1)
z11_post <- theta_est$pi11*theta_est$alpha1*theta_est$alpha2*pvalue_bip^(theta_est$alpha1 - 1)*pvalue_scz^(theta_est$alpha2 - 1)/gamma_post
z01_post <- theta_est$pi01*theta_est$alpha2*pvalue_scz^(theta_est$alpha2 - 1)/gamma_post
z10_post <- theta_est$pi10*theta_est$alpha1*pvalue_bip^(theta_est$alpha1 - 1)/gamma_post
z00_post <- theta_est$pi00/gamma_post
z1_post <- z11_post + z10_post
z2_post <- z11_post + z01_post
# SNPs associated with both diseases
Z_est_11 <- assoc(z11_post, 0.1)
sum(Z_est_11)
## [1] 6956
# SNPs associated with BIP
Z_est_10 <- assoc(z1_post, 0.1)
sum(Z_est_10)
## [1] 7018
# SNPs associated with SCZ
Z_est_01 <- assoc(z2_post, 0.1)
sum(Z_est_01)
## [1] 8210
snpid_bip <- BIP[BIP$snpid %in% snp_result,]$snpid
snp_bip = data.frame(snpid_bip = snpid_bip,bip_11 = Z_est_11,bip_10 = Z_est_10)
snpid_bip_11 <- snp_bip[which(snp_bip$bip_11 == 1),'snpid_bip']
snpid_bip_10 <- snp_bip[which(snp_bip$bip_10 == 1),'snpid_bip']
length(intersect(snpid_bip_11,snpid_bip_10))
## [1] 6956
# The SNPs flagged as associated with both diseases are all contained in those flagged as associated with BIP, so the two sets overlap almost completely.
setdiff(snpid_bip_10,snpid_bip_11)
## [1] "rs4908147" "rs387176" "rs263908" "rs10921107" "rs2270543"
## [6] "rs17017120" "rs3845684" "rs906867" "rs13386455" "rs1526645"
## [11] "rs7583278" "rs4430889" "rs1435845" "rs1161474" "rs6729882"
## [16] "rs7641289" "rs17718783" "rs9288851" "rs16825602" "rs810471"
## [21] "rs6829771" "rs2646269" "rs370831" "rs391529" "rs13128519"
## [26] "rs7733288" "rs256014" "rs9393672" "rs9275523" "rs7767277"
## [31] "rs2842625" "rs2138707" "rs1934124" "rs4357169" "rs9387090"
## [36] "rs13227417" "rs11761050" "rs6958911" "rs2196" "rs12342040"
## [41] "rs10760846" "rs7895653" "rs3858136" "rs10791838" "rs11174379"
## [46] "rs7300029" "rs11174386" "rs11109376" "rs1262775" "rs17107699"
## [51] "rs10518779" "rs16957422" "rs7190307" "rs12943984" "rs7209032"
## [56] "rs156430" "rs4277599" "rs2378249" "rs6030341" "rs11906231"
## [61] "rs2836779" "rs926761"
# These are the SNPs identified as associated with BIP but not flagged as associated with both diseases.
snpid_scz <- SCZ[SCZ$snpid %in% snp_result, ]$snpid
snp_scz <- data.frame(snpid_scz = snpid_scz, scz_11 = Z_est_11, scz_01 = Z_est_01)
snpid_scz_11 <- snp_scz[which(snp_scz$scz_11 == 1), 'snpid_scz']
snpid_scz_01 <- snp_scz[which(snp_scz$scz_01 == 1), 'snpid_scz']
length(intersect(snpid_scz_11,snpid_scz_01))
## [1] 6956
## Define the function that calculates the log-likelihood
# input: P, pi1, alpha
log_P_single <- function(P, pi1, alpha){
  return(sum(log(pi1*dbeta(P, alpha, 1) + (1 - pi1))))  # the objective function to be maximized
}
## Define the function that implements the EM algorithm for the analysis of a single dataset
# input: P, pi1_ini, alpha_ini, max_iter, tol
# output: pi1_est, alpha_est
EM_single <- function(P, pi1_ini, alpha_ini, max_iter = 1e4, tol = 1e-6){
  pi1_old <- pi1_ini
  alpha_old <- alpha_ini
  L_old <- log_P_single(P, pi1_ini, alpha_ini)
  for (iter in 1:max_iter){
    # E step
    comp_gamma <- pi1_old*dbeta(P, alpha_old, 1)
    gamma <- comp_gamma/(comp_gamma + 1 - pi1_old)
    # M step
    pi1_new <- mean(gamma)
    alpha_new <- -sum(gamma)/sum(gamma*log(P))
    L_new <- log_P_single(P, pi1_new, alpha_new)
    # check convergence
    if (L_new < L_old){
      print("Error: log likelihood is not increasing!")
      break
    }
    if ((L_new - L_old)/abs(L_new) < tol){
      break
    } else {
      pi1_old <- pi1_new
      alpha_old <- alpha_new
      L_old <- L_new
    }
  }
  return(list(pi1 = pi1_new, alpha = alpha_new))
}
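# Note that dbeta(p, alpha, 1) equals alpha * p^(alpha - 1), so the E step in EM_single uses the same
# Beta(alpha, 1) density that is written out explicitly in EM1; a quick numerical check (the values
# p0 and a0 below are illustrative only):
p0 <- 0.3; a0 <- 0.7
c(dbeta(p0, a0, 1), a0*p0^(a0 - 1))  # the two numbers coincide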
theta_est_BIP <- EM_single(pvalue_bip, pi1_ini = 0.1, alpha_ini = 0.2)
theta_est_BIP
## $pi1
## [1] 0.2194204
##
## $alpha
## [1] 0.6628928
posterior_bip <- (theta_est_BIP$pi1*theta_est_BIP$alpha*pvalue_bip^(theta_est_BIP$alpha - 1))/(theta_est_BIP$pi1*theta_est_BIP$alpha*pvalue_bip^(theta_est_BIP$alpha - 1) + 1 - theta_est_BIP$pi1)
Z_est_bip <- assoc(posterior_bip, 0.1)
sum(Z_est_bip)
## [1] 403
snp_bip_single = data.frame(snpid_bip = snpid_bip,bip_1 = Z_est_bip)
snpid_bip_1 <- snp_bip_single[which(snp_bip_single$bip_1 == 1),'snpid_bip']
length(intersect(snpid_bip_1,snpid_bip_10))
## [1] 387
theta_est_SCZ <- EM_single(pvalue_scz, pi1_ini = 0.1, alpha_ini = 0.2)
theta_est_SCZ
## $pi1
## [1] 0.2419776
##
## $alpha
## [1] 0.5668053
posterior_scz <- (theta_est_SCZ$pi1*theta_est_SCZ$alpha*pvalue_scz^(theta_est_SCZ$alpha - 1))/(theta_est_SCZ$pi1*theta_est_SCZ$alpha*pvalue_scz^(theta_est_SCZ$alpha - 1) + 1 - theta_est_SCZ$pi1)
Z_est_scz <- assoc(posterior_scz, 0.1)
sum(Z_est_scz)
## [1] 4012
snp_scz_single = data.frame(snpid_scz = snpid_scz,scz_1 = Z_est_scz)
snpid_scz_1 <- snp_scz_single[which(snp_scz_single$scz_1 == 1),'snpid_scz']
length(intersect(snpid_scz_1,snpid_scz_01))
## [1] 3692
Plot Manhattan plots for the single-trait and integrated analyses.
FDR <- function(posterior, alpha){
  # converts posterior probabilities to the estimated FDR used for plotting
  # (alpha is kept only for interface symmetry with assoc and is not used here)
  M <- length(posterior)
  fdr <- 1 - posterior
  rank.fdr <- rank(fdr)
  sort.fdr <- sort(fdr)
  cumsum.fdr <- cumsum(sort.fdr)
  sort.FDR <- cumsum.fdr/seq(1, M, 1)
  FDR <- sort.FDR[rank.fdr]
  return(FDR)
}
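# FDR() returns the same cumulative-mean quantity that assoc() thresholds internally, so the two are
# consistent by construction; an illustrative check on the single-BIP analysis:
all((FDR(posterior_bip, 0.1) <= 0.1) == (Z_est_bip == 1))  # expected TRUE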
Single-trait analysis: BIP
library(qqman)
##
## For example usage please run: vignette('qqman')
##
## Citation appreciated but not required:
## Turner, (2018). qqman: an R package for visualizing GWAS results using Q-Q and manhattan plots. Journal of Open Source Software, 3(25), 731, https://doi.org/10.21105/joss.00731.
##
SNP_BIP_snp <- as.vector(BIP_jiaoji$snpid[which(Z_est_bip == 1)])
BIP_jiaoji$fdr <- -log10(FDR(posterior_bip, 0.1))
manhattan(BIP_jiaoji, chr = "hg18chr", bp = "bp", p = "fdr", snp = "snpid", ylim = c(0, 3), col = c("blue4", "orange3"), cex = 0.6,logp = F,ylab = "-log_10(fdr)",
main = "single BIP",
suggestiveline = T,
genomewideline = F)
Single-trait analysis: SCZ
SNP_SCZ_snp <- as.vector(SCZ_jiaoji$snpid[which(Z_est_scz == 1)])
SCZ_jiaoji$fdr <- -log10(FDR(posterior_scz, 0.1))
manhattan(SCZ_jiaoji, chr = "hg18chr", bp = "bp", p = "fdr", snp = "snpid", ylim = c(0, 3), col = c("blue4", "orange3"), cex = 0.6,logp = F,ylab = "-log_10(fdr)",
main = "single SCZ",
suggestiveline = T,
genomewideline = F)
Integrated analysis: BIP
SNP_BIP_with_SCZ <- as.vector(BIP_jiaoji$snpid[which(Z_est_10 == 1)])
BIP_jiaoji$fdr1 <- -log10(FDR(z1_post, 0.1))
manhattan(BIP_jiaoji, chr = "hg18chr", bp = "bp", p = "fdr1", snp = "snpid", ylim = c(0, 4), col = c("blue4", "orange3"), cex = 0.6,logp = F,ylab = "-log_10(fdr)",
main = "BIP with SCZ",
suggestiveline = T,
genomewideline = F)
Integrated analysis: SCZ
SNP_SCZ_with_BIP <- as.vector(SCZ_jiaoji$snpid[which(Z_est_01 == 1)])
SCZ_jiaoji$fdr1 <- -log10(FDR(z2_post, 0.1))
manhattan(SCZ_jiaoji, chr = "hg18chr", bp = "bp", p = "fdr1", snp = "snpid", ylim = c(0, 4), col = c("blue4", "orange3"), cex = 0.6,logp = F,ylab = "-log_10(fdr)",
main = "SCZ with BIP",
suggestiveline = T,
genomewideline = F)
Extension: GPA http://dongjunchung.github.io/GPA/
# install.packages("devtools")
# library(devtools)
# install_github("dongjunchung/GPA")
library(GPA)
## Loading required package: Rcpp
## Loading required package: parallel
## Loading required package: ggrepel
## Loading required package: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## Loading required package: vegan
## Loading required package: permute
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:qqman':
##
## qq
## This is vegan 2.6-8
## Loading required package: DT
## Loading required package: shiny
##
## Attaching package: 'shiny'
## The following objects are masked from 'package:DT':
##
## dataTableOutput, renderDataTable
## Loading required package: shinyBS
##
## Attaching package: 'GPA'
## The following object is masked _by_ '.GlobalEnv':
##
## assoc
## The following object is masked from 'package:stats':
##
## cov
fit.GPA.noAnn <- GPA( BIP_SCZ[ , c(2,3) ], NULL )
## Info: Number of GWAS data: 2
## Info: Theoretically null distribution is used.
## Info: No annotation data is provided.
## Info: Lower bound for pi estimates is set to 1 / [number of SNPs].
## Info: EM algorithm stops because it reaches the specified maximum number of iterations (maxIter = 2000).
fit.GPA.noAnn
## Summary: GPA model fitting results (class: GPA)
## --------------------------------------------------
## Data summary:
## Number of GWAS data: 2
## Number of SNPs: 1064236
## Number of annotation data: (not provided)
## Model setting:
## Theoretical null distribution is assumed.
## Parameter estimates (standard errors):
## alpha: 0.603 0.503
## ( 0.005 0.003 )
## GWAS combination: 00 10 01 11
## pi: 0.82 0 0 0.18
## ( 0.003 0.003 0.004 0.005 )
## --------------------------------------------------
estimates( fit.GPA.noAnn )
## $pis
## 00 10 01 11
## 8.197874e-01 9.396412e-07 6.278356e-06 1.802054e-01
##
## $betaAlpha
## [1] 0.6028026 0.5027528
assoc.GPA.noAnn <- GPA::assoc( fit.GPA.noAnn, FDR=0.10, fdrControl="global" )
## Info: Association mapping based on the global FDR control at level 0.1.
table(assoc.GPA.noAnn[,1])#BIP
##
## 0 1
## 1055956 8280
table(assoc.GPA.noAnn[,2])#SCZ
##
## 0 1
## 1055954 8282
# association with SCZ and BIP
assoc11.GPA.noAnn <- GPA::assoc( fit.GPA.noAnn, FDR=0.10, fdrControl="global", pattern="11" )
## Info: Association mapping based on the global FDR control at level 0.1.
table(assoc11.GPA.noAnn)
## assoc11.GPA.noAnn
## 0 1
## 1055956 8280
# Test for pleiotropy
fit.GPA.pleiotropy.H0 <- GPA( BIP_SCZ[ , c(2,3) ], NULL, pleiotropyH0=TRUE )
## Info: Number of GWAS data: 2
## Info: Theoretically null distribution is used.
## Info: No annotation data is provided.
## Info: Fit the GPA model under H0 of the pleitropy test.
## Info: Lower bound for pi estimates is set to 1 / [number of SNPs].
## Info: EM algorithm stops in 1612-th iteration because there is no improvements in log likelihood.
fit.GPA.pleiotropy.H0
## Summary: GPA model fitting results (class: GPA)
## --------------------------------------------------
## Data summary:
## Number of GWAS data: 2
## Number of SNPs: 1064236
## Number of annotation data: (not provided)
## Model setting:
## Theoretical null distribution is assumed.
## GPA is fitted under H0 of pleiotropy LRT.
## Parameter estimates (standard errors):
## alpha: 0.669 0.571
## ( 0.006 0.003 )
## GWAS combination: 00 10 01 11
## pi: 0.582 0.171 0.191 0.056
## ( 0.005 0.005 0.004 0.004 )
## --------------------------------------------------
test.GPA.pleiotropy <- pTest( fit.GPA.noAnn, fit.GPA.pleiotropy.H0 )
## Hypothesis testing for pleiotropy
## --------------------------------------------------
## GWAS combination: 00 10 01 11
## pi: 0.82 0 0 0.18
## ( 0.003 0.003 0.004 0.005 )
##
## test statistics: 8587.519
## p-value: 0
## --------------------------------------------------
test.GPA.pleiotropy
## $pi
## 00 10 01 11
## 8.197874e-01 9.396412e-07 6.278356e-06 1.802054e-01
##
## $piSE
## pi_10 pi_01 pi_11
## 0.002961348 0.003494226 0.004344911 0.004672846
##
## $statistics
## iteration_1612
## 8587.519
##
## $pvalue
## iteration_1612
## 0