rm(list=ls(all=TRUE))
#############################################################################
# Importance functions for the sensitivity analysis in Web Appendix E
# These functions are similar to those included in "GeneralDistribution_Hierarchical_OP.R"

#################################################################################
# obtain threshold, sensitivity, Specificity for individual testing
# Arguments:
# Pos.genr(B) should generate B samples from f+
# Neg.genr(B) should generate B samples from f-
# Mes.genr(c.tilde) should generate a measured value for each value of the argument "c.tilde" based on f_epsilon 
# B: number of Monte Carlo samples
# Outputs:
# tau.star: the threshold for individual testing (from maximizing the Youden index)
# Se: the sensitivity for individual testing
# Sp: the specificity for individual testing
IndTest=function(Pos.genr,Neg.genr,Mes.gner,B)
{
  sample.pos=Mes.genr(Pos.genr(B))
  sample.neg=Mes.genr(Neg.genr(B))
  int=c(median(sample.neg),median(sample.pos))
  Youden=function(t){return(-mean(sample.pos>t)-mean(sample.neg<t))}
  tau.star=optimize(Youden,interval=int)$minimum
  Se=mean(sample.pos>tau.star)
  Sp=mean(sample.neg<tau.star)
  return(list(tau.star=tau.star,Se=Se,Sp=Sp))
}
############################################################################## 
# obtain threshold for pooled testing
# Function to compute the newly proposed threshold for pool of size n via Monte Carlo approximation
# Arguments:
# n       : a given pool size
# Pos.genr(B) should generate B samples from f+
# Neg.genr(B) should generate B samples from f-
# Mes.genr(c.tilde) should generate a measured value for each value of the argument "c.tilde" based on f_epsilon 
# B: number of Monte Carlo samples
Thres=function(n,Pos.genr,Neg.genr,Mes.genr, B)
{
  sample.pos=Mes.genr(rowMeans(cbind(Pos.genr(B), matrix(Neg.genr(B*(n-1)),B,n-1))))
  sample.neg=Mes.genr(rowMeans(matrix(Neg.genr(B*n),B,n)))	
  int=c(as.vector(quantile(sample.neg,.45)),quantile(sample.pos,.55))
  pooled.Youden=function(t)
  {
    res=mean(sample.pos>t)+mean(sample.neg<t)
    return(-res)
  }
  # maximize the pooled version of Youden index
  return(optimize(pooled.Youden,interval=int)$minimum)
}

################################################################################
# Monte Carlo simulation procedure to approximate the operating 
# characteristis of an S-stage hierarchical algorithm in our biomarker-based framework.

#***********************************************************************#
# Supporting function
# This function simply does one Monte Carlo simulation for efficiency
# Arguments:
# S.size:       = c(n1, n2, ..., nS). The configuration of the algorithm
# p             = disease prevalence
# thres         = a sequence of thresholds used for pools of sizes n1, n2, 
#                 n3, ..., nS, respectively. Note that, one should make
#                 sure that the order of thresholds in THRED is the same
#                 as the one of pool sizes in S.size.
# Pos.genr(B) should generate B samples from f+
# Neg.genr(B) should generate B samples from f-
# Mes.genr(c.tilde) should generate a measured value for each value of the argument "c.tilde" based on f_epsilon 


Sim.Hierarchical.num=function(S.size,p,thres,Pos.genr,Neg.genr,Mes.genr)
{
  True.Status=rbinom(S.size[1],1,p)
  S=length(S.size)
  True.Concen=Pos.genr(S.size[1])*True.Status+Neg.genr(S.size[1])*(1-True.Status)
  res=matrix(1,1,S.size[1])
  Done=1
  s=1
  eff=0
  while(Done>0 && s<=S)
  {
    temp=c()
    K=S.size[1]/S.size[s]
    for(j in 1:K)
    {
      ind=((j-1)*S.size[s]+1):(j*S.size[s])
      temp[j]=I(Mes.genr(mean(True.Concen[ind]))>thres[s])*1
      if(min(res[,ind])!=0)
      {
        eff=eff+1
      }
    }
    res=rbind(res,rep(temp,rep(S.size[s],K)))
    Done=max(temp)
    s=s+1		
  }
  Diag=apply(res,2,prod)
  return(eff/S.size[1])
}


#***********************************************************************#
# Supporting function
# This function simply does one Monte Carlo simulation for pooling sensitivity and pooling specificity
# Arguments:
# S.size:       = c(n1, n2, ..., nS). The configuration of the algorithm
# p             = disease prevalence
# thres         = a sequence of thresholds used for pools of sizes n1, n2, 
#                 n3, ..., nS, respectively. Note that, one should make
#                 sure that the order of thresholds in THRED is the same
#                 as the one of pool sizes in S.size.
# Pos.genr(B) should generate B samples from f+
# Neg.genr(B) should generate B samples from f-
# Mes.genr(c.tilde) should generate a measured value for each value of the argument "c.tilde" based on f_epsilon 

Sim.Hierarchical.sesp=function(S.size,p,thres,Pos.genr,Neg.genr,Mes.genr)
{
  S=length(S.size)
  True.Status=c(0,rbinom(S.size[1]-1,1,p))
  True.Concen=Pos.genr(S.size[1])*True.Status+Neg.genr(S.size[1])*(1-True.Status)
  s=1
  while(s<=S)
  {
    ind=1:(S.size[s])
    temp=I(Mes.genr(mean(True.Concen[ind]))>thres[s])*1
    if(temp==0)
    {
      s=S+1
      Diag.neg=temp
    }else{
      s=s+1
      Diag.neg=temp
    }
  }
  
  True.Status=c(1,rbinom(S.size[1]-1,1,p))
  True.Concen=Pos.genr(S.size[1])*True.Status+Neg.genr(S.size[1])*(1-True.Status)
  s=1
  while(s<=S)
  {
    ind=1:(S.size[s])
    temp=I(Mes.genr(mean(True.Concen[ind]))>thres[s])*1
    if(temp==0)
    {
      s=S+1
      Diag.pos=temp
    }else{
      s=s+1
      Diag.pos=temp
    }
  }
  return(list(Diag.pos=Diag.pos,Diag.neg=Diag.neg))
}


#***********************************************************************#
# Main function
# This function approximate EFF, PSE, PSP, PPV, NPV of H(n1:n2:...:nS) 
# for a given prevalence using M Monte Carlo replications
#
# Arguments:
# S.size:       = c(n1, n2, ..., nS). The configuration of the algorithm
# p             = disease prevalence
# thres         = a sequence of thresholds used for pools of sizes n1, n2, 
#                 n3, ..., nS, respectively. Note that, one should make
#                 sure that the order of thresholds in THRED is the same
#                 as the one of pool sizes in S.size.
# Pos.genr(B) should generate B samples from f+
# Neg.genr(B) should generate B samples from f-
# Mes.genr(c.tilde) should generate a measured value for each value of the argument "c.tilde" based on f_epsilon 
# B: size of Monte Carlo replications
Sim.Hierarchical=function(S.size,p,thres,Pos.genr,Neg.genr,Mes.genr,B)
{
  num=0
  num2=0
  dpos=0
  dneg=0
  for(j in 1:B)
  {
    #print(j)
    temp=Sim.Hierarchical.num(S.size,p,thres,Pos.genr,Neg.genr,Mes.genr)
    num=num+temp
    num2=num2+temp^2
    temp=Sim.Hierarchical.sesp(S.size,p,thres,Pos.genr,Neg.genr,Mes.genr)
    dpos=dpos+temp$Diag.pos
    dneg=dneg+temp$Diag.neg
  }
  eff.mean=num/B
  eff.sd=sqrt(round((B*num2-num^2)/(B*(B-1)),8))
  pse=dpos/B
  psp=1-dneg/B
  ppv=pse*p/(pse*p+(1-psp)*(1-p))
  npv=psp*(1-p)/(psp*(1-p)+(1-pse)*p)
  
  # output everything in a list
  # EFF: approximated efficiency
  # SD:  approximated standard deviation of number of tests per specimen
  # PSE: approximated pooling sensitivty
  # PSP: approximated pooling specificity
  # PPV: approximated pooling positive predictive value
  # NPV: approximated pooling negative predcitive value
  return(list(EFF=eff.mean,SD=eff.sd,PSE=pse,PSP=psp,PPV=ppv,NPV=npv))
}


#***********************************************************************#
# Final function
# This function approximate EFF, PSE, PSP, PPV, NPV of H(n1:n2:...:nS) 
# using our newly proposed threshold to maximize the pooled version of Youden index
# for a given prevalence using B Monte Carlo replications
#
# Arguments:
# S.size:       = c(n1, n2, ..., nS). The configuration of the algorithm
# p             = disease prevalence
# Pos.genr(B) should generate B samples from f+
# Neg.genr(B) should generate B samples from f-
# Mes.genr(c.tilde) should generate a measured value for each value of the argument "c.tilde" based on f_epsilon 
# B: size of Monte Carlo replications
Sim.Hierarchical.all=function(S.size,p,Pos.genr,Neg.genr,Mes.genr,B)
{
  Ind.res=IndTest(Pos.genr,Neg.genr,Mes.gner,B)
  tau.star=Ind.res$tau.star
  print("The threshold for individual testing is")
  print(tau.star)
  Se=Ind.res$Se
  Sp=Ind.res$Sp
  #print("The sensitivity for individual testing is")
  print(Se)
  #print("The specificity for individual testing is")
  print(Sp)
  new.thres=c()
  for(ss in 1:length(S.size))
  {
    new.thres=c(new.thres,Thres(S.size[ss],Pos.genr,Neg.genr,Mes.genr,B))
  }
  OP.newth=Sim.Hierarchical(S.size,p,new.thres,Pos.genr,Neg.genr,Mes.genr,B)
  print(cbind(OP.newth))
  return(cbind(OP.newth))
}


##################################################################################
# Conduct the sensitivity analysis in Web Appendix E

# The negative biomarker distribution is Gamma with mean 1 and variance .25
para.n=c(1,.25)
para.n=c(para.n[1]^2/para.n[2],para.n[1]/para.n[2]) #alpha,beta
Neg.genr=function(B){return(rgamma(B,shape=para.n[1],rate=para.n[2]))}


# Pos.genr(B) generates B random samples from the user-chosen positive biomarker distribution
# The positive biomarker distribution is Gamma with mean 10-delta and variance 20
# For the sensitivity analysis, when delta=0, the positive biomarker distribution is correctly specificed
# When delta>0, the positive biomarker distribution is wrongly specified.
delta <- 0
para.p=c(10-delta,20)
para.p=c(para.p[1]^2/para.p[2],para.p[1]/para.p[2]) #alpha,beta
Pos.genr=function(B){return(rgamma(B,shape=para.p[1],rate=para.p[2]))}

# Given a vector of true biomarker levels \tilde C's, this function produces measured values of these true levels
# based on the user-chosen measurement error distribution.
# C|\tilde C\sim N(\tilde C, 0.1^2)
para.e=c(0,0.1)
Mes.genr=function(sample){return(rnorm(	length(sample),sample+para.e[1],para.e[2]))}

# Delta.grid creates the wrong specificiation of the positive biomarker distribution.
# At delta take the value 0 in delta.grid, there is no wrong specificiation
# When delta is at nonzero values in delta.grid, there is wrong specificiation.
delta.grid=seq(0,2,length=21)

OP.res=c()
for(delta in delta.grid)
{
  # Neg.genr(B) generates B random samples from the user-chosen negative biomarker distribution.
  
  # Pos.genr(B) generates B random samples from the user-chosen positive biomarker distribution
  # The positive biomarker distribution is Gamma with mean 10-delta and variance 20
  # For the sensitivity analysis, when delta=0, the positive biomarker distribution is correctly specificed
  # When delta>0, the positive biomarker distribution is wrongly specified.
  para.p=c(10-delta,20)
  para.p=c(para.p[1]^2/para.p[2],para.p[1]/para.p[2]) #alpha,beta
  Pos.genr=function(B){return(rgamma(B,shape=para.p[1],rate=para.p[2]))}

  S.size=c(5,1) # i.e., H(5:1)
  p=0.05  # prevalence
  B=1000000 # number of Monte Carlo data sets
  
  # Approximate EFF, PSE, PSP, PPV, NPV of H(5:1) when p=0.05 through 1,000,000 Monte Carlo replications
  # When the positive biomarker distribution is specified to be Gamma with mean 10-delta, variance 25
  res=Sim.Hierarchical.all(S.size,p,Pos.genr,Neg.genr,Mes.genr,B)
  
  # save all the approximated value in OP.res
  OP.res=cbind(OP.res,res)
}

##########################################################################
# Create the Figure E.1 in Web Appendix E
layout(matrix(1:6, 3, 2,byrow=TRUE))
par(mar=c(4.5,4.5,.5,.5))
NameS=c("EFF","SD","PSE","PSP","PPV","NPV")
for(mm in 1:6)
{
  plot(delta.grid,OP.res[mm,],ylim=c(0,1),xlab=expression(delta),ylab=NameS[mm],cex.lab=1.2,cex.axis=1.2)
  lines(delta.grid,rep(OP.res[mm,1],length(delta.grid)))
}