***************************************************************************;
** PROGRAM: boot_3MRCV.sas **;
** AUTHOR: Christopher R. Bilder **;
** Department of Statistics **;
** University of Nebraska-Lincoln **;
** chris@chrisbilder.com **;
** DATE: 3-23-06 **;
** PURPOSE: Fit a generalized loglinear model via the marginal modeling **;
** method and use the bootstrap and Rao-Scott approximations **;
** to find a p-value for testing the model of interest vs. the **;
** saturated model. **;
** NOTES: **;
** 1) Do not distribute copies of this program without the permission of **;
** the author. **;
** 2) Copyright 2006 Christopher R. Bilder **;
** 3) See control_3MRCV.sas for more information **;
***************************************************************************;


**********************************************************************;
** **;
** NAME: MACRO TIME **;
** PURPOSE: Write time to output file **;
** VARIABLES: title = begin or end of program **;
** **;
**********************************************************************;
%MACRO TIME(title);

data time;
   time   = hour(time());
   minute = minute(time());
   second = second(time());
   month  = month(today());
   day    = day(today());
   year   = year(today());
run;

proc print data=time;
   title2 "&title";
run;

%MEND TIME;


**********************************************************************;
** **;
** NAME: MACRO MAKE_2WAY **;
** PURPOSE: Find all possible 2-way tables for the observed data **;
** **;
**********************************************************************;
%MACRO MAKE_2WAY(data);

%do i = 1 %to (&nrow2-1);
   %do j = (&i + 1) %to &nrow2;

      proc freq data=&data noprint;
         tables item&i.*item&j / out=out_2way sparse;
      run;

      *Fix data set up to save;
      data out_2way_2;
         set out_2way;
         item_numb1 = &i;
         item_numb2 = &j;
         a = item&i;
         b = item&j;
         if count = 0 then count = 0.5;
         keep item_numb1 item_numb2 a b count;
      run;

      data out_2way_save;
         set out_2way_save out_2way_2;
      run;

   %end;
%end;

%do i = (&nrow2+1) %to (&nrow2+&ncol2-1);
   %do j = (&i + 1) %to (&nrow2+&ncol2);

      proc freq data=&data noprint;
         tables item&i.*item&j / out=out_2way sparse;
      run;

      *Fix data set up to save;
      data out_2way_2;
         set out_2way;
         item_numb1 = &i;
         item_numb2 = &j;
         a = item&i;
         b = item&j;
         if count = 0 then count = 0.5;
         keep item_numb1 item_numb2 a b count;
      run;

      data out_2way_save;
         set out_2way_save out_2way_2;
      run;

   %end;
%end;

%do i = (&nrow2+&ncol2+1) %to (&nrow2+&ncol2+&nstrat2-1);
   %do j = (&i + 1) %to (&nrow2+&ncol2+&nstrat2);

      proc freq data=&data noprint;
         tables item&i.*item&j / out=out_2way sparse;
      run;

      *Fix data set up to save;
      data out_2way_2;
         set out_2way;
         item_numb1 = &i;
         item_numb2 = &j;
         a = item&i;
         b = item&j;
         if count = 0 then count = 0.5;
         keep item_numb1 item_numb2 a b count;
      run;

      data out_2way_save;
         set out_2way_save out_2way_2;
      run;

   %end;
%end;

data out_2way_save;
   set out_2way_save;
   *just helps to reorder for PROC IML code;
   new_count = count;
   drop count;
run;

%MEND MAKE_2WAY;
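
/*------------------------------------------------------------------------
  Note on MAKE_2WAY: the three macro loops above pair items only within the
  same MRCV - items 1 to nrow2 (the W items), items nrow2+1 to nrow2+ncol2
  (the Y items), and the remaining nstrat2 Z items. Each pairwise table is
  stacked into out_2way_save, with zero cells replaced by 0.5 and the count
  renamed to new_count.
------------------------------------------------------------------------*/
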
**********************************************************************;
** **;
** NAME: MACRO FIND_SUBTABLE_WYZ **;
** PURPOSE: Finds sub-tables for Wi, Yj, and Zk combos **;
** **;
**********************************************************************;
%MACRO FIND_SUBTABLE_WYZ(data, save_data);

%do i=1 %to &nrow2;
   %do j=1 %to &ncol2;
      %do k=1 %to &nstrat2;

         proc freq data=&data noprint;
            tables W&i*Y&j*Z&k / sparse out=out_sub;
            by iter;
            weight count;
         run;

         data out_sub2;
            set out_sub;
            W = &i;
            Y = &j;
            Z = &k;
            wi = w&i;
            yj = y&j;
            zk = z&k;
            drop percent w&i y&j z&k;
         run;

         data &save_data;
            set &save_data out_sub2;
         run;

      %end;
   %end;
%end;

%MEND FIND_SUBTABLE_WYZ;


**********************************************************************;
** **;
** NAME: MACRO GENERALIZE_FREQ **;
** PURPOSE: Write part of a TABLES statement for PROC FREQ **;
** **;
**********************************************************************;
%MACRO GENERALIZE_FREQ(data, out_data);

%let W = W1 *;
%do i = 2 %to &nrow2;
   %let W = &W.W&i.*;
%end;

%let Y = Y1 *;
%do j = 2 %to &ncol2;
   %let Y = &Y.Y&j*;
%end;

%let Z = Z1;
%do k = 2 %to &nstrat2;
   %let Z = &Z.*Z&k;
%end;

proc freq data=&data noprint;
   table &W &Y &Z / sparse out=&out_data;
   by iter;
run;

%MEND GENERALIZE_FREQ;


**********************************************************************;
** **;
** NAME: MACRO CORRECT_FORMAT **;
** PURPOSE: Put data in correct format needed for calculations **;
** **;
**********************************************************************;
%MACRO CORRECT_FORMAT(data_set);

*Find multinomial format of data;
%GENERALIZE_FREQ(&data_set, freq_set);

data freq_set;
   set freq_set;
   percent = percent*0.01;
run;

data save_sub_WY_obs;
   set _null_;
run;

%FIND_SUBTABLE_WYZ(freq_set, save_sub_WY_obs);

data set3;
   set save_sub_WY_obs;
   if count = 0 then count = &constant_zero;
run;

*This part would need to be generalized for other data sets with a different number of items!;
*Rename items for MAKE_2WAY (finds all possible observed 2-way sub-tables);
data sasdata_one3;
   set &data_set;
   item1 = w1;  item2 = w2;  item3 = w3;
   item4 = y1;  item5 = y2;  item6 = y3;  item7 = y4;
   item8 = z1;  item9 = z2;  item10 = z3; item11 = z4; item12 = z5;
   keep item1-item12;
run;

data out_2way_save;
   set _null_;
run;

%MAKE_2WAY(sasdata_one3);

%MEND CORRECT_FORMAT;
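
/*------------------------------------------------------------------------
  A note on what CORRECT_FORMAT leaves behind (assuming the item renaming
  above, i.e., &nrow2 = 3 W items, &ncol2 = 4 Y items, and &nstrat2 = 5 Z
  items):
    freq_set      - the joint multinomial table with percent rescaled to a
                    proportion; the request built by GENERALIZE_FREQ is
                    effectively
                    table W1*W2*W3*Y1*Y2*Y3*Y4*Z1*Z2*Z3*Z4*Z5 / sparse ...
    set3          - the stacked 2x2x2 (Wi, Yj, Zk) sub-tables from
                    FIND_SUBTABLE_WYZ, with zero cells set to &constant_zero
    out_2way_save - the pairwise item tables from MAKE_2WAY
------------------------------------------------------------------------*/
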
**********************************************************************;
** **;
** NAME: MACRO SIM_LOOP **;
** PURPOSE: Looping structure for the bootstrap simulations **;
** NOTES: Just use sim_numb = 1 %to 1 for one example data set **;
** Does all of the main bootstrap calculations and fits the **;
** model of interest. **;
** **;
**********************************************************************;
%MACRO SIM_LOOP(sim_data, seed, n);

%TIME(SIM_LOOP MACRO STARTED AT:);

*Put data in correct format for analysis;
%CORRECT_FORMAT(&sim_data);

*Create data sets to save some of the results in;
data save_rej;          set _null_; run;
data save_info_set;     set _null_; run;
data save_numb_it_item; set _null_; run;

%do sim_numb = 1 %to 1;

   data set2_orig;
      set set3;
      if iter = &sim_numb;
   run;

   ***************************************************************;
   *Calculate the model for the observed data;

   *TURN OFF PRINTING;
   proc printto print="nul:"; run;

   *Fit the model;
   proc genmod data=set2_orig;
      class &class;
      model count = &comp_ind_model &model / dist=poi link=log obstats noint;
      ods output obstats=obstats_orig modelfit=modfit_orig ParameterEstimates=est_orig;
   run;

   *TURN ON PRINTING;
   proc printto; run;

   *Sort for a merge later;
   proc sort data=modfit_orig;
      by criterion;
   run;

   ********************************************************************************;
   *Put the observed and predicted sub-tables together to use with the Gange algorithm;
   *This part would need to be generalized for other data sets with a different number of items!;
   data pred_sort2;
      set obstats_orig;
      select (W);
         when ("1") item_numb1=1;
         when ("2") item_numb1=2;
         when ("3") item_numb1=3;
         otherwise  item_numb1=0;
      end;
      select (Y);
         when ("1") item_numb2=4;
         when ("2") item_numb2=5;
         when ("3") item_numb2=6;
         when ("4") item_numb2=7;
         otherwise  item_numb2=0;
      end;
      select (Z);
         when ("1") item_numb3=8;
         when ("2") item_numb3=9;
         when ("3") item_numb3=10;
         when ("4") item_numb3=11;
         when ("5") item_numb3=12;
         otherwise  item_numb3=0;
      end;
      if wi = "0" then a = 0; else a = 1;   *needed since wi is a character variable;
      if yj = "0" then b = 0; else b = 1;
      if zk = "0" then c = 0; else c = 1;
      new_count = pred;
      keep item_numb1-item_numb3 a b c new_count;
   run;

   %TIME(GANGE BEGINS:);

   *Using the observed data and the predicted data, find a vector of multinomial;
   * probabilities under the Ho model;
   * - Macro is located in the separate Gange program;
   %DO_IT_GANGE;

   %TIME(GANGE ENDS:);

   *Change result format of the Gange program - p = tau = multinomial probabilities;
   data set1_tau2;
      set set1_tau;
      array item {&ncol} item1-item&ncol;
      array COLNUMB {&ncol} COL1-COL&ncol;
      do j = 1 to &ncol;
         item{j} = colnumb{j};
      end;
      p = col%eval(&ncol+1);
      keep item1-item&ncol p;
   run;
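
   /*---------------------------------------------------------------------
     At this point set1_tau2 has one row per joint response profile:
     item1-item&ncol are copied from COL1-COL&ncol of the Gange output,
     and p (= tau) is the corresponding multinomial probability estimated
     under the Ho model. This data set is the first argument passed to the
     resampling macro R_DO_IT_DATA_GEN below.
   ---------------------------------------------------------------------*/
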
   ********************************************************************************;
   *Take resamples;
   *Generate the data - Macro is located in the separate resampling program;
   %R_DO_IT_DATA_GEN(set1_tau2, &seed, &n, &totsim, &numbsim, gendata);

   *Save the number of resampled data sets lost due to a marginal table not being the observed size;
   data numb_it_item;
      set numb_it;
      *rename n=lostset;
      sim_numb = &sim_numb;
      keep sim_numb n;
   run;

   data save_numb_it_item;
      set save_numb_it_item numb_it_item;
   run;

   *Create the final version of the resampled data. Note that the gendata produced by the;
   * resampling program is for only 2 MRCVs (the program was originally written for that).;
   * We can still use the same program, but use a different DATA step here to create the;
   * gendata set that holds the resamples;
   data gendata;
      set sortset4;
      iter = floor((_n_-0.1)/&n+1);
      array rownumb   {&nrow2}   w1-w&nrow2;
      array colnumb   {&ncol2}   y1-y&ncol2;
      array stratnumb {&nstrat2} z1-z&nstrat2;
      array names {%eval(&nrow2*&ncol2*&nstrat2)} item1-item%eval(&nrow2*&ncol2*&nstrat2);
      do j = 1 to &nrow2;
         rownumb{j} = names{j};
      end;
      do j = 1 to &ncol2;
         colnumb{j} = names{j+&nrow2};
      end;
      do j = 1 to &nstrat2;
         stratnumb{j} = names{j+&nrow2+&ncol2};
      end;
      dummy = 1;
      keep y1-y&ncol2 w1-w&nrow2 z1-z&nstrat2 iter dummy;
   run;

   %GENERALIZE_FREQ(gendata, freq_set_gendata);

   data freq_set_gendata;
      set freq_set_gendata;
      percent = percent*0.01;
   run;

   data save_sub_WY;
      set _null_;
   run;

   %FIND_SUBTABLE_WYZ(freq_set_gendata, save_sub_WY);

   proc sort data=save_sub_WY;
      by iter;
   run;

   *Add &constant_zero to a sub-table cell if the cell is 0;
   data resample2;
      set save_sub_WY;
      if count = 0 then count = &constant_zero;
   run;

   ********************************************************************************;
   *Fit model to resamples;
   %TIME(FIT MODEL TO RESAMPLES BEGINS:);

   *TURN OFF PRINTING;
   proc printto print="nul:"; run;

   proc genmod data=resample2;
      class &class;
      model count = &comp_ind_model &model / dist=poi link=log obstats noint;
      ods output obstats=obstats modelfit=mod_fit_boot;
      by iter;
   run;

   *TURN ON PRINTING;
   proc printto; run;

   %TIME(FIT MODEL TO RESAMPLES ENDS:);

   ********************************************************************************;
   *Combine results and calculate p-value;

   *Sort for a merge coming up;
   proc sort data=mod_fit_boot;
      by criterion;
   run;

   *Merge the goodness-of-fit values for the resampled and observed data;
   data mod_fit_all;
      merge mod_fit_boot(rename=(value=value_star) drop=ValueDF DF)
            modfit_orig(rename=(value=value_orig) drop=ValueDF DF);
      by Criterion;
      if Criterion = "Log Likelihood" or Criterion = "Scaled Deviance"
         or Criterion = "Scaled Pearson X2" then delete;
      if value_star > value_orig then reject = 1;
      else reject = 0;
   run;

   data _null_;
      set modfit_orig;
      if criterion = 'Deviance' then call symput('Deviance', value);
      if criterion = 'Pearson Chi-Square' then call symput('Pearson', value);
   run;

   title2 "Bootstrap histogram";
   proc univariate data=mod_fit_all noprint;
      histogram value_star;
      footnote1 "The observed Pearson = &pearson and the observed LRT = &Deviance";
      by criterion;
   run;

   title2 "The p-values for &model versus the saturated model";
   footnote1;
   proc means data=mod_fit_all mean noprint;
      class criterion;
      var reject;
      output out=out_rej mean=p_value;
   run;

   data out_rej2;
      set out_rej;
      if _TYPE_ ne 0;
      drop _TYPE_ _FREQ_;
      sim_numb = &sim_numb;
   run;

   *Put into save data set;
   data save_rej;
      set save_rej out_rej2;
   run;
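
   /*---------------------------------------------------------------------
     The bootstrap p-value saved above is the proportion of resamples whose
     goodness-of-fit statistic exceeds the observed one. For example
     (hypothetical numbers only): if 5000 resampled data sets are kept and
     120 of them give value_star > value_orig for the Pearson criterion,
     then p_value = 120/5000 = 0.024, and the model of interest would be
     rejected at alpha = 0.05.
   ---------------------------------------------------------------------*/
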
   *Merge some of the calculations;
   data save_rej2;
      merge save_rej save_info_set save_numb_it_item;
      by sim_numb;
      if p_value < &alpha then reject = 1;
      else reject = 0;
   run;

   *Variable explanation: error - max(abs(tau*_i - tau*_(i+1))) for the last iteration, where;
   *   tau*_i is the vector of multinomial probabilities at the ith iteration;
   *   iter_conv - number of iterations taken to find the multinomial probabilities;
   *   under the Ho model;
   *   sim_numb and reject could be printed as well if simulations are being performed;
   title2 'Print the boot. p-values and the Gange convergence information (error and iter_conv)';
   proc print data=save_rej2;
      var Criterion p_value error iter_conv;
   run;

%end;

%TIME(SIM_LOOP MACRO ENDED AT:);

%MEND SIM_LOOP;


**********************************************************************;
** **;
** NAME: MACRO NUMOBS **;
** PURPOSE: Find number of observations in a data set **;
** NOTES: Adapted from a SAS website sample program, use with RS **;
** calculations **;
** VARIABLES: dsn = Data set **;
** name = Number of observations put in here **;
** **;
**********************************************************************;
%MACRO NUMOBS(dsn, name);

%global &name;

data _null_;
   call symput("&name", left(put(count2,8.)));
   stop;
   set &dsn nobs=count2;
run;

%MEND NUMOBS;


**********************************************************************;
** **;
** NAME: MACRO G_H **;
** PURPOSE: Creates matrices of all 0-1 combinations which are **;
** needed for PROC IML computations for RS **;
** NOTES: Use with RS calculations **;
** VARIABLES: numb_items = number of items **;
** matrix = prefix for the output data set name (e.g., Gitemvec2) **;
** **;
** **;
**********************************************************************;
%MACRO G_H(numb_items, matrix);

%let X = do item1 = 0 to 1%str(;);
%let endit = end%str(;);

*Generalize;
%do j = 2 %to &numb_items;
   %let X = &X.do item&j=0 to 1%str(;);
   %let endit = &endit end%str(;);
%end;

*Create matrix of all items;
data itemvec;
   &X;
   output;
   &endit;
run;

*Transpose the matrix to be in the correct form;
proc transpose data=itemvec out=&matrix.itemvec2;
   var item1-item&numb_items;
run;

%MEND G_H;
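
/*------------------------------------------------------------------------
  Example of what G_H builds, assuming numb_items = 2 and matrix = G (a
  minimal sketch; the RS macro below calls it with &nrow2, &ncol2, and
  &nstrat2 items). The transposed data set Gitemvec2 has one row per item
  and one column per 0-1 combination, so PROC IML reads it as the 2 x 4
  matrix

       G = | 0 0 1 1 |
           | 0 1 0 1 |

  i.e., each column is one of the 2**2 possible (item1, item2) responses.
------------------------------------------------------------------------*/
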
**********************************************************************;
** **;
** NAME: MACRO RS **;
** PURPOSE: Fit the model, do the 2nd order RS adjustment, find **;
** standardized Pearson residuals **;
** NOTES: This MACRO is not set up for simulations **;
** VARIABLES: n = sample size **;
** **;
** **;
**********************************************************************;
%MACRO RS(n);

%TIME(RS PART OF PROGRAM STARTED AT:);

*Just used to save results;
data save_sat2;
   set _null_;
run;

*Find the G, H, and L matrices;
%G_H(&nrow2, G);
%G_H(&ncol2, H);
%G_H(&nstrat2, L);

*TURN OFF PRINTING;
proc printto print="nul:"; run;

*Fit model - note: set3 comes from MACRO CORRECT_FORMAT, the data set is created in the bootstrap;
* portion of the program;
proc genmod data=set3;
   class &class;
   model count = &comp_ind_model &model / dist=poi link=log type3 obstats noint;
   ods output obstats=obstats_orig modelfit=modfit_orig ParameterEstimates=est_orig;
   ods exclude parameterestimates obstats;
run;

*TURN ON PRINTING;
proc printto; run;

***********************************************************************************;
*RS calculations;

*TURN OFF PRINTING;
proc printto print="nul:"; run;

*Find the X matrix;
proc glmmod data=set3 outdesign=X_set;
   class &class;
   model count = w*y*z wi(w*y*z) yj(w*y*z) zk(w*y*z) &model / noint;
run;

*TURN ON PRINTING;
proc printto; run;

*Count the columns in X_set by looking at the est_orig data set;
* ncol(X_set) = nrow(est_orig) - 2;
%NUMOBS(est_orig, col_in_X);

*Transpose for the merge;
proc transpose data=X_set out=Xtran name=col label=label;
   var col1-col%eval(&col_in_X-2);   *-2 since there will be intercept and scale rows in est_orig;
run;

*Merge with the parameter estimates to see which columns of X are used;
data merg_set;
   merge est_orig Xtran;
   if DF ne 0;
run;

*Transpose back;
proc transpose data=merg_set out=X_mat(drop=_name_);
   var col1-col%eval(8*&nrow2*&ncol2*&nstrat2);
run;

*Get in the order needed for PROC IML;
data merg_set2;
   merge obstats_orig X_mat;
run;

*Get the # of parameters estimated - the correct # of columns in the X matrix;
%NUMOBS(merg_set, row_in_merg_set);

proc sort data=merg_set2 out=merg_set3(keep=col1-col&row_in_merg_set);
   by descending wi descending yj descending zk;
run;

proc sort data=obstats_orig out=obstats_orig2;
   by descending wi descending yj descending zk;
run;

*Do the IML RS calculations;
proc iml;

   *Get X;
   use merg_set3;
   read all into X;

   use obstats_orig2;
   read all var{pred} into mu_hat;
   *print mu_hat;

   use obstats_orig2;
   read all var{count} into m;

   *FIND V matrix - Cov(m);
   *Get tau_hat - multinomial probabilities;
   use freq_set;
   read all var{percent} into tau;
   *print tau;

   use Gitemvec2;
   read all into G;
   use Hitemvec2;
   read all into H;
   use Litemvec2;
   read all into L;
   *print G, H, L;

   *Matrices of ones;
   Jr = repeat(1, &nrow2, 2**&nrow2);
   Jc = repeat(1, &ncol2, 2**&ncol2);
   Jq = repeat(1, &nstrat2, 2**&nstrat2);

   tau_to_pi = G@H@L      // G@H@(Jq-L)      // G@(Jc-H)@L      // G@(Jc-H)@(Jq-L)      //
               (Jr-G)@H@L // (Jr-G)@H@(Jq-L) // (Jr-G)@(Jc-H)@L // (Jr-G)@(Jc-H)@(Jq-L);

   *This part finishes the calculations needed for the cov_m matrix (called V in Appendix A).;
   * Since this can take some time (about 10 minutes on a 1.7 GHz processor with 512 MB of memory);
   * I have calculated the matrix already for the Section 4.2 data and saved it to the file;
   * v_mat_m.txt. Since the matrix does not change for each model, it can be used again - as long;
   * as the same data set is being analyzed. For a different data set, remove the comments from;
   * the code below and calculate it.;
   *cov_m = &n * tau_to_pi*(Diag(tau) - tau*t(tau))*t(tau_to_pi);
   *create cov_m_set from cov_m;
   * append from cov_m;
   use cov_m_set;
   read all into cov_m;

   *Covariance matrix for beta_hat;
   A = Diag(sqrt(mu_hat))*X;
   cov_beta = inv(t(A)*A) * t(A) * Diag(1/sqrt(mu_hat)) * cov_m *
              t(inv(t(A)*A) * t(A) * Diag(1/sqrt(mu_hat)));

   *Covariance matrix for mu_hat;
   cov_mu = Diag(mu_hat)*X * cov_beta * t(X)*Diag(mu_hat);

   *Covariance matrix for the residuals;
   cov_e = (I(8*&nrow2*&ncol2*&nstrat2) - Diag(mu_hat)*X*inv(t(X)*Diag(mu_hat)*X)*t(X)) * cov_m *
           t(I(8*&nrow2*&ncol2*&nstrat2) - Diag(mu_hat)*X*inv(t(X)*Diag(mu_hat)*X)*t(X));

   *Modified Pearson statistic;
   e = m - mu_hat;
   Xsq_M = t(e) * Diag(1/mu_hat) * e;
   *print Xsq_M;

   *Pearson residuals;
   stand_err = repeat(0, 8*&nrow2*&ncol2*&nstrat2, 1);
   do i = 1 to 8*&nrow2*&ncol2*&nstrat2;
      *due to rounding error, may get a small variance < 0;
      if cov_e[i,i] <= 0 then cov_e[i,i] = 10**(-16);
      stand_err[i,1] = sqrt(cov_e[i,i]);
   end;
   stand_resid = e/stand_err;
   pear_resid = e/sqrt(mu_hat);
   *print m mu_hat stand_err stand_resid pear_resid;

   send_out = stand_resid || stand_err || pear_resid || mu_hat;
   col = {"stand_resid" "stand_err" "pear_resid" "mu_hat"};
   create stand_resid from send_out[colname=col];
   append from send_out;

   *Find the eigenvalues for the RS adjustment;
   mat = Diag(1/mu_hat) * cov_e;
   eig_mat = eigval(mat);
   *print eig_mat;
   sum_eig = trace(mat);
   sum_eig_sq = sum(eig_mat[,1]##2);
   send_out2 = sum_eig || sum_eig_sq;
   *print sum_eig sum_eig_sq;
   col2 = {"sum_eig" "sum_eig_sq"};
   create sat_adj from send_out2[colname=col2];
   append from send_out2;

quit;

*Do this merge in order to get W Y Z wi yj zk in the same data set - needed for the next sort;
data stand_resid2;
   merge stand_resid obstats_orig2;
run;

*Reorder back to the way it was before;
proc sort data=stand_resid2 out=stand_resid3(keep=stand_resid stand_err pear_resid mu_hat
                                                  observation count W Y Z wi yj zk);
   by w y z wi yj zk;
run;

data sat;
   set modfit_orig;
   if criterion = 'Pearson Chi-Square';
   drop criterion valueDF;
run;
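
/*------------------------------------------------------------------------
  Second-order Rao-Scott adjustment used below: with sum_eig = trace of
  Diag(1/mu_hat)*cov_e (the sum of its eigenvalues) and sum_eig_sq = the
  sum of the squared eigenvalues,

      RS_Pearson = value*sum_eig/sum_eig_sq

  is compared to a chi-square distribution with

      df_adj = sum_eig**2/sum_eig_sq

  degrees of freedom, where value is the ordinary Pearson statistic from
  PROC GENMOD.
------------------------------------------------------------------------*/
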
*Do the RS adjustment to the Pearson statistic;
data sat2;
   merge sat sat_adj;
   length model $ 80;
   model = "&model";
   RS_Pearson = sum_eig*value/sum_eig_sq;
   df_adj = sum_eig**2 / sum_eig_sq;
   sat_p = 1 - CDF('CHISQUARED', RS_Pearson, df_adj);
run;

data save_sat2;
   set save_sat2 sat2;
run;

title2 "Print |standardized Pearson residuals| > 2.576";
proc print data=stand_resid3;
   where stand_resid > 2.576 or stand_resid < -2.576;
   var observation W Y Z wi yj zk count mu_hat stand_resid stand_err;
run;

title2 'The Pearson statistic (value), RS adjusted Pearson (RS_Pearson)';
title3 'DF for chi-square approximation (df_adj), and resulting p-value (sat_p)';
proc print data=save_sat2;
   var value RS_Pearson df_adj sat_p;
run;

%TIME(RS PART OF PROGRAM ENDED AT:);

%MEND RS;
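
/*------------------------------------------------------------------------
  A minimal, hypothetical sketch of how these macros might be invoked; see
  control_3MRCV.sas for the actual calls. The macro variables referenced in
  this file (nrow2, ncol2, nstrat2, ncol, class, comp_ind_model, model,
  constant_zero, totsim, numbsim, alpha), the macros DO_IT_GANGE and
  R_DO_IT_DATA_GEN, and the data set cov_m_set (see the v_mat_m.txt comment
  in MACRO RS) are assumed to be supplied by control_3MRCV.sas or the
  companion Gange and resampling programs. The data set name, seed, and
  sample size below are placeholders only:

      %SIM_LOOP(sasdata_one, 8212, 1300);
      %RS(1300);
------------------------------------------------------------------------*/
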