dm log 'clear; output; clear' wpgm; options nodate nocenter linesize=64 ls=90 pageno=1; PROC GREPLAY NOFS IGOUT = WORK.GSEG; DELETE _ALL_; /* list contents of template catalog */ proc greplay nofs tc=sashelp.templt; list tc; run; quit; goptions reset=all; goptions rotate=landscape; /* SAS Textbook Examples An Introduction to Categorical Analysis by Alan Agresti Chapter 5 - Logistic Regression */; * Inputting the Crab data, p. 82-83. ; data crab; input color spine width satell weight; if satell>0 then y=1; if satell=0 then y=0; n=1; weight = weight/1000; color = color - 1; if color=4 then dark=0; if color < 4 then dark=1; cards; 3 3 28.3 8 3050 4 3 22.5 0 1550 2 1 26.0 9 2300 4 3 24.8 0 2100 4 3 26.0 4 2600 3 3 23.8 0 2100 2 1 26.5 0 2350 4 2 24.7 0 1900 3 1 23.7 0 1950 4 3 25.6 0 2150 4 3 24.3 0 2150 3 3 25.8 0 2650 3 3 28.2 11 3050 5 2 21.0 0 1850 3 1 26.0 14 2300 2 1 27.1 8 2950 3 3 25.2 1 2000 3 3 29.0 1 3000 5 3 24.7 0 2200 3 3 27.4 5 2700 3 2 23.2 4 1950 2 2 25.0 3 2300 3 1 22.5 1 1600 4 3 26.7 2 2600 5 3 25.8 3 2000 5 3 26.2 0 1300 3 3 28.7 3 3150 3 1 26.8 5 2700 5 3 27.5 0 2600 3 3 24.9 0 2100 2 1 29.3 4 3200 2 3 25.8 0 2600 3 2 25.7 0 2000 3 1 25.7 8 2000 3 1 26.7 5 2700 5 3 23.7 0 1850 3 3 26.8 0 2650 3 3 27.5 6 3150 5 3 23.4 0 1900 3 3 27.9 6 2800 4 3 27.5 3 3100 2 1 26.1 5 2800 2 1 27.7 6 2500 3 1 30.0 5 3300 4 1 28.5 9 3250 4 3 28.9 4 2800 3 3 28.2 6 2600 3 3 25.0 4 2100 3 3 28.5 3 3000 3 1 30.3 3 3600 5 3 24.7 5 2100 3 3 27.7 5 2900 2 1 27.4 6 2700 3 3 22.9 4 1600 3 1 25.7 5 2000 3 3 28.3 15 3000 3 3 27.2 3 2700 4 3 26.2 3 2300 3 1 27.8 0 2750 5 3 25.5 0 2250 4 3 27.1 0 2550 4 3 24.5 5 2050 4 1 27.0 3 2450 3 3 26.0 5 2150 3 3 28.0 1 2800 3 3 30.0 8 3050 3 3 29.0 10 3200 3 3 26.2 0 2400 3 1 26.5 0 1300 3 3 26.2 3 2400 4 3 25.6 7 2800 4 3 23.0 1 1650 4 3 23.0 0 1800 3 3 25.4 6 2250 4 3 24.2 0 1900 3 2 22.9 0 1600 4 2 26.0 3 2200 3 3 25.4 4 2250 4 3 25.7 0 1200 3 3 25.1 5 2100 4 2 24.5 0 2250 5 3 27.5 0 2900 4 3 23.1 0 1650 4 1 25.9 4 2550 3 3 25.8 0 2300 5 3 27.0 3 2250 3 3 28.5 0 3050 5 1 25.5 0 2750 5 3 23.5 0 1900 3 2 24.0 0 1700 3 1 29.7 5 3850 3 1 26.8 0 2550 5 3 26.7 0 2450 3 1 28.7 0 3200 4 3 23.1 0 1550 3 1 29.0 1 2800 4 3 25.5 0 2250 4 3 26.5 1 1967 4 3 24.5 1 2200 4 3 28.5 1 3000 3 3 28.2 1 2867 3 3 24.5 1 1600 3 3 27.5 1 2550 3 2 24.7 4 2550 3 1 25.2 1 2000 4 3 27.3 1 2900 3 3 26.3 1 2400 3 3 29.0 1 3100 3 3 25.3 2 1900 3 3 26.5 4 2300 3 3 27.8 3 3250 3 3 27.0 6 2500 4 3 25.7 0 2100 3 3 25.0 2 2100 3 3 31.9 2 3325 5 3 23.7 0 1800 5 3 29.3 12 3225 4 3 22.0 0 1400 3 3 25.0 5 2400 4 3 27.0 6 2500 4 3 23.8 6 1800 2 1 30.2 2 3275 4 3 26.2 0 2225 3 3 24.2 2 1650 3 3 27.4 3 2900 3 2 25.4 0 2300 4 3 28.4 3 3200 5 3 22.5 4 1475 3 3 26.2 2 2025 3 1 24.9 6 2300 2 2 24.5 6 1950 3 3 25.1 0 1800 3 1 28.0 4 2900 5 3 25.8 10 2250 3 3 27.9 7 3050 3 3 24.9 0 2200 3 1 28.4 5 3100 4 3 27.2 5 2400 3 2 25.0 6 2250 3 3 27.5 6 2625 3 1 33.5 7 5200 3 3 30.5 3 3325 4 3 29.0 3 2925 3 1 24.3 0 2000 3 3 25.8 0 2400 5 3 25.0 8 2100 3 1 31.7 4 3725 3 3 29.5 4 3025 4 3 24.0 10 1900 3 3 30.0 9 3000 3 3 27.6 4 2850 3 3 26.2 0 2300 3 1 23.1 0 2000 3 1 22.9 0 1600 5 3 24.5 0 1900 3 3 24.7 4 1950 3 3 28.3 0 3200 3 3 23.9 2 1850 4 3 23.8 0 1800 4 2 29.8 4 3500 3 3 26.5 4 2350 3 3 26.0 3 2275 3 3 28.2 8 3050 5 3 25.7 0 2150 3 3 26.5 7 2750 3 3 25.8 0 2200 4 3 24.1 0 1800 4 3 26.2 2 2175 4 3 26.1 3 2750 4 3 29.0 4 3275 2 1 28.0 0 2625 5 3 27.0 0 2625 3 2 24.5 0 2000 ; run; proc means data=crab; run; proc freq data=crab; tables color spine satell y; run; *******************************************************; /* Creating the categorical variable for width and plotting the proportion of satellites and whether satellites are present or not (Y = 1, yes; Y=2, no) versus width. */; data crab1; set crab; wcat=0; if width<=23.25 then wcat=1; if 23.25< width<=24.25 then wcat=2; if 24.25< width<=25.25 then wcat=3; if 25.25< width<=26.25 then wcat=4; if 26.25< width<=27.25 then wcat=5; if 27.25< width<=28.25 then wcat=6; if 28.25< width<=29.25 then wcat=7; if 29.25< width then wcat=8; run; proc print data=crab1; run; proc sql; create table crab2 as select *, sum(y)/sum(n) as prop, mean(width) as wmidpt, sum(y) as yes, sum(n) as cases from crab1 group by wcat; quit; proc sort data=crab2; by width; run; *************************************************; goption reset=all; symbol1 v=dot c=blue h=.7; symbol2 v=dot c=red h=.7; axis1 order=(0 1) label=(angle = 90 'Presence of Satellites'); axis2 label=('Width'); proc gplot data=crab2; plot prop*wmidpt y*width/ overlay vaxis=axis1 haxis=axis2; run; quit; *************************************************; /* Table 5.1, p. 106. Note: The variables LCL and UCL are the lower and upper values respectively, of the 95% confidence interval for the predicted probability. */; proc logistic data=crab2 ; model y = width ; *output out=predict p=pi_hat; run; proc logistic data=crab2 desc; model y = width ; output out=predict p=pi_hat; run; proc print data=predict; run; proc sql; create table pred2 as select *, sum(pi_hat) as predicted_satell, sum(pi_hat)/sum(n) as predicted_prob from predict group by wcat; quit; proc print data=pred2; run; proc sort data=pred2; by wcat; run; data pred3; set pred2; by wcat; if first.wcat; run; proc format; value wcat 1='<=23.25' 2='23.25-24.25' 3='24.25-25.25' 4='25.25-26.25' 5='26.25-27.25' 6='27.25-28.25' 7='28.25-29.25' 8='>29.25'; run; proc print data = pred3; format wcat wcat.; var wcat cases yes prop predicted_prob predicted_satell; run; *****************************************************; /* The model can also be estimated using proc genmod as can be seen in the following output. Proc genmod will provide the likelihood ratio confidence interval for all the parameters in the model including chi-squared tests for all the parameters in the model whereas proc logistic will provide the chi-squared tests as well as many other details such as model fit statistics and odds ratios. Another big difference between the two procedures is that proc genmod is a very general procedure that can handle many different distributions and link functions but it does not in general provide a great deal of residuals or built in options that more specific procedures such as proc logistic provides. */; proc genmod data=crab; model y = width / dist=bin link=identity waldci lrci; run; proc genmod data=crab desc ; model y = width / dist=bin link=logit waldci lrci; run; ***********************************************************; *The predicted probabilities discussed at the top of p. 107. ; proc logistic data=crab desc noprint; model y = width ; output out=predict p=pi_hat; run; proc print data=predict; where width=21 or width=26.3 or width=33.5; var width pi_hat; run; ***********************************************************; /* Variance and covariance can be obtained by using the covout option in the proc statement and the confidence interval for individual predicted values can be obtained by using the upper and lower options in the model statement. Results p. 110. */; proc logistic data=crab2 desc covout outest=temp; model y = width ; output out=predict p=pi_hat upper=ucl lower=lcl; run; proc print data=temp; run; proc print data=predict; run; proc print data=temp; where _type_='COV'; var _name_ intercept width; run; proc sql; select distinct width, pi_hat, lcl, ucl from predict where width= 26.5 ; quit; ************************************************************************; * Table 5.2, p. 112.; proc sql; create table pred2 as select pi_hat, wcat, sum(y) as Num_yes, sum(1-y) as Num_no, sum(pi_hat) as Fitted_yes, sum(1-pi_hat) as Fitted_no from predict group by wcat; quit; proc sort data=pred2; by wcat; run; proc print data=pred2; run; data pred3; set pred2; by wcat; if first.wcat; run; proc print data = pred3; format wcat wcat.; var wcat Num_yes Num_no Fitted_yes Fitted_no; run; ************************************************************************; * Inputting the grouped Crab data, p. 271. ; data grouped; input width cases satell; cards; 22.69 14 5 23.84 14 4 24.77 28 17 25.84 39 21 26.79 22 15 27.74 24 20 28.67 18 15 30.41 14 14 run; * Formatting the variable width to make the output look nice. ; proc format; value width 22.69='<=23.25' 23.84='23.25-24.25' 24.77='24.25-25.25' 25.84='25.25-26.25' 26.79='26.25-27.25' 27.74='27.25-28.25' 28.67='28.25-29.25' 30.41='>29.25'; run; ******************************************************************************; /* Likelihood-ratio model comparisons test from the deviance of the model with width as the predictor and the deviance of the model without any predictors, p. 114. Note: In proc logistic SAS includes the -2log likelihood for the full model and for the model without any predictors. Moreover, the output includes various goodness of fit test in the table labeled Testing Global Null Hypothesis: BETA=0. */; proc logistic data=grouped desc; model satell/cases = width ; run; ******************************************************************; /* Table 5.3, p. 116. Note: The influence option in the model statement provides the deviance residual, the diagonal element of the hat matrix, two confidence interval displacement diagnostics (C and CBAR), the change in the Pearson chi-square statistic (DIFCHSQ), and the change in the deviance (DIFDEV). This option was shown for the model using width as a predictor. */; data grouped; set grouped; id = _n_; run; proc logistic data = grouped desc noprint; model satell/cases = ; output out=temp1 reschi=pearsona p=pi_hata; run; proc print data=temp1; run; data temp1; set temp1; keep id Fitted_yesa pearsona; Fitted_yesa= pi_hata*cases; run; proc print data=temp1; run; ******************************************************************; proc logistic data = grouped desc; model satell/cases= width /influence; output out=temp2 reschi=pearson p=pi_hat h=h; run; proc print data=temp2; run; ******************************************************************; data temp2; set temp2; Fitted_yes=pi_hat*cases; adjres = pearson/sqrt(1-h); keep pearson Fitted_yes adjres cases satell width id pi_hat; run; data combo; merge temp1 temp2; by id; run; proc print data = combo; format width width.; var width cases satell fitted_yesa pearsona Fitted_yes pearson adjres; run; ******************************************************************; * Fig. 5.3, p. 116. ; data temp2; set temp2; prop = satell/cases; run; goption reset = all; symbol1 v=diamond c=blue h=1 i=spline; symbol2 v=dot c=red h=.8 i=none; axis1 order=(0 to 1 by .2) label=(angle=90 'Proportion with Satellites'); axis2 order=(22 to 32 by 2); legend1 label=none value=(height=1 font=swiss 'Fitted' 'Observed' ) position=(bottom right inside) mode=share cborder=black; proc gplot data=temp2; plot (pi_hat prop)*width/ overlay legend=legend1 vaxis=axis1 haxis=axis2; run; quit; goptions reset=all; ******************************************************************; * Table 5.4, p. 118. ; proc logistic data = grouped desc noprint; model satell/cases= ; output out=temp1 difchisq=Pearson_diffa difdev=Likelihood_ratio_diffa; run; proc print data=temp1; run; data temp1; set temp1; keep id Pearson_diffa Likelihood_ratio_diffa; run; proc logistic data = grouped desc noprint; model satell/cases = width ; output out=temp2 difchisq=Pearson_diff difdev=Likelihood_ratio_diff c=c dfbetas=dfbeta_int dfbeta_width; run; proc print data=temp2; data temp2; set temp2; keep id width Pearson_diff Likelihood_ratio_diff c dfbeta_width; run; data combo; merge temp2 temp1; by id; run; proc print data = combo; format width width.; var width dfbeta_width c Pearson_diff Likelihood_ratio_diff Pearson_diffa Likelihood_ratio_diffa; run; ************************************************************; * Inputting the aids data, table 5.5, p. 119. ; data aids1; input race1 azt1 symptoms freq race2 azt2 race3 azt3; cards; 1 1 1 14 0 0 1 1 1 1 0 93 0 0 1 1 1 0 1 32 0 1 1 -1 1 0 0 81 0 1 1 -1 0 1 1 11 1 0 -1 1 0 1 0 52 1 0 -1 1 0 0 1 12 1 1 -1 -1 0 0 0 43 1 1 -1 -1 ; run; proc print data=aids1; run; *************************************************************; /* Fitting a Logit model using different dummy and effect coding. In table 5.6, p. 121, the Last=zero column corresponds to the parameter estimates obtained by using dummy variables race1 and azt1. The First=zero column corresponds to the parameter estimates obtained using dummy variables race2 and azt2. The Sum=zero column corresponds to the parameter estimates obtained using the effect coded variables race3 and azt3. Note: The option descending (desc) in the proc statement so that the lower value, in this case symptoms = zero, is defined as the nonevent. */; proc genmod data=aids1 descending; model symptoms = race1 azt1/ dist=bin link=logit; weight freq; run; proc genmod data=aids1 desc; model symptoms = race2 azt2/ dist=bin link=logit; weight freq; run; proc genmod data=aids1 desc; model symptoms = race3 azt3/ dist=bin link=logit; weight freq; run; ***************************************************************; /* Inputting the aids data and fitting a logit model using the code in Table A.9, p. 272. Note: The parameter estimates from the first model corresponds to the column labeled Last=zero and the estimated from the second model corresponds to the column labeled First=zero. */; data aids; input race $ azt $ yes no @@; cases = yes + no; cards; white y 14 93 white n 32 81 black y 11 52 black n 12 43 ; run; proc genmod data=aids order=data; class race azt; model yes/cases = race azt / dist=bin link=logit obstats type3; run; proc genmod data = aids desc; class race azt; model yes/cases = race azt/ dist=bin link=logit; run; *******************************************************; * Linear Logit Model; * Alcohol and Malformation; data alc; input Alc yes no @@; n=yes+no; cards; 0 48 17066 0.5 38 14464 1.5 5 788 4.0 1 126 7.0 1 37 ; run; proc print data=alc; run; proc genmod data = alc order=data; class alc ; model yes/n = alc / dist=bin link=logit; run; proc genmod data = alc order=data; class alc ; model yes/n = / dist=bin link=logit; run; proc genmod data = alc order=data; model yes/n =alc / dist=bin link=logit; run; data alc; set alc; if alc >= 1.5 then alc2=1; else alc2=0; run; proc print data=alc; run; proc genmod data = alc order=data; model yes/n =alc2 / dist=bin link=logit; run; **********Chcoran-Armitage Trend Test; data alc; input Alc mal count @@; cards; 0 1 48 0 0 17066 0.5 1 38 0.5 0 14464 1.5 1 5 1.5 0 788 4.0 1 1 4.0 0 126 7.0 1 1 7.0 0 37 ; run; proc print data=alc; run; proc freq data=alc order=data; tables mal*alc / chisq cmh ; weight count; run; proc freq data=alc order=data; tables mal*alc / chisq cmh1 ; weight count; run; ****************************************************************; /* Results from the logistic model with width and dummy variables for the categorical variable color as predictors. In order generate the predicted probabilities at the bottom of p. 123 two new observations were added to the dataset. */; data dummy; if _n_=1 then do; width=26.3; c1=1; c2=0; c3=0; output; width=26.3; c1=0; c2=0; c3=0; output; end; set crab; c1 = 0; if color=1 then c1=1; c2 = 0; if color=2 then c2=1; c3 = 0; if color=3 then c3=1; output; run; proc print data=dummy; run; proc logistic data = dummy desc; model y = c1 c2 c3 width; output out=temp p=pi_hat; run; proc print data = temp; where y=.; var width c1 c2 c3 pi_hat; run; *****************************************************; * Fig. 5.4, p. 124. ; proc sort data=temp; by width; run; data temp1; set temp; if c1=1 then pi1 = pi_hat ; else pi1= . ; if c2=1 then pi2=pi_hat; else pi2=.; if c3=1 then pi3=pi_hat; else pi3=.; if c2=0 and c1=0 and c3=0 then pi4=pi_hat; if c1 = 1 or c2=1 or c3=1 then pi4=.; run; goptions reset=all; symbol1 c=blue i=spline width=2 ; symbol2 c=red i=spline w=2 ; symbol3 c=green i=spline w=2 ; symbol4 c=cyan i=spline w=2 ; axis1 order=(0 to 1 by .1) label=(angle=90 'Est. prob.'); legend1 label=none value=(height=1 font=swiss 'Color 1' 'Color 2' 'Color 3' 'Color 4' ) position=(bottom right inside) mode=share cborder=black; proc gplot data=temp1; plot (pi1 pi2 pi3 pi4)*width / vaxis=axis1 overlay legend=legend1; run; quit; *****************************************************; /* Testing the main effect of color by testing the the three dummy variables simultaneously, p. 124. Note: There are several ways to accomplish this. One way is to add a test statement which uses a Wald Chi-squared test and SAS will provide the test statistic and a p-value. Another way is to run the model with and without the variables to be tested and then take the difference of the -2logL in the Model Fit Statistics table and compare this difference to the Chi-squared distribution with n degrees of freedom (where n=number of variables being tested). */; proc logistic data = dummy desc; model y = c1 c2 c3 width; test c1=c2=c3=0; run; proc logistic data = dummy desc; model y=width; run; proc logistic data = dummy desc; model y= color width; run; data dummy; set dummy; cscore=1; if color=4 then cscore=0; run; proc logistic data = dummy desc; model y= cscore width; run; *****************************************************; /* Estimating the main effects with Crab data, table 5.7, p. 127. Note: The type3 option tells SAS to test the main effects as well as the dummy variables for the categorical variables. */; proc genmod data = crab desc; class color spine ; model y = weight width color spine / dist=bin link=logit type3; run; proc logistic data = crab desc; class color spine / PARAM=REFERENCE ref=last ; model y = weight width color spine ; run; proc logistic data = crab desc; class color spine / PARAM=REFERENCE ref=last ; model y = weight width color spine / selection=stepwise; run; *****************************************************; /* Obtaining the same results using dummy variables and proc logistic. Note: Only proc logistic will provide the likelihood-ratio test comparing the full model to the null model. */; data main; set crab; c1 = 0; if color=1 then c1=1; c2 = 0; if color=2 then c2=1; c3 = 0; if color=3 then c3=1; spine1=0; if spine=1 then spine1=1; spine2=0; if spine=2 then spine2=1; run; proc print data=main; run; proc logistic data = main desc; model y = c1 c2 c3 spine1 spine2 width weight; test c1=c2=c3=0; test spine1=spine2=0; run; ******************************************************************; * The first four rows of the deviance and df columns of table 5.8, p. 128. ods listing close; proc genmod data=crab desc ; class color spine; model y = color|spine|width / dist=bin link=logit type3; ods output modelfit=temp; run; proc print data=temp; run; proc genmod data=crab desc; class color spine; model y = color|spine|width@2 / dist=bin link=logit type3; ods output modelfit=temp1; run; proc print data=temp1; run; proc genmod data=crab desc; class color spine; model y = color spine width color*spine spine*width / dist=bin link=logit type3; ods output modelfit=temp2; run; proc print data=temp2; run; proc genmod data=crab desc; class color spine; model y = color spine width color*width spine*width / dist=bin link=logit type3; ods output modelfit=temp3; run; proc print data=temp3; run; ods output close; ods listing; data combo; set temp temp1 temp2 temp3; run; proc print data = combo; where Criterion='Deviance'; var criterion df value; run; ******************************************************************; * The Diarrhea Example: ; data diarrhea; input cep age stay case count; datalines; 0 0 0 0 385 0 0 1 5 233 0 1 0 3 789 0 1 1 47 1081 1 1 1 5 5 ; run; proc logistic data = diarrhea descending exactonly; model case/count = age stay cep; exact 'parm' age stay cep / estimate = parm; run;