/* */ /* This is version 1.0.1 of the reldist functions for SAS 7 or later. */ /* */ /* by Don Gensimore and Mark S. Handcock */ /* */ /* August 11, 2000 */ /* */ /* See the README file for details. */ /* See the documentation for the SPlus function for additional details */ /* */ /* This file contains S-PLUS functions used to construct figures in */ /* */ /* "Relative Distribution Methods in the Social Sciences" */ /* by Mark S. Handcock and Martina Morris, */ /* Springer-Verlag, 1999, ISBN 0387987789 */ /* */ /* We make no claims that the code given */ /* is as efficient as it could be, and we would be interested to learn */ /* of more efficient approaches. */ /* */ /* Other software is available from the Relative Distribution Website:*/ /* */ /* http://www.csde.washington.edu/~handcock/RelDist */ /* */ /* A link to the website is maintained by the publisher at: */ /* */ /* http://www.springer-ny.com/stats */ /* */ /* under the heading "Author/Editor Home Pages." */ /* The website contains descriptions of the variables */ /* and data file formats. */ /* */ OPTIONS COMPRESS=NO MPRINT MLOGIC SYMBOLGEN LINESIZE=130; %macro reldist(ComparisonDataset=, ComparisonVariable=, ComparisonWeight=, ReferenceDataset=, ReferenceVariable=, ReferenceWeight=, OutputDataset=, z=, zo=, match=none, how=add, location=median, scale=IQR, robust=F, graph=T ); %local ComparisonVariable ComparisonWeight ReferenceVariable ReferenceWeight z zo match how location scale robust AdjustedComparisonWeight AdjustedReferenceWeight; %let how = %upcase(&how); %let location = %upcase(&location); %let match = %upcase(&match); %let scale = %upcase(&scale); %let robust = %upcase(&robust); OPTIONS COMPRESS=NO; ******************************************************************************************************; * ComparisonDataset comparison dataset * ComparisonVariable comparison variable y * ComparisonWeight comparison weight ywgt * ReferenceDataset reference dataset * ReferenceVariable reference variable yo * ReferenceWeight reference weight yowgt * OutputDataset output dataset list(x=x,y=y,wgt=ywgt,...) * Number observations - comparison m * Number observations - reference n * z Covariate "how" option * zo Covariate "how" option * match type of relative distribution * how form of matching to comparison sample * location how to measure "mean" or "median" * scale how to measure "standev" or "IQR" * robust ******************************************************************************************************; ******************************************************************************************************; * Verify that there are no missing weights in the Comparison or Reference datasets If there are, substitute 0 for missing, and 1 for non-missing ******************************************************************************************************; ******************************************************************************************************; * create copies of datasets so orginals remain intact, note which dataset is comparison and which * dataset is reference ; * sum the weights, divide individual weights by sum so that the individual weights sum to 1; ******************************************************************************************************; proc sql; create table CmpDset1 as select &ComparisonWeight/sum(&ComparisonWeight) as AdjustedComparisonWeight, &ComparisonVariable as y, COUNT(&ComparisonVariable) as m, &ComparisonWeight as ywgt from &ComparisonDataset; create table RefDset1 as select &ReferenceWeight/sum(&ReferenceWeight) as AdjustedReferenceWeight, COUNT(&ReferenceVariable) as n, &ReferenceVariable as yo, &ReferenceWeight as yowgt from &ReferenceDataset; proc univariate data=CmpDset1 NOPRINT; var y; weight ywgt; output out=uni_comp MEDIAN=med_y MEAN=mean_y; data _null_; set uni_comp; call symput("med_y",med_y); call symput("mean_y",mean_y); proc univariate data=RefDset1 NOPRINT; var yo; weight yowgt; output out=uni_ref MEDIAN=med_yo MEAN=mean_yo; data _null_; set uni_ref; call symput("med_yo",med_yo); call symput("mean_yo",mean_yo); run; ********************************************************************************************************************; * Do matching, can be covariate/multiplicative/additive ; ********************************************************************************************************************; %if %substr(&how,1,1) eq C %then %do; %put Covariate method not implemented at this time; %end; %if %substr(&match,1,4) eq NONE %then %do; data RefDset; keep Vector VectWgt source; set RefDset1 end=finish; Vector = yo; VectWgt = AdjustedReferenceWeight; source = 'R'; b+1; if finish then do; call symput("N",b); end; data CmpDset; keep Vector VectWgt source; set CmpDset1 end=finish; Vector = y ; VectWgt = AdjustedComparisonWeight; source = 'C'; a+1; if finish then do; call symput("M",a); end; %end; %else %do; %if %substr(&how,1,1) eq M %then %do; %* Start Multiplicative ; %if %substr(&location,1,1) eq M %then %do; %* Start Location = "Median" or "Mean"; data RefDset; keep Vector VectWgt source; set RefDset1 end=finish ; Vector = yo; VectWgt = AdjustedReferenceWeight; source = 'R'; b+1; if finish then call symput("N",b); %end; %* End Location = "Median"; %else %do; %* Not median, use mean; %* "Remove this later"; data RefDset; keep Vector VectWgt source; set RefDset1 end=finish ; Vector = yo ; VectWgt = AdjustedReferenceWeight; source = 'R'; b+1; if finish then call symput("N",b); %end; %* End Location = "Mean"; %if %substr(&match,1,1) eq L %then %do; %* Start Match = "Location"; data CmpDset; keep Vector VectWgt source; set CmpDset1 end=finish ; Vector = y ; VectWgt = AdjustedComparisonWeight; source = 'C'; a+1; if finish then call symput("M",a); %if %substr(&location,1,3) eq MED %then %do; %* Start Location = "Median"; data RefDset; keep Vector VectWgt source; set RefDset1 end=finish ; Vector = &med_y * yo / &med_yo; VectWgt = AdjustedReferenceWeight; source = 'R'; b+1; if finish then call symput("N",b); %end; %* End Location = "Median"; %else %do; %* Not median, use mean; %* Start Location = "Mean"; data RefDset; keep Vector VectWgt source; set RefDset1 end=finish ; Vector = &mean_y * yo / &mean_yo; VectWgt = AdjustedReferenceWeight; source = 'R'; b+1; if finish then call symput("N",b); %end; %* End Location = "Mean"; %end; %* End Match = "Location"; %if %substr(&match,1,1) eq S %then %do; %* Start Match = "Shape"; data CmpDset; keep Vector VectWgt source; set RefDset1 end=finish; Vector = yo; VectWgt = AdjustedReferenceWeight; source = 'C'; a+1; if finish then call symput("M",a); %if %substr(&location,1,3) eq MED %then %do; %* Start Location = "Median"; data RefDset; keep Vector VectWgt source; set RefDset1 end=finish ; Vector = &med_y * yo / &med_yo; VectWgt = AdjustedReferenceWeight; source = 'R'; b+1; if finish then do; call symput("M",b); call symput("N",b); end; %end; %* End Location = "Median"; %else %do; %* Not median, use mean; %* Start Location = "Mean"; data RefDset; keep Vector VectWgt source; set RefDset1 end=finish ; Vector = &mean_y * yo / &mean_yo; VectWgt = AdjustedReferenceWeight; source = 'R'; b+1; if finish then do; call symput("M",b); call symput("N",b); end; %end; %* End Location = "Mean"; %end; %* End Match = "Shape"; %end; %* End How = "multiplicative"; %if %substr(&how,1,1) eq A %then %do; %* Start How = "Additive"; %if %substr(&match,1,1) eq N %then %do; %* Start Match = "None"; data CmpDset; keep Vector VectWgt source; set CmpDset1 end=finish ; Vector = y ; VectWgt = AdjustedComparisonWeight; source = 'C'; a+1; if finish then call symput("M",a); %if %substr(&location,1,1) eq M %then %do; %* Start Location = "Median" or "Mean"; data RefDset; keep Vector VectWgt source; set RefDset1 end=finish ; Vector = yo; VectWgt = AdjustedReferenceWeight; source = 'R'; b+1; if finish then call symput("N",b); %end; %* End Location = "Median" or "Mean"; %else %do; %* Not median, use mean; %* "Remove this later"; data RefDset; keep Vector VectWgt source; set RefDset1 end=finish ; Vector = yo ; VectWgt = AdjustedReferenceWeight; source = 'R'; b+1; if finish then call symput("N",b); %end; %* End Location = "Remove this later"; %end; %* End Match = "None"; %if %substr(&match,1,1) eq L %then %do; %* Start Match = "Location"; data CmpDset; keep Vector VectWgt source; set CmpDset1 end=finish ; Vector = y; VectWgt = AdjustedComparisonWeight; source = 'C'; a+1; if finish then call symput("M",a); %if %substr(&location,1,3) eq MED %then %do; %* Start Location = "Median"; data RefDset; keep Vector VectWgt source; set RefDset1 end=finish ; Vector = &med_y + yo - &med_yo; VectWgt = AdjustedReferenceWeight; source = 'R'; b+1; if finish then call symput("N",b); %end; %* End Location = "Median"; %else %do; %* Not median, use mean; %* Start Location = "Mean"; data RefDset; keep Vector VectWgt source; set RefDset1 end=finish ; Vector = &mean_y + yo - &mean_yo; VectWgt = AdjustedReferenceWeight; source = 'R'; b+1; if finish then call symput("N",b); %end; %* End Location = "Mean"; %end; %* End Match = "Location"; %if %substr(&match,1,1) eq S %then %do; %* Start Match = "Shape"; data CmpDset; keep Vector VectWgt source; set RefDset1 end=finish; Vector = yo; VectWgt = AdjustedReferenceWeight; source = 'C'; a+1; if finish then call symput("M",a); %if %substr(&location,1,3) eq MED %then %do; %* Start Location = "Median"; data RefDset; keep Vector VectWgt source; set RefDset1 end=finish ; Vector = &med_y + yo - &med_yo; VectWgt = AdjustedReferenceWeight; source = 'R'; b+1; if finish then do; call symput("M",b); call symput("N",b); end; %end; %* End Location = "Median"; %else %do; %* Not median, use mean; %* Start Location = "Mean"; data RefDset; keep Vector VectWgt source; set RefDset1 end=finish ; Vector = &mean_y + yo - &mean_yo; VectWgt = AdjustedReferenceWeight; source = 'R'; b+1; if finish then do; call symput("M",b); call symput("N",b); end; %end; %* End Location = "Mean"; %end; %* End Match = "Shape"; %end; %* End How = "additive"; %end; %* End match ******************************************************************************************************; * Rank the reference, comparison, and joint datasets. Use the "mean" to break ties on the reference and comparison datasets. Use "high" to break ties on the joint dataset.; ******************************************************************************************************; proc rank data=RefDset out=RefDset2 ties=mean; var Vector; ranks RankRefVect; proc rank data=CmpDset out=CmpDset2 ties=mean; var Vector; ranks RankCmpVect; data JntVect; /* Create the joint vector {yo,y} */ set RefDset2 CmpDset2; proc rank data=JntVect out=JntVect2 ties=high; var Vector; ranks RankJntVect; proc sort data=JntVect2; by RankJntVect; title1 'Joint vector with ranks'; run; proc print data=JntVect2 (obs=20); /* TESTING ONLY */ ******************************************************************************************************; * Calculate the cumulative weights. If the observation originated in the reference dataset, then add the "adjusted weight" (i.e. weight divided by sum of weights) to the running total. If the observation was from the comparison dataset, then set this value to 0. If "_rc_" (rank in comparison dataset) is not missing, the observation was from the comparison dataset, so set the variable "ry_sy" to be the rank of the value in the joint dataset less the rank of the same value in the comparison data.; ******************************************************************************************************; data cumulative(keep = SOURCE RankCmpVect RankJntVect RankRefVect RY_SY X XWGT m n); * &OutputDataset (keep = X XWGT VectWgt m n); drop csyowgt_ ; retain csyowgt_ 0 L_R_YO_N 0 xwgt 0; set JntVect2; m=&M; n=&N; if source eq 'R' then do; csyowgt_ = csyowgt_ + VectWgt; L_R_YO_N = RankRefVect; end; if RankCmpVect ne . then ry_sy = RankJntVect - RankCmpVect; else ry_sy = .; if source eq 'C' and (ry_sy le L_R_YO_N) then do; * xwgt = sum(VectWgt,xwgt); xwgt = VectWgt; x = csyowgt_; if x ge 1 then x = .9999999; * x = round(x,.000000001); * xwgt = round(xwgt,.000000001); output; end; title1 'Dataset containing cumulative weights'; run; proc print data=cumulative (obs=300); /* TESTING ONLY */ format x xwgt 11.9 RankCmpVect RankJntVect RankRefVect RY_SY 5. SOURCE $1.; *************************************************************************************************************; * Create a table by selecting all observations where "ry_sy" is equal to 0. They are the values in the comparison data whose rank is lower than the lowest value in the reference data. Create 2 tables by selecting the appropriate variables from the dataset created above. What we are doing is splitting the data into two parts, and then merging it back together based on the condition that the rank of the variable in the reference dataset is equal to the difference in the ranks between the combined data and the comparison data (provided they are not missing). Combine the "0" table with the merged table; *************************************************************************************************************; /* proc sql; create table null as select X, ry_sy, xwgt from cumulative where (ry_sy eq 0); create table part1 as select ry_sy, X, xwgt, RankRefVect from cumulative; create table part2 as select X, xwgt, ry_sy, RankRefVect from cumulative; create table part1_2 as select * from part1, part2 where (RankRefVect eq ry_sy and RankRefVect ne . and ry_sy ne .); */ data &OutputDataset; keep x xwgt; set cumulative; title1 'Output dataset with relative distribution'; run; proc print data=&OutputDataset (obs=200) split='*' /* (where=(ry ge 3800))*/ ; format X xwgt 11.9 ; label xwgt = 'Weight' X = 'Cumulative*sum of*weights("X")'; %if &graph eq T %then %graph_reldist; %mend; %macro graph_reldist; goptions ftext=SWISS ctext=BLACK htext=1 cells; *** Distribution Plots ***; title1 'Relative Distribution Histogram'; symbol v=STAR c=BLUE h=1 cells; axis1 width=2 order=0.05 to .95 by .1 label=('Proportion of the Reference Group'); axis2 width=2 label=('Relative Density'); symbol1 c=white w=3; proc capability data=cumulative vardef=N NOPRINT ; var X; HISTOGRAM / cfill=BLUE pfill=SOLID midpoints = 0.05 to 0.95 by .1 VREF=.10 vscale=proportion wbarline=0 caxes=BLACK cframe=WHITE haxis=axis1 vaxis=axis2; run; symbol; goptions ftext= ctext= htext=; axis1; %mend; %macro calc_polarization(dataset=,x=x,weight=wgt,idataset=); data prelim; set &dataset; w = abs(&x - 0.5); if &x gt 0.5 then selu = 1; else selu = 0; if &x le 0.5 then sell = 1; else sell = 0; w2 = w*w; proc means data=prelim NOPRINT SUM VAR; var w w2; weight &weight; output out=wgt_w sum=sum_w sum_w2 var=var_w var_w2; proc means data=prelim NOPRINT SUM; var &weight; output out=wgt_sum sum=sum_wgt; proc means data=prelim(where=(selu eq 1)) NOPRINT SUM VAR; var w w2; weight &weight; output out=wgt_w_u sum=sum_w_u sum_w2_u var=var_w_u var_w2_u; proc means data=prelim(where=(selu eq 1)) NOPRINT SUM; var &weight; output out=wgt_sum_u sum=sum_wgt_u; proc means data=prelim(where=(sell eq 1)) NOPRINT SUM VAR; var w w2; weight &weight; output out=wgt_w_l sum=sum_w_l sum_w2_l var=var_w_l var_w2_l; proc means data=prelim(where=(sell eq 1)) NOPRINT SUM; var &weight; output out=wgt_sum_l sum=sum_wgt_l; proc sql; create table rel_dist_parts as select * from wgt_sum, wgt_w, wgt_w_u, wgt_sum_u, wgt_w_l, wgt_sum_l; data prelim; set &idataset; w = abs(&x - 0.5); if &x gt 0.5 then selu = 1; else selu = 0; if &x le 0.5 then sell = 1; else sell = 0; w2 = w*w; proc means data=prelim NOPRINT SUM VAR; var w w2; weight &weight; output out=wgt_w sum=sum_w sum_w2 var=var_w var_w2; proc means data=prelim NOPRINT SUM; var &weight; output out=wgt_sum sum=sum_wgt; proc means data=prelim(where=(selu eq 1)) NOPRINT SUM VAR; var w w2; weight &weight; output out=wgt_w_u sum=sum_w_u sum_w2_u var=var_w_u var_w2_u; proc means data=prelim(where=(selu eq 1)) NOPRINT SUM; var &weight; output out=wgt_sum_u sum=sum_wgt_u; proc means data=prelim(where=(sell eq 1)) NOPRINT SUM VAR; var w w2; weight &weight; output out=wgt_w_l sum=sum_w_l sum_w2_l var=var_w_l var_w2_l; proc means data=prelim(where=(sell eq 1)) NOPRINT SUM; var &weight; output out=wgt_sum_l sum=sum_wgt_l; proc sql; create table irel_dist_parts as select * from wgt_sum, wgt_w, wgt_w_u, wgt_sum_u, wgt_w_l, wgt_sum_l; data cipieces; set rel_dist_parts; rpm = 4 * sum_w / sum_wgt - 1; * c1 = var_w; c1 = sum_w2/sum_wgt - (sum_w/sum_wgt)**2; rpu = 4 * sum_w_u / sum_wgt_u - 1; * c1u = var_w_u; c1u = sum_w2_u/sum_wgt_u - (sum_w_u/sum_wgt_u)**2; rpl = 4 * sum_w_l / sum_wgt_l - 1; * c1l = var_w_l; c1l = sum_w2_l/sum_wgt_l - (sum_w_l/sum_wgt_l)**2; m = _freq_; data icipieces; set irel_dist_parts; irpm = 4 * sum_w / sum_wgt - 1; * c2 = var_w; c2 = sum_w2/sum_wgt - (sum_w/sum_wgt)**2; irpu = 4 * sum_w_u / sum_wgt_u - 1; * c2u = var_w_u; c2u = sum_w2_u/sum_wgt_u - (sum_w_u/sum_wgt_u)**2; irpl = 4 * sum_w_l / sum_wgt_l - 1; * c2l = var_w_l; c2l = sum_w2_l/sum_wgt_l - (sum_w_l/sum_wgt_l)**2; n = _freq_; data final; merge cipieces icipieces; label rpm='polarization' rpml='95% lower bound for polarization' rpmu='95% upper bound for polarization'; label rpl='lower polarization' rpll='95% lower bound for lower polarization' rplu='95% upper bound for lower polarization'; label rpu='upper polarization' rpul='95% lower bound for upper polarization' rpuu='95% upper bound for upper polarization'; serp = 4*sqrt(c1/m + c2/n); serpu = 8*sqrt(c1u/m + c2u/n); serpl = 8*sqrt(c1l/m + c2l/n); rpml = rpm - 1.96*serp; rpmu = rpm + 1.96*serp; rpll = rpl - 1.96*serpl; rplu = rpl + 1.96*serpl; rpul = rpu - 1.96*serpu; rpuu = rpu + 1.96*serpu; proc print data=final split='*' nobs; var rpm rpml rpmu rpl rpll rplu rpu rpul rpuu; %mend;