# # This is version 1.0.2 of the reldist functions. # by Mark S. Handcock # # February 16, 2001 # # See the README file for details. # # This file contains S-PLUS functions used to construct figures in # # "Relative Distribution Methods in the Social Sciences" # by Mark S. Handcock and Martina Morris, # Springer-Verlag, 1999, ISBN 0387987789 # # We make no claims that the code given # is as efficient as it could be, and we would be interested to learn # of more efficient approaches. # # Other software is available from the Relative Distribution Website: # # http://www.csde.washington.edu/~handcock/RelDist # # The website contains related software, descriptions of the variables # and data file formats. # assign("RDlib.loc", "./library") # # relative distribution and density functions for continuous data # reldist <- function(y, yo=F, ywgt=F,yowgt=F, match="none", how="add", location="median", scale="IQR", rpmult=F, z=F, zo=F, smooth = 0.35, cdfplot=F, ci=F, bar="no", add=F, graph=T, type="l", xlab="Reference proportion",ylab="Relative Density",yaxs="r", yolabs=pretty(yo), yolabslabs=NULL, ylabs=pretty(y), yolabsloc=0.6, ylabsloc=1, ylim=NULL, cex=0.8, lty=1, binn=100, aicc=seq(0.1,1.8, by=0.05), deciles=(0:10)/10, method="binning", ...) { # # # INPUT variables # # required variables # y sample from comparison distribution # yo sample from reference distribution # # smoothness variables # smooth degree of smoothness required in the fit. Higher values lead # to smoother curves, lower to closer fits to the observed # data. # If the local-linear binning estimator is used it is the span # of the smoother as a proportion of [0,1]. # If the logspline estimator is used it is the number # of knots used in the spline fit. If it is not specified # the number of knots that minimizes the BIC is used. # # # plotting variables # graph graph the results on the current device # bar graph the deciles on the current device # "no" no deciles plotted # "yes" deciles plotted with the non-paramteric fit # "only" plot the deciles only and not the non-parametric fit. # add add the density to the current plot? # ylim plotting limits for the vertical axis # lty line type to use for the density # xlab horizontal label # ylab vertical label # ylabs locations for labels to be added to the right axis # ylabslabs labels indicating the original scale # ylabsloc distance of labels to right of axis (in lines) # yolabs locations for labels to be added to the top axis # yolabslabs labels indicating the original scale # yolabsloc distance of labels above axis (in lines) # yaxs style of vertical axis # # options # cdfplot calculate and plot the CDF rather than the density. # ci include 95% pointwise confidence bands on the plot? # ywgt weights on the comparison sample # yowgt weights on the reference sample # match type of relative distribution to produce # (the reference is always matched to the comparison) # none comparison to reference # location matched reference to reference # shape comparison to matched reference # how form of matching to the comparison sample # mult multiplitively scale the reference # add additively shift the reference # ls location/scale shift the reference # covariate location/scale shift the reference # (requires z and zo to be specified) # location how to measure location: "mean", "median" # scale how to measure scale: "standev", "IQR" # rpmult in calculation of polarization indices ONLY, multiplicatively # scale the reference sample to # the comparison sample before comparing the two distributions? # # debugging options # binn number of bins used in the smoother # yaxs style of vertical axis # # # OUTPUT list components # x horizontal ordinates for the density (typically percent) # y density at x # rp 95% confidence interval for the median relative polarization # as lower bound, estimate, upper bound. # rpl lower relative polarization # 95% confidence interval for the lower relative polarization # as lower bound, estimate, upper bound. # rpu upper relative polarization # 95% confidence interval for the upper relative polarization # as lower bound, estimate, upper bound. # cdf list for the CDF # x horizontal ordinates for the CDF (typically percent) # y CDF at x # # # NOTE: Most of the code is for the plotting and tinkering. # The guts of the method are forming the relative data # at the top. The rest is a standard fixed interval # density estimation with a few bells and whistles. # # ------------------------------------------------------------------------- # # if(how=="covariate"){ # stop("This has not been implemented yet. sorry. MSH") # } if(is.logical(bar)){ if(bar){bar <- "yes"}else{bar <- "no"} } if(missing(yo)){ m <- length(y) if(missing(ywgt)){ywgt <- rep(1/m,length=m)} y <- as.vector(y) x <- sort.list(y) ywgt <- as.vector(ywgt)[x] x <- y[x] m <- length(x) n <- 0 rmd <- list(x=x,wgt=ywgt,n=n,m=m) }else{ rmd <- rmdata(y=as.vector(y), yo=as.vector(yo), ywgt=as.vector(ywgt), yowgt=as.vector(yowgt), match=match, how=how, location=location, scale=scale, z=as.vector(z), zo=as.vector(zo) ) y <- rmd$y yo <- rmd$yo ywgt <- rmd$ywgt yowgt <- rmd$yowgt x <- rmd$x n <- rmd$n m <- rmd$m } r <- seq(0, 1, length = binn + 1)[-1] - 0.5/binn # # Now work on the data # if(method=="logspline"){ # # Use Kooperberg and Stone's logspline density estimation # aicc <- NULL if(missing(smooth)){ warning( "Make sure you have the `nlsd` library to choose the smoothing parameter" ) smooth <- 8 yl <- nlsd.fit(x, wgt=m*ywgt, nknots=smooth, lbound = 0, ubound = 1, wttype=0) smooth <- yl$nknots gpdf <- dnlsd(r, fit=yl) # # calc the CDF # cdfgr <- pnlsd(r, fit=yl) }else{ warning("Make sure you have the `logspline` library to use this option") yl <- logspline.fit(uncensored=x, nknots=smooth, lbound = 0, ubound = 1, delete=F) gpdf <- dlogspline(r, fit=yl) # # calc the CDF # cdfgr <- plogspline(r, fit=yl) } if(really.missing(ywgt)){ cdfg <- seq(along=x)/m }else{ cdfg <- cumsum(ywgt) } if(really.missing(yowgt)){ xcdfg <- seq(along=x)/m }else{ cdfg <- cumsum(ywgt) } }else{ # # By default use binning and local polynomial methods # if(really.missing(ywgt)){ xx <- table(cut(x, breaks = seq(0, 1, length = binn + 1), include.lowest=T))/m }else{ ywgt <- ywgt/sum(ywgt) xxx <- cut(x, breaks = seq(0, 1, length = binn + 1),include.lowest=T) xx <- rep(0,length=binn) for (i in unique(xxx)){ xx[i] <- sum(ywgt[xxx==i],na.rm=T) } } xx <- as.vector(xx) # if(method=="quick"){ # # Anscombe transformation to stabilize variances # not quite as good # aicc <- NULL vstxx <- 2*sqrt(m*xx + 3/8) yl <- loess(vstxx ~ r, span = smooth, degree = 1) gpdf <- yl$fitted.values * yl$fitted.values/4 - 3/8 gpdf[gpdf<0] <- 0 } if(method=="adaptive"){ # # Use ksmooth density estimation # and Ricardo Cao adaptive method to choose the bandwidth # aicc <- relasp(y=y,yo=yo,binn=binn)[,2] aicc[aicc > 3*mean(aicc)] <- mean(aicc) aicc[aicc < mean(aicc)/3] <- mean(aicc)/3 # aicc[aicc < 0.03] <- 0.03 gpdf <- r xdat <- c(-x,x,2-x) for(i in (1:binn)){ # cat(i,r[i],aicc[i],"\n") gpdf[i] <- ksmooth(xdat, kernel="normal", bandwidth=2.696*aicc[i], x.points=r[i])$y*3 } # scalef <- binn/sum(gpdf) # gpdf <- gpdf * scalef # }else{ # # Use local-likelihood density estimation # and aicc to choose the bandwidth # based on a grid search # gamdata <<-data.frame(x=r,y=m*xx) if(missing(smooth)){ aicc <- cbind(aicc,aicc) dimnames(aicc)[[2]] <- c("smooth","aicc") for(i in seq(along=aicc[,1])){ smwidth <<- aicc[i,1] yl <- gam(y ~ lo(x, degree = 2, smwidth), family = "poisson", data=gamdata) df <- yl$nl.df + 2 if(df >= binn - 2){ aicc[i,2] <- Inf }else{ aicc[i,2] <- log(yl$deviance) + (1 + df/binn)/(1 - df/binn - 2/binn) } } aicc <- aicc[order(aicc[,2]),] smooth <- aicc[1,1] } smwidth <<- smooth yl <- gam(y ~ lo(x,degree = 2, smwidth), family = "poisson", data=gamdata) gpdf <- predict(yl, type = "response") # scalef <- binn/sum(gpdf) gpdf <- gpdf * scalef if(ci){ gpdfse <- predict.gam(yl, type = "response", se.fit=T)$se.fit gpdfse <- as.vector(gpdfse) * scalef } } # # calc the CDF # if(really.missing(ywgt)){ cdfg <- seq(along=x)/m }else{ cdfg <- cumsum(ywgt) } # # next is the old way # # cdfgr <- approx(x=x,y=cdfg,xout=r)$y # cdfgr <- cumsum(gpdf)/sum(gpdf) } # # calc deciles # if(really.missing(ywgt)){ xx <- table(cut(x, breaks = deciles, include.lowest=T))/m }else{ ywgt <- ywgt/sum(ywgt) xxx <- cut(x, breaks = deciles, include.lowest=T) xx <- rep(0,length=length(deciles)) for (i in unique(xxx)){ xx[i] <- sum(ywgt[xxx==i],na.rm=T) } } cdfgdd <- as.vector(xx)[-length(deciles)] # # next the old way # # cdfgdd <- approx(x=c(0,seq(along=x)/m,1),y=c(0,cdfg,1), # xout=deciles[-c(1,length(deciles))])$y # cdfgdd[is.na(cdfgdd)] <- 1 # cdfgdd <- diff(c(0,cdfgdd,1)) # # Calculate the entropy of the relative distribution # egv <- gpdf[gpdf > 0] entropy <- sum(egv*log(egv))/binn # if(method == "histogram"){ bar <- "yes" type <- "n" } if(missing(ylim)){ if(bar!="no"){ add <- T barplot(height=(length(deciles)-1)*cdfgdd, histo=T, width=deciles, cex=cex, yaxs=yaxs, xlab = xlab, ylab = ylab, xaxt="n", yaxt="n", ...) } if(add){ if(method != "histogram" & method != "binning"){graph <- F} if(cdfplot){ lines(x = r, y = cdfgr, lty=lty) }else{if(method != "histogram" & bar != "only"){ lines(x = r, y = gpdf,lty=lty) } } }else{ if(graph & !cdfplot & bar!="only"){ plot(x = r, y = gpdf, type = type, lty=lty, cex=cex, yaxs=yaxs,xlab = xlab, ylab = ylab, xaxt="n", yaxt="n", ...) } if(graph & cdfplot){ plot(x = r, y = cdfgr, type = "l", lty=lty, ylim=c(0,1), xlim=c(0,1),cex=cex, yaxs=yaxs,xlab = xlab, ylab = ylab, xaxt="n", yaxt="n", ...) abline(h = (0:10)/10, lty = 1, lwd = 0.5, col=0.2) abline(v = (0:10)/10, lty = 1, lwd = 0.5, col=0.2) } } }else{ if(bar!="no"){ add <- T barplot(height=(length(deciles)-1)*cdfgdd,histo=T, width=deciles, ylim=ylim, cex=cex, yaxs=yaxs, xlab = xlab, ylab = ylab, xaxt="n", yaxt="n", ...) } if(add){ if(method != "histogram" & method != "binning"){graph <- F} if(cdfplot){ lines(x = r, y = cdfgr, lty=lty) }else{if(method != "histogram" & bar!="only"){ lines(x = r, y = gpdf,lty=lty) } } }else{ if(graph & !cdfplot & bar!="only"){ plot(x = r, y = gpdf, type = type, ylim=ylim,lty=lty, cex=cex, yaxs=yaxs,xlab = xlab, ylab = ylab, yaxt="n", xaxt="n", ...) } if(graph & cdfplot){ plot(x = r, y = cdfgr, type = "l", ylim=ylim, lty=lty, xlim=c(0,1), cex=cex, yaxs=yaxs,xlab = xlab, ylab = ylab, xaxt="n", yaxt="n", ...) abline(h = (0:10)/10, lty = 2, lwd = 0.5) abline(v = (0:10)/10, lty = 2, lwd = 0.5) } } } if(graph & !cdfplot){abline(h = 1, lty = 2)} if(graph & cdfplot){abline(a = 0, b = 1, lty = 2)} # if(graph){ if(graph & missing(ylabs)){ if(cdfplot){axis(2,cex=cex,at=(0:10)/10,mgp=c(1,ylabsloc,0))}else{ axis(2,cex=cex,mgp=c(3,ylabsloc,0))}} # if(cdfplot){axis(2,cex=cex,at=(0:10)/10,mgp=c(2,1,0))}else{ # axis(2,cex=cex,mgp=c(2,0.5,0))}} else{ if(cdfplot){axis(2,cex=cex,at=(0:10)/10,mgp=c(1,ylabsloc,0))}else{ axis(2,cex=cex,mgp=c(3,ylabsloc,0))} # if(cdfplot){axis(2,cex=cex,at=(0:10)/10,mgp=c(2,1,0))}else{ # axis(2,cex=cex,mgp=c(2,0.5,0))} if(length(ylabs) == 1 & ylabs[1] == T){ylabs<-pretty(y)} yxlabs <- rmdata(ylabs,y)$x # # next places it on the top inside (outside is tck0.02) # mgp c(3,2,0) is normal outside c(3,-2,0) is normal inside # # axis(side = 4, at = yxlabs, labels = paste(ylabs), tck=-0.02, # cex=cex, mgp=c(2,0.5,0)) # cex=cex, mgp=c(1,.6,0)) axis(side = 4, at = yxlabs, labels = paste(ylabs), tck=-0.02, cex=cex, mgp=c(1,ylabsloc,0)) } # if(missing(yolabs)){ if(cdfplot){axis(1,cex=cex,at=(0:10)/10)}else{ axis(1,cex=cex,mgp=c(3,yolabsloc,0))}} # if(cdfplot){axis(1,cex=cex,at=(0:10)/10,mgp=c(2,0.5,0))}else{ # axis(1,cex=cex,mgp=c(2,0.5,0))}} else{ if(cdfplot){axis(1,cex=cex,at=(0:10)/10)}else{ axis(1,cex=cex,mgp=c(3,yolabsloc,0))} # if(cdfplot){axis(1,cex=cex,at=(0:10)/10,mgp=c(2,0.5,0))}else{ # axis(1,cex=cex,mgp=c(2,0.5,0))} if(length(yolabs) == 1 & yolabs[1] == T){yolabs<-pretty(yo)} if(missing(yo)){ yoxlabs <- yolabs }else{ yoxlabs <- rmdata(yolabs,yo)$x } # # next places it on the top outside # if(missing(yolabslabs)){yolabslabs <- paste(yolabs)} axis(side = 3, at = yoxlabs, labels = yolabslabs, tck=-0.02, cex=cex, mgp=c(3,yolabsloc,0)) # axis(side = 3, at = yoxlabs, labels = yolabslabs, tck=-0.02, # cex=cex, mgp=c(2,0.5,0)) } } gpdf.ci <- NULL if(ci){ if(n > 0){ if(method=="binning"){ varg <- gpdfse^2 + (1/(2*sqrt(pi)))*gpdf*gpdf/(n*smooth) }else{ varg <- (1/(2*sqrt(pi)))*(1/n+gpdf/m)*gpdf } }else{ varg <- gpdfse^2 } gpdf.ci <- list(x=r, l=gpdf-1.96*sqrt(varg), u=gpdf+1.96*sqrt(varg)) if(graph & !cdfplot & bar!="only"){ points(x=gpdf.ci$x,y=gpdf.ci$l,pch=".",cex=0.3) points(x=gpdf.ci$x,y=gpdf.ci$u,pch=".",cex=0.3) } } cdf.out <- list(x=seq(along=x)/m,y=cdfg) # # Calculate polarizations # if(!missing(yo)){ if(!(match=="location" & how=="mult") & rpmult & !missing(yo)){ x <- rmdata(y=y, yo=yo, ywgt=ywgt,yowgt=yowgt, match="location", how="mult", location=location, scale=scale)$x }else{ if(how=="covariate" | (match!="location" & !rpmult)){ x <- rmdata(y=y, yo=yo, ywgt=ywgt,yowgt=yowgt, match="location", how="add", location=location, scale=scale)$x } } } w <- abs(x - 0.5) selu <- x > 0.5 sell <- x <= 0.5 if(!really.missing(ywgt)){ rpm <- 4 * sum(w*ywgt)/sum(ywgt) - 1 c1 <- wtd.var(w,weight=ywgt) rpu <- 4 * sum(w*ywgt*selu)/sum(ywgt*selu) - 1 c1u <- wtd.var(w,weight=ywgt*selu) rpl <- 4 * sum(w*ywgt*sell)/sum(ywgt*sell) - 1 c1l <- wtd.var(w,weight=ywgt*sell) }else{ rpm <- 4 * mean(w) - 1 c1 <- var(w) rpu <- 4 * sum(w*selu)/sum(selu) - 1 c1u <- var(w[selu]) rpl <- 4 * sum(w*sell)/sum(sell) - 1 c1l <- var(w[sell]) } if(really.missing(yo)){ serp <- 4*sqrt(c1/m) serpu <- 8*sqrt(c1u/m) serpl <- 8*sqrt(c1l/m) }else{ if(match=="location" & how=="mult" | rpmult){ x <- rmdata(y=yo, yo=y, ywgt=yowgt,yowgt=ywgt, match="location", how="mult", location=location, scale=scale)$x }else{ x <- rmdata(y=yo, yo=y, ywgt=yowgt,yowgt=ywgt, match="location", how="add", location=location, scale=scale)$x } w <- abs(x - 0.5) selu <- x > 0.5 sell <- x <= 0.5 if(really.missing(yowgt)){ c2 <- var(w) c2u <- var(w[selu]) c2l <- var(w[sell]) }else{ c2 <- wtd.var(w,weight=yowgt) c2u <- wtd.var(w,weight=yowgt*selu) c2l <- wtd.var(w,weight=yowgt*sell) } serp <- 4*sqrt(c1/m + c2/n) serpu <- 8*sqrt(c1u/m + c2u/n) serpl <- 8*sqrt(c1l/m + c2l/n) } rpm <- rpm+1.96*serp *c(-1, 0, 1) rpu <- rpu+1.96*serpu*c(-1, 0, 1) rpl <- rpl+1.96*serpl*c(-1, 0, 1) # # Output # list(x=r,y=gpdf,ci=gpdf.ci, rp = rpm, rpl= rpl, rpu= rpu, cdf=cdf.out, deciles=cdfgdd, entropy=entropy, smooth=smooth,aicc=aicc ) } # # median polarization index # "rpy" <- function(y=F, yo=F, ywgt=F,yowgt=F, pvalue=F, z=F, zo=F, match="location", how="add", location="median", scale="IQR", rpmult=F ){ # # pvalue produce two-sided p-value for the polarization index # # See "reldist" for a description of the other variables # # # OUTPUT list components # 95% confidence interval for the median relative polarization # as lower bound, estimate, upper bound. # # ------------------------------------------------------------------------- # if(really.missing(yo)){ rdata <- list(x=y, m=length(x), ywgt=rep(1,length(x))) }else{ if(how=="covariate" & !really.missing(z) & !really.missing(zo)){ rdata <- rmdata(y=y, yo=yo, ywgt=ywgt,yowgt=yowgt, match=match, how=how, z=z, zo=zo, location=location, scale=scale) y <- rdata$y yo <- rdata$yo ywgt <- rdata$ywgt yowgt <- rdata$yowgt if(!rpmult){ how <- "add" }else{ how <- "mult" } } rdata <- rmdata(y=y, yo=yo, ywgt=ywgt,yowgt=yowgt, match=match, how=how, location=location, scale=scale) y <- rdata$y yo <- rdata$yo yowgt <- rdata$yowgt n <- rdata$n } x <- rdata$x m <- rdata$m ywgt <- rdata$ywgt # # Calculate polarizations # w <- abs(x - 0.5) rpm <- 4 * sum(w*ywgt)/sum(ywgt) - 1 c1 <- wtd.var(w,weight=ywgt) x <- rmdata(y=yo, yo=y, ywgt=yowgt,yowgt=ywgt, match=match, how=how, location=location, scale=scale)$x w <- abs(x - 0.5) c2 <- wtd.var(w,weight=yowgt) serp <- 4*sqrt(c1/m + c2/n) if(pvalue){ c(rpm+1.96*serp *c(-1, 0, 1), 1-pnorm(abs(rpm/serp))) }else{ rpm+1.96*serp *c(-1, 0, 1) } } # # lower and upper polarization indices # "rpluy" <- function(y=F, yo=F, ywgt=F,yowgt=F, pvalue=F,upper=F,lower=T, match="location", how="add", location="median", scale="IQR", rpmult=F, z=F, zo=F ){ # # pvalue produce two-sided p-value for the polarization index # upper create upper polarization index # lower create lower polarization index # # See "reldist" for a description of the other variables # # # OUTPUT list components # 95% confidence interval for the median relative polarization # as lower bound, estimate, upper bound. # # ------------------------------------------------------------------------- # if(upper){lower<-F} # if(really.missing(yo)){ rdata <- list(x=y, m=length(x), ywgt=rep(1,length(x))) }else{ if(how=="covariate" & !really.missing(z) & !really.missing(zo)){ rdata <- rmdata(y=y, yo=yo, ywgt=ywgt,yowgt=yowgt, match=match, how=how, z=z, zo=zo, location=location, scale=scale) y <- rdata$y yo <- rdata$yo ywgt <- rdata$ywgt yowgt <- rdata$yowgt if(!rpmult){ how <- "add" }else{ how <- "mult" } } rdata <- rmdata(y=y, yo=yo, ywgt=ywgt,yowgt=yowgt, match=match, how=how, z=z, zo=zo, location=location, scale=scale) y <- rdata$y yo <- rdata$yo yowgt <- rdata$yowgt n <- rdata$n } x <- rdata$x m <- rdata$m ywgt <- rdata$ywgt # w <- abs(x - 0.5) if(upper){ select <- x > 0.5 }else{ select <- x <= 0.5 } rpu <- 4 * sum(w*ywgt*select)/sum(ywgt*select) - 1 c1u <- wtd.var(w,weight=ywgt*select) x <- rmdata(y=yo, yo=y, ywgt=yowgt,yowgt=ywgt, match=match, how=how, location=location, scale=scale)$x w <- abs(x - 0.5) if(upper){ select <- x > 0.5 }else{ select <- x <= 0.5 } c2u <- wtd.var(w,weight=yowgt*select) serp <- 8*sqrt(c1u/m + c2u/n) if(pvalue){ c(rpu+1.96*serp*c(-1, 0, 1), 1-pnorm(abs(rpu/serp))) }else{ rpu+1.96*serp*c(-1, 0, 1) } } "wtd.var" <- function(x, weight=rep(1,length(x))) { sum(x * x * weight)/sum(weight) - (sum(x * weight)/sum(weight))^2 } "wtd.mean" <- function(x, weight=rep(1,length(x))) { (sum(x * weight)/sum(weight)) } "wtd.median"<- function(x, na.rm = F, weight=F) { sl.simp <- function(x, partial) { .Internal(sort.list(x, partial), "S_sort_list", T, 0) } if(mode(x) != "numeric") stop("need numeric data") x <- as.vector(x) wnas <- which.na(x) if(length(wnas)) { if(na.rm) x <- x[ - wnas] if(!really.missing(weight)){weight <- weight[ - wnas]} else return(NA) } n <- length(x) half <- (n + 1)/2 if(n %% 2 == 1) { if(!really.missing(weight)){ weight <- weight/sum(weight) sx <- sort.list(x) sweight <- cumsum(weight[sx]) min(x[sx][sweight >= 0.5]) }else{ x[sl.simp(x, half)[half]] } } else { if(!really.missing(weight)){ weight <- weight/sum(weight) sx <- sort.list(x) sweight <- cumsum(weight[sx]) min(x[sx][sweight >= 0.5]) }else{ half <- floor(half) + 0:1 sum(x[sl.simp(x, half)[half]])/2 } } } "rmdata" <- function(y, yo, ywgt=F,yowgt=F, z=F, zo=F, match="none", how="add", location="median", scale="IQR", robust=F ){ # # First remove NAs # if(!really.missing(ywgt)){ nas <- is.na(y) | is.na(ywgt) ywgt <- ywgt[!nas]/sum(ywgt[!nas]) }else{ nas <- is.na(y) } y <- y[!nas] if(!really.missing(z)){z <- z[!nas]} if(!really.missing(yowgt)){ nas <- is.na(yo) | is.na(yowgt) yowgt <- yowgt[!nas]/sum(yowgt[!nas]) }else{ nas <- is.na(yo) } yo <- yo[!nas] if(!really.missing(zo)){zo <- zo[!nas]} # n <- length(yo) m <- length(y) # # Do matching # # First covariate # if(how=="covariate" & !really.missing(z) & !really.missing(zo)){ if(really.missing(yowgt)){yowgt<-rep(1/n,length=n)} if(really.missing( ywgt)){ ywgt<-rep(1/m,length=m)} if(match=="location"){ ywgt <- yowgt*rdsamp(z,zo,ywgt,yowgt) ywgt <- ywgt/sum(ywgt) y <- yo m <- n }else{ yowgt <- yowgt*rdsamp(z,zo,ywgt,yowgt) yowgt <- yowgt/sum(yowgt) } } # # Now multiplicative # if(how=="mult"){ if(match=="location"){ if(location=="median"){ if(!really.missing(yowgt)){ yo <- wtd.median(y,weight=ywgt)*yo/wtd.median(yo,weight=yowgt) }else{ yo <- median(y)*yo/median(yo) } }else{ if(!really.missing(yowgt)){ yo <- wtd.mean(y,weight=ywgt)*yo/wtd.mean(yo,weight=yowgt) }else{ yo <- mean(y)*yo/mean(yo) } } } if(match=="shape"){ if(location=="median"){ if(!really.missing(ywgt)){ y <- wtd.median(y,weight=ywgt)*yo/wtd.median(yo,weight=yowgt) ywgt <- yowgt m <- n }else{ y <- median(y)*yo/median(yo) m <- n } }else{ if(!really.missing(ywgt)){ y <- wtd.mean(y,weight=ywgt)*yo/wtd.mean(yo,weight=yowgt) ywgt <- yowgt m <- n }else{ y <- mean(y)*yo/mean(yo) m <- n } } } } # # Now additive # if(how=="add"){ if(match=="location"){ if(location=="median"){ if(!really.missing(yowgt)){ yo <- wtd.median(y,weight=ywgt)+yo-wtd.median(yo,weight=yowgt) }else{ yo <- median(y)+(yo-median(yo)) } }else{ if(!really.missing(yowgt)){ yo <- wtd.mean(y,weight=ywgt)+yo-wtd.mean(yo,weight=yowgt) }else{ yo <- mean(y)+(yo-mean(yo)) } } } if(match=="shape"){ if(location=="median"){ if(!really.missing(ywgt)){ y <- wtd.median(y,weight=ywgt)+yo-wtd.median(yo,weight=yowgt) ywgt <- yowgt m <- n }else{ y <- median(y)+(yo-median(yo)) m <- n } }else{ if(!really.missing(ywgt)){ y <- wtd.mean(y,weight=ywgt)+yo-wtd.mean(yo,weight=yowgt) ywgt <- yowgt m <- n }else{ y <- mean(y)+(yo-mean(yo)) m <- n } } } } # # Now location and scale # if(how=="ls"){ if(match=="location"){ if(location=="median" & scale=="IQR"){ if(!really.missing(yowgt)){ yo <- wtd.median(y,weight=ywgt)+wtd.iqr(y,weight=ywgt)* (yo-wtd.median(yo,weight=yowgt))/wtd.iqr(yo,weight=yowgt) }else{ yo <- median(y)+iqr(y)*(yo-median(yo))/iqr(yo) } }else{ if(!really.missing(yowgt)){ yo <- wtd.mean(y,weight=ywgt)+sqrt(wtd.var(y,weight=ywgt))* (yo-wtd.mean(yo,weight=yowgt))/sqrt(wtd.var(yo,weight=yowgt)) }else{ yo <- mean(y)+sqrt(var(y))*(yo-mean(yo))/sqrt(var(yo)) } } } if(match=="shape"){ if(location=="median"){ if(!really.missing(ywgt)){ y <- wtd.median(y,weight=ywgt)+wtd.iqr(y,weight=ywgt)* (yo-wtd.median(yo,weight=yowgt))/wtd.iqr(yo,weight=yowgt) ywgt <- yowgt m <- n }else{ y <- median(y)+iqr(y)*(yo-median(yo))/iqr(yo) m <- n } }else{ if(!really.missing(ywgt)){ y <- wtd.mean(y,weight=ywgt)+sqrt(wtd.var(y,weight=ywgt))* (yo-wtd.mean(yo,weight=yowgt))/sqrt(wtd.var(yo,weight=yowgt)) ywgt <- yowgt m <- n }else{ y <- mean(y)+sqrt(var(y))*(yo-mean(yo))/sqrt(var(yo)) m <- n } } } } # # sy is 1:n # ys is the ordered y's # ry is the ranks of ys in the joint vector of {yo, y} # ry - sy is the number of yo's le ys # sy <- seq(along = y) ys <- sort.list(y) if(!really.missing(ywgt)){ywgt <- ywgt[ys]} ys <- y[ys] # ry <- sort.list(sort.list(c(yo, ys)))[sy + n] ry <- sort.list(sort.list(c(yo, ys))) if(robust){ # y <- y*(1+sqrt(var(y))*(runif(m)-0.5)/100000)} jv <- c(yo, ys) njv <- !is.na(jv) for(i in unique(jv)[duplicated(jv)]) { which <- jv == i & njv ry[which] <- mean(ry[which]) } } ry <- ry[sy + n] if(really.missing(yowgt)){ x <- (ry - sy)/n }else{ yos <- sort.list(yo) yowgts <- yowgt[yos] # # x is the new relative data, but note it has weight ywgt # x <- c(0,cumsum(yowgts))[ry - sy + 1] } x[x > 1] <- 1 x[x < 0] <- 0 if(really.missing(ywgt)){ywgt <- rep(1,length=m)/m} # # Use x[order(order(y))] # to get the relative data back in original order # # returned values are in the original order # rorder <- order(order(y)) list(x=x[rorder],wgt=ywgt[rorder],y=y,yo=yo,ywgt=ywgt[rorder],yowgt=yowgt,n=n,m=m) } "rddata" <- function(y, yo, ywgt=F,yowgt=F, pool=1, match="none", how="add", location="median", scale="IQR", z=F, zo=F, robust=F ){ # # First remove NAs # if(!really.missing(ywgt)){ nas <- is.na(y) | is.na(ywgt) ywgt <- ywgt[!nas]/sum(ywgt[!nas]) }else{ nas <- is.na(y) } y <- y[!nas] if(!really.missing(yowgt)){ nas <- is.na(yo) | is.na(yowgt) yowgt <- yowgt[!nas]/sum(yowgt[!nas]) }else{ nas <- is.na(yo) } yo <- yo[!nas] # n <- length(yo) m <- length(y) # # Do matching # # First covariate # if(how=="covariate" & !really.missing(z) & !really.missing(zo)){ if(really.missing(yowgt)){yowgt<-rep(1/n,length=n)} if(really.missing( ywgt)){ ywgt<-rep(1/m,length=m)} if(match=="location"){ ywgt <- yowgt*rdsamp(z,zo,ywgt,yowgt) y <- yo m <- n } if(match=="shape"){ yowgt <- yowgt*rdsamp(z,zo,ywgt,yowgt) } } # # Now multiplicative # if(how=="mult"){ if(match=="location"){ if(location=="median"){ if(!really.missing(yowgt)){ yo <- wtd.median(y,weight=ywgt)*yo/wtd.median(yo,weight=yowgt) }else{ yo <- median(y)*yo/median(yo) } }else{ if(!really.missing(yowgt)){ yo <- wtd.mean(y,weight=ywgt)*yo/wtd.mean(yo,weight=yowgt) }else{ yo <- mean(y)*yo/mean(yo) } } } if(match=="shape"){ if(location=="median"){ if(!really.missing(ywgt)){ y <- wtd.median(y,weight=ywgt)*yo/wtd.median(yo,weight=yowgt) ywgt <- yowgt m <- n }else{ y <- median(y)*yo/median(yo) m <- n } }else{ if(!really.missing(ywgt)){ y <- wtd.mean(y,weight=ywgt)*yo/wtd.mean(yo,weight=yowgt) ywgt <- yowgt m <- n }else{ y <- mean(y)*yo/mean(yo) m <- n } } } } # # Now additive # if(how=="add"){ if(match=="location"){ if(location=="median"){ if(!really.missing(yowgt)){ yo <- wtd.median(y,weight=ywgt)+yo-wtd.median(yo,weight=yowgt) }else{ yo <- median(y)+(yo-median(yo)) } }else{ if(!really.missing(yowgt)){ yo <- wtd.mean(y,weight=ywgt)+yo-wtd.mean(yo,weight=yowgt) }else{ yo <- mean(y)+(yo-mean(yo)) } } } if(match=="shape"){ if(location=="median"){ if(!really.missing(ywgt)){ y <- wtd.median(y,weight=ywgt)+yo-wtd.median(yo,weight=yowgt) ywgt <- yowgt m <- n }else{ y <- median(y)+(yo-median(yo)) m <- n } }else{ if(!really.missing(ywgt)){ y <- wtd.mean(y,weight=ywgt)+yo-wtd.mean(yo,weight=yowgt) ywgt <- yowgt m <- n }else{ y <- mean(y)+(yo-mean(yo)) m <- n } } } } # # Now location and scale # if(how=="ls"){ if(match=="location"){ if(location=="median" & scale=="IQR"){ if(!really.missing(yowgt)){ yo <- wtd.median(y,weight=ywgt)+wtd.iqr(y,weight=ywgt)* (yo-wtd.median(yo,weight=yowgt))/wtd.iqr(yo,weight=yowgt) }else{ yo <- median(y)+iqr(y)*(yo-median(yo))/iqr(yo) } }else{ if(!really.missing(yowgt)){ yo <- wtd.mean(y,weight=ywgt)+sqrt(wtd.var(y,weight=ywgt))* (yo-wtd.mean(yo,weight=yowgt))/sqrt(wtd.var(yo,weight=yowgt)) }else{ yo <- mean(y)+sqrt(var(y))*(yo-mean(yo))/sqrt(var(yo)) } } } if(match=="shape"){ if(location=="median"){ if(!really.missing(ywgt)){ y <- wtd.median(y,weight=ywgt)+wtd.iqr(y,weight=ywgt)* (yo-wtd.median(yo,weight=yowgt))/wtd.iqr(yo,weight=yowgt) ywgt <- yowgt m <- n }else{ y <- median(y)+iqr(y)*(yo-median(yo))/iqr(yo) m <- n } }else{ if(!really.missing(ywgt)){ y <- wtd.mean(y,weight=ywgt)+sqrt(wtd.var(y,weight=ywgt))* (yo-wtd.mean(yo,weight=yowgt))/sqrt(wtd.var(yo,weight=yowgt)) ywgt <- yowgt m <- n }else{ y <- mean(y)+sqrt(var(y))*(yo-mean(yo))/sqrt(var(yo)) m <- n } } } } # sy <- sort(yo) sy <- c(min(y,yo),(sy[-1] - 0.5*diff(sy)),max(y,yo)) sy <- unique(sy) # po <- table(cut(yo,breaks=sy,include.lowest = T)) # if(really.missing(yowgt)){ po <- table(cut(yo,breaks=sy,include.lowest = T)) }else{ yowgt <- yowgt/mean(yowgt) xxx <- cut(yo,breaks=sy,include.lowest = T) po <- rep(0,length=length(sy)-1) for (i in unique(xxx)){ po[i] <- sum(yowgt[xxx==i],na.rm=T) } } # # remove zero # eee <- seq(along=po)[po < 1] if(eee[length(eee)] == length(po)){eee[length(eee)] <- eee[length(eee)]-1} # eee <- eee[eee < length(po)] # sy <- sy[-(eee+1)] if(really.missing(yowgt)){ po <- table(cut(yo,breaks=sy,include.lowest = T)) # po <- po/n }else{ yowgt <- yowgt/mean(yowgt) xxx <- cut(yo,breaks=sy,include.lowest = T) po <- rep(0,length=length(sy)-1) for (i in unique(xxx)){ po[i] <- sum(yowgt[xxx==i],na.rm=T) } # po <- po/mean(po) } if(pool>=5){ # # remove small cells # ggg <- seq(5,pool,length=4) # # Pool successively # for ( px in ggg ){ eee <- seq(along=po)[po <= px] # if(length(eee)>0 & length(sy) > 2){ if(eee[length(eee)] == length(po)){ eee[length(eee)] <- eee[length(eee)]-1} sy <- sy[-(eee+1)] } if(really.missing(yowgt)){ po <- table(cut(yo,breaks=sy,include.lowest = T)) }else{ yowgt <- yowgt/mean(yowgt) xxx <- cut(yo,breaks=sy,include.lowest = T) po <- rep(0,length=length(sy)-1) for (i in unique(xxx)){ po[i] <- sum(yowgt[xxx==i],na.rm=T) } # po <- po/mean(po) } } } po <- po / n # if(really.missing(yowgt)){ pix <- table(cut(y,breaks=sy,include.lowest = T)) }else{ ywgt <- ywgt/mean(ywgt) xxx <- cut(y,breaks=sy,include.lowest = T) pix <- rep(0,length=length(sy)-1) for (i in unique(xxx)){ pix[i] <- sum(ywgt[xxx==i],na.rm=T) } } # pix <- as.vector(pix)+0.5 pix <- pix/sum(pix) xx <- cumsum(po) yy <- pix/po nx <- length(xx) # xx[xx > 1] <- 1 xx[xx < 0] <- 0 if(really.missing(ywgt)){ywgt <- rep(1,length=m)/m} # list(x=xx,gx=yy,nx=nx,pix=pix,po=po,y=y,yo=yo,ywgt=ywgt,yowgt=yowgt, breaks=sy,n=n,m=m) } rddist <- function(y, yo, pool=1, add=F, ylim=NULL,ywgt=F,yowgt=F, match="none", how="add", location="median", scale="IQR", rpmult=F, z=F, zo=F, ci=F,graph=T, cex=0.8,yaxs="r", xlab="baseline proportion", ylab="density", yolabslabs=NULL, ylabslabs=NULL, yolabs=pretty(yo), ylabs=pretty(y)) { # options(warn=-1) if(missing(yo)){ x <- sort(y) m <- length(x) n <- 0 wgt <- rep(1/m,length=m) rmd <- list(x=x,wgt=wgt,n=n,m=m) }else{ rmd <- rddata(y=y, yo=yo, ywgt=ywgt,yowgt=yowgt, match=match, how=how, location=location, scale=scale) gx <- rmd$gx nx <- rmd$nx pix <- rmd$pix po <- rmd$po ywgt <- rmd$ywgt yowgt <- rmd$yowgt x <- rmd$x n <- rmd$n m <- rmd$m } # x1 <- rep(c(0,x),rep(2,nx+1))[-1] y1 <- rep(gx,c(rep(2,nx-1),3)) y2 <- rep(gx,c(rep(2,nx-1),4))[-1] x2 <- c(rep(x,rep(2,nx)),1) # # Calculate the entropy of the relative distribution # dx <- diff(c(0,x)) egv <- gx[gx > 0] edx <- dx[gx > 0] entropy <- sum(egv*log(egv)*edx) # chisq <- sum((egv-1)^2*edx) # if(graph){ if(!add){ if(missing(ylim)){ plot(x = x1, y = y1, type = "n", xaxt="n", yaxs=yaxs, xlab = xlab, ylab = ylab) }else{ plot(x = x1, y = y1, type = "n", ylim=ylim,xaxt="n", yaxs=yaxs, xlab = xlab, ylab = ylab) } if(missing(ylabs)){ axis(2)} else{ axis(2) if(length(ylabs) == 1 & ylabs[1] == T){ylabs<-sort(unique(y))} yxlabs <- rmdata(ylabs,y)$x yxlabs <- yxlabs - 0.5*diff(c(0,yxlabs)) # # next places it on the top outside # if(missing(ylabslabs)){ylabslabs <- paste(ylabs)} axis(side = 4, at = yxlabs, labels = ylabslabs, tck=0.02, cex=cex) } if(missing(yolabs)){ axis(1)} else{ axis(1) if(length(yolabs) == 1 & yolabs[1] == T){yolabs<-sort(unique(yo))} yoxlabs <- rmdata(yolabs,yo)$x # # next places it on the top outside # if(missing(yolabslabs)){yolabslabs <- paste(yolabs)} axis(side = 3, at = yoxlabs, labels = yolabslabs, tck=0.02, cex=cex) } add <- 1 } segments(x1,y1,x2,y2,lty=add) abline(h = 1, lty = 2) } # # Calculate polarization # Q <- length(gx) se <- (n+m+1)/(3*n*m) cpo <- cumsum(po) # hlf <- match(T,cpo >= 0.5) -1 g11 <- c(po[1:hlf], 0.5 - cpo[hlf]) g13 <- gx[1:(hlf + 1)] g14 <- c(0, cpo[1:hlf]) rpl <- 2 * sum(g13 * g11 * (1 - 2 * g14 - g11)) / sum(g11*g13) - 1 # g12 <- c(cpo[hlf + 1] - 0.5, po[(hlf + 2):Q]) g23 <- gx[(hlf + 1):Q] g24 <- c(0.5, cpo[(hlf + 1):(Q - 1)]) rpu <- -2 * sum(g23 * g12 * (1 - 2 * g24 - g12)) / sum(g12*g23) - 1 # rp <- 0.5 * (rpl + rpu) # # CI # gx.ci <- NULL if(ci){ vargx <- gx*gx*((1-pix)/(m*pix) + (1-po)/(n*po)) gx.ci <- list(l=gx-1.96*sqrt(vargx), u=gx+1.96*sqrt(vargx)) if(graph){ segments(c(0,x),gx.ci$l,c(x,1),gx.ci$l,lty=2) segments(c(0,x),gx.ci$u,c(x,1),gx.ci$u,lty=2) } } list(x=x,y=gx, rp=c(rp-1.96*sqrt(se),rp,rp+1.96*sqrt(se)), ci=gx.ci,rpl=rpl,rpu=rpu,breaks=rmd$breaks,chisq=chisq,entropy=entropy) } "entropy" <- function(x,xo=F){ # # Calculate the entropy of the relative distribution # if(missing(xo)){ egv <- x$y[x$y > 0] dgv <- diff(c(0,x$x)) entropy <- sum(egv*log(egv)*dgv) }else{ xout <- seq(0,1,length=1000) x <- approx(x=x$x,y=x$y,xout=xout,rule=2) xo <- approx(x=xo$x,y=xo$y,xout=xout,rule=2) egv <- x$y[x$y > 0] egvo <- xo$y[x$y > 0] dgvo <- diff(c(0,xo$x)) entropy <- sum(egvo*log(egv)*dgvo) } entropy } really.missing <- function(x) { if(!missing(x)){ (is.logical(x) & x[1]==F) }else{ T } } # # a function to change the weights to represent the probs # rdsamp <- function(targetvalues, samplevalues, targetvalueswgt=F, samplevalueswgt=F, target=samplevalues, pool=1){ # # targetvalues is the sample from distribution of the covariate # that we wish to match to # # samplevalues is the sample from distribution of the covariate # that we start with # n <- length(samplevalues) m <- length(targetvalues) # sy <- sort(samplevalues) sy <- c(min(targetvalues,samplevalues), (sy[-1] - 0.5*diff(sy)), max(targetvalues,samplevalues)) sy <- unique(sy) # spb <- table(cut(samplevalues,breaks=sy,include.lowest = T)) # if(really.missing(samplevalueswgt)){ spb <- table(cut(samplevalues,breaks=sy,include.lowest = T)) }else{ samplevalueswgt <- samplevalueswgt/mean(samplevalueswgt) xxx <- cut(samplevalues,breaks=sy,include.lowest = T) spb <- rep(0,length=length(sy)-1) for (i in unique(xxx)){ spb[i] <- sum(samplevalueswgt[xxx==i],na.rm=T) } } # # remove zero # eee <- seq(along=spb)[spb < 1] if(length(eee) > 0){ if(eee[length(eee)] == length(spb)){ eee[length(eee)] <- eee[length(eee)]-1} # eee <- eee[eee < length(spb)] # sy <- sy[-(eee+1)] if(really.missing(samplevalueswgt)){ spb <- table(cut(samplevalues,breaks=sy,include.lowest = T)) }else{ samplevalueswgt <- samplevalueswgt/mean(samplevalueswgt) xxx <- cut(samplevalues,breaks=sy,include.lowest = T) spb <- rep(0,length=length(sy)-1) for (i in unique(xxx)){ spb[i] <- sum(samplevalueswgt[xxx==i],na.rm=T) } } } # # remove small cells # if(pool > 4){ ggg <- seq(5,pool,length=4) # # Pool successively # for ( px in ggg ){ eee <- seq(along=spb)[spb <= px] # if(length(eee)>0){ if(eee[length(eee)] == length(spb)){ eee[length(eee)] <- eee[length(eee)]-1} sy <- sy[-(eee+1)] } if(really.missing(samplevalueswgt)){ spb <- table(cut(samplevalues,breaks=sy,include.lowest = T)) }else{ samplevalueswgt <- samplevalueswgt/mean(sampleprobswgt) xxx <- cut(samplevalues,breaks=sy,include.lowest = T) spb <- rep(0,length=length(sy)-1) for (i in unique(xxx)){ spb[i] <- sum(samplevalueswgt[xxx==i],na.rm=T) } } }} # spb <- spb / n # if(really.missing(samplevalueswgt)){ tpb <- table(cut(targetvalues,breaks=sy,include.lowest = T)) }else{ targetvalueswgt <- targetvalueswgt/mean(targetvalueswgt) xxx <- cut(targetvalues,breaks=sy,include.lowest = T) tpb <- rep(0,length=length(sy)-1) for (i in unique(xxx)){ tpb[i] <- sum(targetvalueswgt[xxx==i],na.rm=T) } } tpb <- as.vector(tpb)+0.5 tpb <- tpb/sum(tpb) xx <- cumsum(spb) yy <- tpb/spb nx <- length(xx) x1 <- rep(c(0,xx),rep(2,nx+1))[-1] y1 <- rep(yy,c(rep(2,nx-1),3)) y2 <- rep(yy,c(rep(2,nx-1),4))[-1] x2 <- c(rep(xx,rep(2,nx)),1) # spb <- as.vector(spb) # relprobs <- tpb/spb # # cbind(sy, spb, tpb, relprobs) # # sy are the category cutpoints # spb are the proportions of the sample values in each category. # tpb are the proportions of the target values in each category. # relprobs is then the density ratio of the targetvalues # to the samplevalues, with values defined by the categories # # relprobs[match(samplevalues, sort(unique(targetvalues)))]/n # # relprobs[cut(samplevalues,breaks=sy,include.lowest = T)] # relprobs[cut(target,breaks=sy,include.lowest = T)]/length(target) # # This returns the probabilities for each target value in samplevalues. # with the probabilities of the targetvalues. # } rdeciles <- function(y, yo, ywgt=F,yowgt=F, reference=yo,refwgt=yowgt, binn=10, match="none", how="add", location="median", scale="IQR", z=F, zo=F ){ # # # INPUT variables # # required variables # y sample from comparison distribution # yo sample from reference distribution # # options # ywgt weights on the comparison sample # yowgt weights on the comparison sample # binn number of categories (default 10) # reference sample from an alternative # reference distribution # The reference cutpoints are taken # from this instead of yo # refwgt weights from an alternative # reference distribution # # OUTPUT list components # deciles list for the CDF # x relative ratio of the deciles # refcuts decile cutpoints from the reference # yx proportion of comparison within # each reference cutpoints # (If reference=yo this is close to x/binn). # This might deviate due to ties in the # reference or comparison distributions. # yox proportion of yo within # each reference cutpoints # (If reference=yo this should be # close to 1/binn). # This might deviate due to ties in the # reference or comparison distributions. # # See reldist for other arguments # rmd <- rmdata(y=y, yo=yo, ywgt=ywgt,yowgt=yowgt, match=match, how=how, location=location, scale=scale, z=z, zo=zo ) y <- rmd$y yo <- rmd$yo ywgt <- rmd$ywgt yowgt <- rmd$yowgt x <- rmd$x n <- rmd$n m <- rmd$m # if(missing(reference)){reference <- yo} if(really.missing(refwgt)){refwgt <- yowgt} sy <- seq(along = reference) ys <- sort.list(reference) if(!really.missing(refwgt)){refwgt <- refwgt[ys]} reference <- reference[ys] # if(really.missing(refwgt)){ cdfref <- sy/m }else{ refwgt <- refwgt/sum(refwgt) cdfref <- cumsum(refwgt) } refcuts <- seq(0, 1, length = binn + 1)[c(-1,-(binn+1))] refcuts <- c(min(y,yo,reference)-0.01, approx(x=cdfref,y=reference,xout=refcuts)$y, max(y,yo,reference)+0.01) # if(really.missing(ywgt)){ yx <- table(cut(y, breaks = refcuts, include.lowest=T))/m }else{ ywgt <- ywgt/sum(ywgt) xxx <- cut(y, breaks = refcuts,include.lowest=T) yx <- rep(0,length=binn) for (i in unique(xxx)){ yx[i] <- sum(ywgt[xxx==i],na.rm=T) } } # if(really.missing(yowgt)){ yox <- table(cut(yo, breaks = refcuts, include.lowest=T))/n }else{ yowgt <- yowgt/sum(yowgt) xxx <- cut(yo, breaks = refcuts,include.lowest=T) yox <- rep(0,length=binn) for (i in unique(xxx)){ yox[i] <- sum(yowgt[xxx==i],na.rm=T) } } ratio <- yx/yox # # Output # list(x=as.numeric(ratio),refcuts=refcuts,yx=yx,yox=as.numeric(yox)) } "relasp"<- function(y,yo,binn=100,boundary=0.05){ y <- sort(y) m <- length(y) y <- c(min(y)-y[1:(boundary*m)],y,2*max(y)-y[((1-boundary)*m):m]) m <- length(y) yo <- sort(yo) n <- length(yo) yo <- c(min(yo)-yo[1:(0.1*n)],yo,2*max(yo)-yo[(0.9*n):n]) n <- length(yo) write(binn, "cao.in", append = F) write(n+m, "cao.in", append = T) rdata <- cbind(0,c(y,yo),1,rep(c(1,-1),c(n,m))) write(t(rdata), "cao.in", append = T, ncol = 4) # # Call converted Pascal program # aaa <- unix("cao < cao.in > cao.out",output=F) out <- matrix(as.numeric(scan("cao.out")), ncol = 2,byrow=T) # out[is.na(out[, 2]), 2] <- 0.1 out[is.na(out[, 2]), 2] <- mean(as.numeric(out[!is.na(out[, 2]), 2])) dimnames(out) <- list(NULL,c("x","h(x)")) out } print(cat("Installation of reldist complete\n"))