# # This is version 1.0.1 of the reldistaux functions. # by Mark S. Handcock # # February 16, 2001 # # IMPORTANT: This file requires the current version # of "reldist.s" to reconstruct the figures # in the book. # # See the README file for details. # # This file contains S-PLUS functions used to construct figures in # # "Relative Distribution Methods in the Social Sciences" # by Mark S. Handcock and Martina Morris, # Springer-Verlag, 1999, ISBN 0387987789 # # We make no claims that the code given is as efficient as # it could be, and we would be interested to learn # of more efficient approaches. # # The commands for each figure should be considered in isolation from # those for any other figure (so, for example, a setting of graphical # parameters using par() does not carry over to the next figure). # # All of the figures in the book were written to Postscript files using # the postscript() driver. Other graphics drivers might yield different # results. # # Other software is available from the Relative Distribution Website: # # http://www.stat.washington.edu/~handcock/RelDist # # The website contains related software, descriptions of the variables # and data file formats. # # First read in the RelDist specific software (reldist.s) # and then the book specific functions. # source("reldist.s") # # Now make book specific functions # gini <- function(x){ sx <- sort(x) n <- length(sx) i <- seq(along = sx) ci <- -2*(1-2*(i-1)/(n-1)) 0.5*mean(ci*sx)/mean(sx) } rdcdf <- function(y, yo, pool=1, add=F, ylim=NULL,ywgt=F,yowgt=F, ci=F,graph=T, matchmult=F,matchlocation=F, cex=0.8,yaxs="r", matchshape=F, lty=1, xlab="Reference proportion", ylab="Density", yolabslabs=NULL, ylabslabs=NULL, yolabs=pretty(yo), ylabs=pretty(y)) { n <- length(yo) m <- length(y) if(matchmult){ if(!really.missing(ywgt)){ y <- wtd.median(yo,weight=yowgt)*y/wtd.median(y,weight=ywgt) }else{ y <- median(yo)*y/median(y) } } # if(matchlocation){ if(!really.missing(ywgt)){ y <- wtd.median(yo,weight=yowgt)+y-wtd.median(y,weight=ywgt) }else{ y <- median(yo)+(y-median(y)) } } if(matchshape){ if(!really.missing(yowgt)){ yo <- wtd.median(yo,weight=yowgt)+y-wtd.median(y,weight=ywgt) yowgt <- ywgt n <- m }else{ yo <- median(yo)+(y-median(y)) n <- m } } # sy <- sort(yo) sy <- c(min(y,yo),(sy[-1] - 0.5*diff(sy)),max(y,yo)) sy <- unique(sy) # aaa <- table(cut(yo,breaks=sy,include.lowest = T)) # if(really.missing(yowgt)){ aaa <- table(cut(yo,breaks=sy,include.lowest = T)) }else{ yowgt <- yowgt/mean(yowgt) xxx <- cut(yo,breaks=sy,include.lowest = T) aaa <- rep(0,length=length(sy)-1) for (i in unique(xxx)){ aaa[i] <- sum(yowgt[xxx==i],na.rm=T) } } # # remove zero # eee <- seq(along=aaa)[aaa < 1] if(eee[length(eee)] == length(aaa)){ eee[length(eee)] <- eee[length(eee)]-1 } # sy <- sy[-(eee+1)] if(really.missing(yowgt)){ aaa <- table(cut(yo,breaks=sy,include.lowest = T)) # aaa <- aaa/n }else{ yowgt <- yowgt/mean(yowgt) xxx <- cut(yo,breaks=sy,include.lowest = T) aaa <- rep(0,length=length(sy)-1) for (i in unique(xxx)){ aaa[i] <- sum(yowgt[xxx==i],na.rm=T) } # aaa <- aaa/mean(aaa) } if(pool>=5){ # # remove small cells # ggg <- seq(5,pool,length=4) # # Pool successively # for ( px in ggg ){ eee <- seq(along=aaa)[aaa <= px] # if(length(eee)>0 & length(sy) > 2){ if(eee[length(eee)] == length(aaa)){ eee[length(eee)] <- eee[length(eee)]-1} sy <- sy[-(eee+1)] } if(really.missing(yowgt)){ aaa <- table(cut(yo,breaks=sy,include.lowest = T)) }else{ yowgt <- yowgt/mean(yowgt) xxx <- cut(yo,breaks=sy,include.lowest = T) aaa <- rep(0,length=length(sy)-1) for (i in unique(xxx)){ aaa[i] <- sum(yowgt[xxx==i],na.rm=T) } } } } aaa <- aaa / n # if(really.missing(yowgt)){ bbb <- table(cut(y,breaks=sy,include.lowest = T)) }else{ ywgt <- ywgt/mean(ywgt) xxx <- cut(y,breaks=sy,include.lowest = T) bbb <- rep(0,length=length(sy)-1) for (i in unique(xxx)){ bbb[i] <- sum(ywgt[xxx==i],na.rm=T) } } bbb <- as.vector(bbb)+0.5 bbb <- bbb/sum(bbb) xx <- cumsum(aaa) nx <- length(xx) # x1 <- c(0,xx) y1 <- c(0,cumsum(bbb)) y2 <- c(cumsum(bbb),1) x2 <- c(xx,1) # if(graph){ if(!add){ plot(x = x1, y = y1, type = "n", lty=lty, ylim=c(0,1), xlim=c(0,1),cex=cex, yaxs=yaxs,xlab = xlab, ylab = ylab, xaxt="n", yaxt="n") abline(h = (0:10)/10, lty = 2) abline(v = (0:10)/10, lty = 2) abline(a = 0, b = 1, lty = 2) if(!missing(ylabs)){ axis(2,cex=cex,at=(0:10)/10,mgp=c(1,.6,0)) if(length(ylabs) == 1 & ylabs[1] == T){ylabs<-pretty(y)} yxlabs <- rmdata(ylabs,y)$x # # next places it on the top inside (outside is tck0.02) # mgp c(3,2,0) is normal outside c(3,-2,0) is normal inside # if(missing(ylabslabs)){ylabslabs <- paste(ylabs)} axis(side = 4, at = yxlabs, labels = ylabslabs, tck=-0.02, cex=cex,mgp=c(1,.6,0)) } axis(1,cex=cex,at=(0:10)/10) if(!missing(yolabs)){ if(length(yolabs) == 1 & yolabs[1] == T){yolabs<-pretty(yo)} if(missing(yo)){ yoxlabs <- yolabs }else{ yoxlabs <- rmdata(yolabs,yo)$x } # # next places it on the top outside # if(missing(yolabslabs)){yolabslabs <- paste(yolabs)} axis(side = 3, at = yoxlabs, labels = yolabslabs, tck=-0.02, cex=cex) } } add <- 1 } segments(x1,y1,x2,y2,lty=add) abline(h = 1, lty = 2) } # "lldensity" <- function(x,n=100,width=0.25, from=min(x)-0.5*width,to=max(x)+0.5*width, aicc=seq(0.1,3.0, by=0.05), weight=NULL ){ # # Use local-likelihood density estimation # and aicc to chose the bandwidth # based on a grid search # if(missing(weight)){weight <- rep(1,length=length(x))} # if(missing(width)){ aicc <- cbind(aicc,aicc) dimnames(aicc)[[2]] <- c("smooth","aicc") for(i in seq(along=aicc[,1])){ aicc[i,2] <- loclike.de(x, aicc[i,1], nbin = n, xrange=c(from,to), display = F, weight=weight)$aicc } aicc <- aicc[order(aicc[,2]),] smooth <- aicc[1,1] }else{ smooth <- width } # compdens <- loclike.de(x, smooth, nbin = n, xrange=c(from,to), display = F) list(x=compdens$grid, y=compdens$est, aicc=aicc,smooth=smooth) } "loclike.de"<- function(x, h, degree = 2, xrange = range(x, na.rm = T), nbin = 100, weight=NULL, display = T, ...) { # # Function to construct local likelihood density estimate based on # binning and categorical data smoothing. Inputs are data (x) and span # (h), and optionally the range of evaluation and the number of bins # in which the data are binned. By default a density # estimate is plotted. If "display=F" is set, output is a list of the # AICC criterion ($aicc), and density estimate ($est) with associated grid # points ($grid). Roughly speaking, the span is the proportion of the # estimation interval covered by the kernel. For discussion see Simonoff # (1998, International Statistical Review). # # This code written by Jeffery Siminoff # # To construct the AICC density estimate, minimize the AICC criterion over # the degrees of freedom using a grid search. # bwidth <- (xrange[2] - xrange[1])/nbin b <- c(0:nbin) * bwidth + xrange[1] y <- hist(x, breaks = b, plot = F)$counts if(missing(weight)){ y <- hist(x, breaks = b, plot = F)$counts }else{ weight <- weight/sum(weight) y <- cut(x, breaks = b,include.lowest=T) xx <- rep(0,length=nbin) for (i in unique(y)){ xx[i] <- sum(weight[y==i],na.rm=T) } y <- length(x)*as.vector(xx) } # cc <- cat.crit.loess(h, y, degree = degree) if(display) { plot(x, c(0, rep(max(cc$prob/bwidth), length(x) - 1)), ylab = "Density", type = "n", xlim = xrange, ...) lines(b[1:nbin] + bwidth/2, cc$prob/bwidth, type = "l") invisible(list(aicc = cc$aicc, est = cc$prob/bwidth, grid = b[1: nbin] + bwidth/2)) } if(!display) list(aicc = cc$aicc, est = cc$prob/bwidth, grid = b[1:nbin] + bwidth/2) } # "cat.crit.loess"<- function(h, y, degree = 2) { # # AICC criterion function for loess likelihood estimator. y contains observed # relative frequencies or counts, while h is the span (roughly speaking, # the proportion of bins covered by the kernel). See Simonoff # (1998, Int. Statist. Rev.) for discussion. # # This code written by Jeffery Siminoff # # The AICC loess likelihood estimator is determined by minimizing this # function over h. In my experience, a grid search is necessary to find the # optimal value (the function nlminb doesn't seem to work). The output list # consists of the associated AICC value ($aicc) and the estimated cell # probabilities ($prob). # k <- length(y) x <- c(1:k) g <- gam(y ~ lo(x, degree = degree, h), family = "poisson") df <- g$nl.df + 2 gfit <- predict(g, type = "response") if(df >= k - 2) list(aicc = Inf, prob = gfit/sum(gfit)) else list(aicc = log(g$deviance) + (1 + df/k)/(1 - df/k - 2/k), prob = gfit/sum(gfit)) } ############################################################# # These function for the log-spline estimate only ############################################################# nlsd.fit <- function(data, wgt, lbound, ubound, maxknots = 0, knots, nknots = 0, penalty = -1, silent = T, mind = -1, wttype = 1) { call <- match.call() u2 <- length(data) data <- data[!is.na(data)] nsample <- length(data) if(nsample < 10) stop("not enough data") if(u2 != nsample) print(paste("***", u2 - nsample, " NAs ignored in data")) data <- sort(data) if(missing(wgt)) wgt <- rep(1, nsample) # data can not be beyond the boundaries of the density if(!missing(lbound)) if(data[1] < lbound) stop("data below lbound") if(!missing(ubound)) if(data[nsample] > ubound) stop("data above ubound") mm <- range(data) if(!missing(lbound)) mm <- range(c(mm, lbound)) if(!missing(ubound)) mm <- range(c(mm, ubound)) # boundaries ilow <- (!missing(lbound)) * 1 iupp <- (!missing(ubound)) * 1 low <- 0 upp <- 0 if(ilow == 1) low <- lbound if(iupp == 1) upp <- ubound # get the maximal dimension intpars <- c(-100, rep(0, 9)) # if(!is.loaded(symbol.C("nlogcensor"))) dyn.load(paste(RDlib.loc[1],"/nlsd/nlsd.o",sep="")) # z <- .C("nlogcensor", z = as.integer(intpars)) maxp <- z$z[1] # organize knots kts <- vector(mode = "double", length = max(maxp)) if(maxknots > maxp - 5) warning(paste("maxknots reduced to", maxp)) nknots <- - nknots if(!missing(knots)) { nknots <- length(knots) knots <- sort(knots) if(!missing(lbound)) if(min(knots) < lbound) stop("data (knots) below lbound") if(!missing(ubound)) if(max(knots) > ubound) stop("data (knots) above ubound") if(nknots < 3) stop("need at least three starting knots") if(nknots > maxp - 5) stop(paste("at most", maxp - 5, "knots possible")) kts[1:nknots] <- knots } silent <- silent == F # group parameters intpars <- c(nsample, maxknots, nknots, silent, 1 - ilow, 1 - iupp, mind, wttype) dpars <- c(penalty, low, upp) data <- c(data, rep(0, maxp)) # do it z <- .C("nlogcensor", ip = as.integer(intpars), coef = as.double(data), as.double(wgt), dp = as.double(dpars), logl = as.double(rep(0, maxp)), ad = as.integer(rep(0, maxp)), kts = as.double(kts)) # error messages if(z$ip[1] != 0 && z$ip[1] < 100) { if(z$ip[1] == 17) stop("too many knots beyond data") if(z$ip[1] == 18) stop("too many knots before data") if(z$ip[1] == 39) stop("too much data close together") if(z$ip[1] == 40) stop("no model could be fitted") if(z$ip[1] == 2) stop("error while solving system") if(z$ip[1] == 8) stop("too much step-halving") if(z$ip[1] == 5) stop("too much step-halving") if(z$ip[1] == 7) stop("numerical problems, likely tail related. Try lbound/ubound" ) if(z$ip[1] == 1) stop("no convergence") i <- 0 if(missing(knots)) i <- 1 if(z$ip[1] == 3 && i == 1) stop("right tail extremely heavy, try running with ubound" ) if(z$ip[1] == 4 && i == 1) stop("left tail extremely heavy, try running with lbound" ) if(z$ip[1] == 6 && i == 1) stop("both tails extremely heavy, try running with lbound and ubound" ) if(z$ip[1] == 3 && i == 0) stop("right tail too heavy or not enough knots in right tail" ) if(z$ip[1] == 4 && i == 0) stop("left tail too heavy or not enough knots in left tail" ) if(z$ip[1] == 6 && i == 0) stop("both tails too heavy or not enough knots in both tail" ) } if(z$ip[1] > 100) { warning(" Not all models could be fitted") } # organize logl logl <- cbind(z$ad, z$logl) logl <- cbind(2 + (1:z$ip[3]), logl[1 + (1:z$ip[3]), ]) kk <- (1:length(logl[, 1])) kk <- kk[logl[, 2] == 0] if(length(kk) > 0) logl <- logl[ - kk, ] # bye bye list(call = call, nknots = z$ip[2], coef.pol = z$coef[1:2], coef.kts = z$coef[2 + (1:z$ip[2])], knots = z$kts[1:z$ip[2]], maxknots = z$ ip[3] + 2, penalty = z$dp[1], bound = c(ilow, low, iupp, upp), samples = nsample, logl = logl, range = mm, mind = z$ip[7], wttype = wttype) } dnlsd <- function(x, fit) { y <- fit$coef.pol[1] + x * fit$coef.pol[2] for(i in 1:length(fit$knots)) y <- y + fit$coef.kts[i] * ((abs(x - fit$knots[i]) + x - fit$ knots[i])/2)^3 y <- exp(y) if(fit$bound[1] > 0) y[x < fit$bound[2]] <- 0 if(fit$bound[3] > 0) y[x > fit$bound[4]] <- 0 y } pnlsd <- function(q, fit) { sq <- rank(q) q <- sort(q) z <- .C("rpqlsd", as.double(c(fit$coef.pol, fit$coef.kts)), as.double(fit$knots), as.double(fit$bound), as.integer(1), pp = as.double(q), as.integer(length(fit$knots)), as.integer(length(q))) zz <- z$pp[sq] if(fit$bound[1] > 0) zz[q < fit$bound[2]] <- 0 if(fit$bound[3] > 0) zz[q > fit$bound[4]] <- 1 zz } print(cat("Installation of reldistaux complete\n"))