# some handy functions for R and S-Plus # Hugo QuenŽ, h.quene@uu.nl, www.hugoquene.nl # 2006.11.10 first collected # 2006.12.12 function bonferroni.adjusted added # 2007.05.30 call with source(file=url(...)) added # 2007.11.19 function nPVI added # 2008.09.30 function same added # 2009.03.02 function inside added # 2009.08.24 some functions are only defined if !exists(function) # this applies to functions sd, se, logit, antilogit # # Steps to use these functions: # EASY METHOD # R> source(file=url("http://www.let.uu.nl/~Hugo.Quene/personal/tools/hqfunctions.r")) # # HARD METHOD # 1. Save this file to a place on your computer, e.g. to # E:\\mypath\\hqfunctions.r # 2. From within R, execute commands given in the saved file, # by calling # R> source(file="e:\\mypath\\hqfunctions.r") # This reads commands from the indicated file instead # of from the standard input (keyboard). # # Now the functions are defined and may be used, e.g. # R> y.trim <- trim(y,.05) # Good luck! trim <- function(x,p) { # first NAs are removed from x; # then fraction p is removed from x at BOTH upper AND lower end of range of x # NOTE: as a side effect, argument x is sorted ! x <- x[!is.na(x)]; x<-sort(x); n<-length(x); m<-round(n*p); return(x[(m+1):(n-m)]) } howmany.na <- function(x) { # returns the number of missing observations in x return( length(which.na(x)) ) } summary.long <- function(x, trim = 0.1, geo = F, ci=.95, na.rm = T) { # Sherry LaMonica, insightful.com # contributed to S-Plus discussion list, item #45, # http://spluscom.insightful.com/discussions/discussion3.cfm?topic_id=10&dept_id=6&discussionitemID=35&parentitemid=0#45 # se added 20021206 HQ # CI added 20060210 HQ dn <- list(c("Sample Size", "# Missing Values", "Non-missing Sample Size", "Mean", "Median", paste(trunc(trim * 100), "% Trimmed Mean", sep = ""), "Skew", "Kurtosis", "Min", "Max", "Range", "1st Quartile", "3rd Quartile", "IQR", "Standard Deviation", "Standard Error", paste( as.character(ci*100),"% Conf.Int. Lower Bound", sep=""), paste( as.character(ci*100),"% Conf.Int. Upper Bound", sep=""), "Median Absolute Deviation", "CV"), "Statistics") if(geo) { dn[[1]] <- append(dn[[1]], "Geometric Mean", after = 6) } if(length(x) == 0) { sum.vec <- rep(NA, ifelse(geo, 18+2, 17+2)) warning("x is an empty object") } else { s <- summary(x) n.miss <- sum(is.na(x)) n <- length(x) n.w.no.miss <- n - n.miss if(na.rm) x <- x[!is.na(x)] if(!na.rm & n.miss != 0) v <- NA else v <- var(x) v.mle <- ((n.w.no.miss - 1)/n.w.no.miss) * v sd <- sqrt(v) se <- sd / sqrt(n.w.no.miss) # two-sided testing crit <- qt( 1-((1-ci)/2), (n.w.no.miss-1) ) lowerbound <- s[4]-crit*se upperbound <- s[4]+crit*se skew <- mean((x - s[4])^3)/(v.mle^1.5) kurtosis <- (mean((x - s[4])^4)/(v.mle^2)) - 3 sum.vec <- c(n, n.miss, n.w.no.miss, s[4], s[3], mean(x, trim = +trim, na.rm = na.rm), skew, kurtosis, s[1], s[6], diff(range(x, na.rm = na.rm)), s[2], s[5], s[5] - s[2], sd, se, lowerbound, upperbound, mad(x), sd/s[4]) if(geo) { if(any(x < 0)) stop("Negative values in vector; cannot compute +geometric mean") sum.vec <- append(sum.vec, geo.mean(x, na.rm = na.rm), after = 6) } } mat <- matrix(sum.vec, ncol = 1) dimnames(mat) <- dn return(mat) } ## Define a substitute function if necessary: if(!exists("sd", mode="function")) sd <- function(y,...) { # compute and return standard deviation return ( sqrt(var(y,...)) ) } valid.n <- function(x) { return( length(x)-length(which(is.na(x))) ) } ## Define a substitute function if necessary: if(!exists("se", mode="function")) se <- function(y,...) { # compute and return standard error (of the mean) return ( sd(y,...) / sqrt(valid.n(y)) ) } ebplot <- function (x,y,conf=.95,pch=15,col="darkgrey",ww=.10,...) { # add average point and error bar to current plot # Hugo Quené, hugo.quene@let.uu.nl, November 2006 # 20070618 added ... placeholder # # x is x coordinate of point and bar # y is whole vector, compute average and error bar in function # conf is width of confidence interval # col is color for plot symbol (pch=15) at average # ww is width (expressed in units of x dimension) of whiskers on error bar # ... is placeholder for other (graphical) parameters and arguments, e.g. bg="COLOR" # remove extremes and NAs in y y <- trim(y,0.005) # determine critical t value, RE conf and df if (length(y)<=1) { crit <- 0 } else { crit <- qt( 1-((1-conf)/2), df=(length(y)-1) ) } # bar could also be defined as 2*sd(y), etc. # here, function se (see above) is called to compute standard error of y bar <- crit*se(y,na.rm=T) # error bar is in default color (usually black) lines(x=rep(x,2), y=c(mean(y)-bar,mean(y)+bar), lwd=1.5 ) lines(x=c(x-ww,x+ww), y=rep(mean(y)-bar,2), lwd=1.5 ) # lower whisker lines(x=c(x-ww,x+ww), y=rep(mean(y)+bar,2), lwd=1.5 ) # upper whisker # plot point after error bar, so that it overlaps the bar, 20070618 # plot point is enlarged, in specified color points(x, mean(y), pch=pch, cex=2, col=col, ... ) } ## Define a substitute function if necessary: if(!exists("logit", mode="function")) logit <- function(p) { # compute and return logit of p; # if p=.5 then logit==0 else sign(logit)==(p>.5) return( log(p/(1-p)) ) } ## Define a substitute function if necessary: if(!exists("antilogit", mode="function")) antilogit <- function(x) { # compute and return antilogit of x; # this returns a proportion p for which logit(p)==x; return( exp(x)/(1+exp(x)) ) } resample <- function(x, size, ...) { # safe resample, with replacement, for bootstrap # after example in help(sample) if(length(x) <= 1) { if(!missing(size) && size == 0) x[FALSE] else x } else sample(x, size, replace=T, ...) } bonferroni.adjusted <- function( alpha=.05, k=3 ) { # apply bonferroni adjustement to alpha level, # see http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=1112991 # alpha is nominal alpha level, joint probability of Type I error over all comparisons # k is number of posthoc comparisons return( 1-(1-alpha)^(1/k) ) } ci <- function( m, b, c=qnorm(1-.05/2) ) { # report confidence interval, using critical Z score # m is mean of data # b is variability measure of data, such as standard error of mean # c is critical value, default z=1.96 for 95%CI, or enter exact t value cat("interval (",m-c*b,m+c*b,")\n") } # some aliases that I find easier to remember. pwd <- getwd() chdir <- function(...) { getwd(...) } nPVI <- function ( dur=c(50,100,50,75,125) ) { # calculate nPVI, normalized Pairwise Variability Index, # see Grabe & Low 2002, Proc LabPhon 7 # HQ 20071119 x <- diff(dur) ub <- length(raw) # upper bound of difference for (k in 1:ub) x[k] <- abs ( x[k]*2 / (dur[k]+dur[k+1]) ) return( (100/ub)*sum(x) ) } same <- function( x, k=3 ) { # test whether k consecutive elements (triplets, quartets, etc) of x are identical; # the return value is an integer vector of the same length as x. # if all k values are NA, then the return value is FALSE not TRUE. # the return values are integers with meaning 0=false, 1=true. # HQ 20080929 # initialize if (length(x) %% k > 0) # if remainder after division > 0 warning( paste("length of data is not a multiple of",k) ) maxi <- length(x) %/% k # integer divide result <- as.integer( rep(NA,length(x)) ) # loop for (i in 1:maxi) { values <- unique( x[ ((i-1)*k+1) : ((i-1)*k+k)] ) # unique values of k consecutive elements curr <- (length(values)==1) && ((NA %in% values)==FALSE) # test identity, NA excluded result[ ((i-1)*k+1) : ((i-1)*k+k) ] <- as.integer(curr) } return(result) } inside <- function ( ll=0,x,ul=1 ) # HQ 20090901 defaults added { # test whether x lies inside interval spanning from ll to ul (inclusive) # HQ 20090228 return( ll<=x & x<=ul ) }