###################################################################### # Data Examination Procedures # # Jay Emerson, Yale University, June 2005 # ###################################################################### ###################################################################### ###################################################################### ## Copyright (C) 2005 John W. Emerson ## ## This document is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2, or (at your option) ## any later version. ## ## This program is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## A copy of the GNU General Public License is available via WWW at ## http://www.gnu.org/copyleft/gpl.html. You can also obtain it by ## writing to the Free Software Foundation, Inc., 675 Mass Ave, ## Cambridge, MA 02139, USA. ## ## Bug reports to john.emerson@yale.edu ###################################################################### ###################################################################### ###################################################################### # Improvements by Walton Green, Yale University, include formatting # of long variable names, and the addition of line (variable) numbers. ###################################################################### # Each function simply takes a data frame and returns potentially # helpful information as you poke and prod a real data set for # the first time. Work in progress, no documentation yet. ###################################################################### whatis <- function(x, var.name.truncate = 20) { if (!is.data.frame(x)) { x <- data.frame(x) warning("Object coerced to a data frame.\n") } if ( is.na(length(dim(x))) | is.null(length(dim(x))) ) stop("You can not be serious!\n") sum.na <- function(y) sum(y, na.rm=TRUE) size <- dim(x) is.fac <- unlist(lapply(x, is.factor)) is.char <- unlist(lapply(lapply(x, as.vector), is.character)) is.fc <- is.fac | is.char sum.var.na <- apply(is.na(x), 2, sum) is.mixed <- rep(TRUE, size[2]) this.type <- rep("", size[2]) num.values <- rep(0, size[2]) summ <- matrix(NA,size[2],2) summ <- as.data.frame(summ) names(summ) <- c("min","max") precision <- rep(NA,size[2]) var.abbrev <- rep('',size[2]) uniquifier <- 1 for (i in 1:(size[2])) { if (nchar(names(x)[i]) > var.name.truncate) { var.abbrev[i] <- paste(substr(names(x)[i], 1, 15), '...', uniquifier, sep = '') uniquifier <- uniquifier + 1 } else var.abbrev[i] <- names(x)[i] num.values[i] <- length(table(x[,i])) z <- as.character(x[,i]) has.nonnumeric <- (regexpr("[^0-9\.-]", gsub(" ", "", z), perl=TRUE) > 0) has.numeric <- (regexpr("[0-9\.-]", gsub(" ", "", z), perl=TRUE) > 0) is.mixed[i] <- sum.na(has.nonnumeric)>0 & sum.na(has.numeric)>0 if (is.fac[i] & !is.mixed[i]) this.type[i] <- "pure factor" if (is.fac[i] & is.mixed[i]) this.type[i] <- "mixed factor" if (is.char[i] & !is.fac[i]) this.type[i] <- "character" if (!is.char[i] & !is.fac[i]) this.type[i] <- "numeric" xtemp <- x[!is.na(x[,i]),i] if (length(xtemp)==0) { summ[i,] <- rep(NA, 2) precision[i] <- NA } else { if (this.type[i] == "numeric") { z <- as.character(x[,i]) has.decimal <- as.vector(regexpr('.', z, fixed=TRUE)) has.decimal[is.na(has.decimal)] <- 0 has.decimal[has.decimal<0] <- 0 has.decimal[has.decimal>0] <- nchar(z[has.decimal>0]) - has.decimal[has.decimal>0] precision[i] <- 10^(-1*max(has.decimal[!is.na(has.decimal)])) } if (this.type[i]=="numeric") summ[i,] <- c(min(x[,i], na.rm=TRUE), max(x[,i], na.rm=TRUE)) if (this.type[i]!="numeric") summ[i,] <- sort(as.character(x[,i]))[c(1,length(xtemp))] } } summary.by.variable <- data.frame(variable.name=var.abbrev, type=this.type, missing=sum.var.na, distinct.values=num.values, precision=precision) summary.by.variable <- cbind(summary.by.variable, summ) row.names(summary.by.variable) <- 1:ncol(x) return(summary.by.variable) } tellmemore <- function(x, nsd=4) { if (!is.data.frame(x)) { x <- data.frame(x) warning("Object coerced to a data frame.\n") } sd.na <- function(y) sd(y, na.rm=TRUE) sum.na <- function(y) sum(y, na.rm=TRUE) mean.na <- function(y) mean(y, na.rm=TRUE) size <- dim(x) var.names <- names(x) is.fac <- unlist(lapply(x, is.factor)) is.char <- unlist(lapply(lapply(x, as.vector), is.character)) is.fc <- is.fac | is.char sum.var.na <- apply(is.na(x), 2, sum) sd.by.var <- 1*(!is.fc) sd.by.var[!is.fc] <- unlist(lapply(x[,!is.fc], sd.na)) sd.by.var[is.fc] <- NA outliers <- rep(NA, size[2]) is.mixed <- rep(TRUE, size[2]) this.type <- rep("", size[2]) num.values <- rep(0, size[2]) t.singles <- rep(NA, size[2]) x.out <- x for (i in 1:(size[2])) { num.values[i] <- length(table(x[,i])) if (is.fc[i]) t.singles[i] <- sum(table(x[,i])==1) z <- as.character(x[,i]) has.nonnumeric <- (regexpr("[^0-9\.-]", gsub(" ", "", z), perl=TRUE) > 0) has.numeric <- (regexpr("[0-9\.-]", gsub(" ", "", z), perl=TRUE) > 0) is.mixed[i] <- sum.na(has.nonnumeric)>0 & sum.na(has.numeric)>0 if (is.fac[i] & !is.mixed[i]) this.type[i] <- "pure factor" if (is.fac[i] & is.mixed[i]) this.type[i] <- "mixed factor" if (is.char[i] & !is.fac[i]) this.type[i] <- "character" if (!is.char[i] & !is.fac[i]) this.type[i] <- "numeric" if (this.type[i]=="numeric") { x.out[,i] <- abs( (x[,i] - mean.na(x[,i])) / sd.na(x[,i]) ) x.out[!is.na(x.out[,i]),i] <- x.out[!is.na(x.out[,i]),i] > nsd } } outliers[!is.fc] <- unlist(lapply(x.out[,!is.fc], sum.na)) summary.by.variable <- data.frame(type=this.type, missing=sum.var.na, distinct.values=num.values, outliers=outliers, singleton.levels=t.singles) return(summary.by.variable) }