Hadley 在 useR! 2013 上做了一个很精彩的演讲(http://www.edii.uclm.es/~useR-2013/invited/useR2013_wickham.pdf),介绍了他的新包 dplyr(https://github.com/hadley/dplyr)和 ggvis(https://github.com/rstudio/ggvis)。不过,关于这些内容,他本人或许能讲得更详细。
我不确定您的应用程序是如何设计的,但我经常在把数据读入 R 之前先在数据库内做预处理。例如,如果您要绘制时间序列,实际上没有必要在 X 轴上显示一天中的每一秒。相反,您可能希望先做聚合,例如按一分钟或五分钟的时间间隔取最小值/最大值/平均值。
下面是我几年前编写的一个函数示例,它在 SQL 中执行类似的操作。此特定示例使用模运算符,因为时间存储为纪元毫秒。但是,如果 SQL 中的数据正确存储为日期/日期时间结构,则 SQL 具有一些更优雅的本机方法来按时间段进行聚合。
#' Query min/max/average of a column aggregated over fixed time intervals
#'
#' Builds a SQL statement that buckets rows into intervals of the requested
#' size (integer division of the epoch-millisecond column `CMilliseconds_g`),
#' runs it through `getquery()`, and converts the bucket index back into a
#' timestamp for plotting.
#'
#' NOTE(review): `table`, `group` and `column` are pasted into the SQL text
#' verbatim (identifiers cannot be bound as parameters). Only pass trusted
#' values; never forward raw user input.
#'
#' @param table name of the table
#' @param start start time/date (anything `as.POSIXct()` accepts)
#' @param end end time/date (anything `as.POSIXct()` accepts)
#' @param aggregate one of "days", "hours", "mins" or "weeks"
#' @param group grouping variable
#' @param column name of the target column (y axis)
#' @return The result of `getquery()` (columns `grouping`, `yavg`, `ymax`,
#'   `ymin`, `timediv`) with `grouping` converted to a factor and an added
#'   `time` column: POSIXct for "mins"/"hours", Date for "days"/"weeks".
#' @export
minmaxdata <- function(table, start, end, aggregate = c("days", "hours", "mins", "weeks"), group = 1, column) {
  # query window as epoch milliseconds
  start <- round(as.numeric(as.POSIXct(start)) * 1000)
  end <- round(as.numeric(as.POSIXct(end)) * 1000)

  # must aggregate
  aggregate <- match.arg(aggregate)

  # bucket width in milliseconds
  mod <- switch(aggregate,
    "mins" = 1000 * 60,
    "hours" = 1000 * 60 * 60,
    "days" = 1000 * 60 * 60 * 24,
    "weeks" = 1000 * 60 * 60 * 24 * 7,
    stop("invalid aggregate value")
  )

  # Shift timestamps by (24h - difference between GMT and local wall clock)
  # so that bucket boundaries fall on local clock boundaries.
  # The difftime units are requested explicitly: with automatic units a small
  # difference (e.g. a GMT session) is reported in seconds, not hours. The
  # result is rounded to whole minutes to drop the sub-second noise caused by
  # format() truncating fractional seconds.
  now <- Sys.time()
  gmt_wall_clock <- as.POSIXct(format(now, format = "%Y-%m-%d %H:%M:%S", tz = "GMT"))
  offset_mins <- round(as.numeric(difftime(gmt_wall_clock, now, units = "mins")))
  delta <- 1000 * 60 * (24 * 60 - offset_mins)

  # render numbers as plain integers (never scientific notation) for SQL
  as_sql_int <- function(x) sprintf("%.0f", x)

  # form query
  query <- paste(
    "SELECT", group, "AS grouping, AVG(", column, ") AS yavg, MAX(", column,
    ") AS ymax, MIN(", column, ") AS ymin, ((CMilliseconds_g +",
    as_sql_int(delta), ") DIV", as_sql_int(mod), ") AS timediv FROM", table,
    "WHERE CMilliseconds_g BETWEEN", as_sql_int(start), "AND", as_sql_int(end),
    "GROUP BY", group, ", timediv;"
  )
  mydata <- getquery(query)

  # bucket index -> start of bucket in epoch seconds (undo the delta shift)
  bucket_secs <- mod * mydata[["timediv"]] / 1000 - delta / 1000
  mydata$time <- structure(bucket_secs, class = c("POSIXct", "POSIXt"))
  mydata$grouping <- as.factor(mydata$grouping)

  # round timestamps
  if (aggregate %in% c("mins", "hours")) {
    # round() on POSIXct yields POSIXlt; convert back so the data frame
    # column stays POSIXct
    mydata$time <- as.POSIXct(round(mydata$time, aggregate))
  } else {
    # round to the nearest local day first, then take that local calendar
    # date; as.Date() on POSIXct would use UTC and give the previous day in
    # time zones east of GMT
    mydata$time <- as.Date(round(mydata$time, "days"))
  }

  mydata
}