我只是想弄清楚如何对两个 data.tables 进行条件连接。
我写了一个sqldf
条件连接给我提供其开始或结束时间在其他开始/结束时间之内的电路。
sqldf("select dt2.start, dt2.finish, dt2.counts, dt1.id, dt1.circuit
from dt2
left join dt1 on (
(dt2.start >= dt1.start and dt2.start < dt1.finish) or
(dt2.finish >= dt1.start and dt2.finish < dt1.finish)
)")
这给了我正确的结果,但对于我的大型数据集来说太慢了。
什么是data.table
如何在没有矢量扫描的情况下做到这一点?
这是我的数据:
dt1 <- data.table(structure(list(circuit = structure(c(2L, 1L, 2L, 1L, 2L, 3L,
1L, 1L, 2L), .Label = c("a", "b", "c"), class = "factor"), start = structure(c(1393621200,
1393627920, 1393628400, 1393631520, 1393650300, 1393646400, 1393656000,
1393668000, 1393666200), class = c("POSIXct", "POSIXt"), tzone = ""),
end = structure(c(1393626600, 1393631519, 1393639200, 1393632000,
1393660500, 1393673400, 1393667999, 1393671600, 1393677000
), class = c("POSIXct", "POSIXt"), tzone = ""), id = structure(1:9, .Label = c("1001",
"1002", "1003", "1004", "1005", "1006", "1007", "1008", "1009"
), class = "factor")), .Names = c("circuit", "start", "end",
"id"), class = "data.frame", row.names = c(NA, -9L)))
dt2 <- data.table(structure(list(start = structure(c(1393621200, 1393624800, 1393626600,
1393627919, 1393628399, 1393632000, 1393639200, 1393646399, 1393650299,
1393655999, 1393660500, 1393666199, 1393671600, 1393673400), class = c("POSIXct",
"POSIXt"), tzone = ""), end = structure(c(1393624799, 1393626600,
1393627919, 1393628399, 1393632000, 1393639200, 1393646399, 1393650299,
1393655999, 1393660500, 1393666199, 1393671600, 1393673400, 1393677000
), class = c("POSIXct", "POSIXt"), tzone = ""), seconds = c(3599L,
1800L, 1319L, 480L, 3601L, 7200L, 7199L, 3900L, 5700L, 4501L,
5699L, 5401L, 1800L, 3600L), counts = c(1L, 1L, 0L, 1L, 2L, 1L,
0L, 1L, 2L, 3L, 2L, 3L, 2L, 1L)), .Names = c("start", "end",
"seconds", "counts"), row.names = c(1L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 14L, 15L), class = "data.frame"))