我们将使用 XML 包
library(XML)
并创建一个闭包,其中包含一个处理“SCHOOL”节点的函数,以及两个用于在完成后检索结果的辅助函数。 SCHOOL 函数在每个 SCHOOL 节点上调用。如果它找到曲棍球队,它会使用 /SCHOOL/NAME/text() 作为“密钥”,并使用 /SCHOOL/TEAMS/HOCKEY/text() 和 //STUDENT/text() (或 /SCHOOL/GRADES /STUDENT/text()) 作为值。每 100 所(默认情况下)拥有曲棍球队的学校会打印一条消息,以便显示一些进展情况。事后使用“get”函数来检索结果。
teams <- function(progress=1000) {
res <- new.env(parent=emptyenv()) # for results
it <- 0L # iterator -- nodes visited
list(SCHOOL=function(elt) {
## handle 'SCHOOL' nodes
if (getNodeSet(elt, "not(/SCHOOL/TEAMS/HOCKEY)"))
## early exit -- no hockey team
return(NULL)
it <<- it + 1L
if (it %% progress == 0L)
message(it)
school <- getNodeSet(elt, "string(/SCHOOL/NAME/text())") # 'key'
res[[school]] <-
list(team=getNodeSet(elt,
"normalize-space(/SCHOOL/TEAMS/HOCKEY/text())"),
students= xpathSApply(elt, "//STUDENT", xmlValue))
}, getres = function() {
## retrieve the 'res' environment when done
res
}, get=function() {
## retrieve 'res' environment as data.frame
school <- ls(res)
team <- unlist(eapply(res, "[[", "team"), use.names=FALSE)
student <- eapply(res, "[[", "students")
len <- sapply(student, length)
data.frame(school=rep(school, len), team=rep(team, len),
student=unlist(student, use.names=FALSE))
})
}
我们使用该函数作为
branches <- teams()
xmlEventParse("event.xml", handlers=NULL, branches=branches)
branches$get()