您可以做两件事:1) 在每次迭代时将 data.frame 保存到 .RData 文件中。这样占用的内存更少,因为数据不再保存在 RAM 中;2) 使用并行计算。下面是一个例子:
library(parallel)
library(doParallel)
library(RSelenium)
fn_Par <- function(core_Id, all_Index, list_remDr, nb_Core) {
  # Worker function: scrape this worker's share of Pushshift comment-search
  # pages and save each parsed result to its own .RData file.
  #
  # Args:
  #   core_Id:    1-based position of this worker among the nb_Core workers;
  #               selects both the driver in list_remDr and the slice of
  #               all_Index handled here.
  #   all_Index:  numeric vector of offsets; for offset i the request URL is
  #               built with after=<i + 1>h and before=<i>h.
  #   list_remDr: list of RSelenium remote-driver clients, indexed by core_Id.
  #   nb_Core:    total number of workers sharing all_Index.
  #
  # Returns:
  #   NULL, invisibly. The results are the files "<i>_core_Id_<core_Id>.RData",
  #   each holding an object named r_i (a data.frame, or NA when the page
  #   could not be parsed even after one retry).

  # Packages are attached here because the function runs in a fresh worker
  # process that does not inherit the master's search path.
  library(jsonlite)
  library(RSelenium)

  remDr <- list_remDr[[core_Id]]
  remDr$open()
  # Release the browser session even if the loop below stops with an error.
  on.exit(try(remDr$close(), silent = TRUE), add = TRUE)

  # Result files are written relative to this directory.
  setwd("D:\\")

  part1 <- "https://api.pushshift.io/reddit/search/comment/?q=trump&after="
  part2 <- "h&before="
  part3 <- "h&size=500"

  # Assign every position of all_Index to exactly one worker in contiguous,
  # near-equal chunks. When the length divides evenly by nb_Core this yields
  # the same slices as floor(n / nb_Core) would, but it no longer drops the
  # remainder and yields an empty slice (not element 1) when there are
  # fewer indices than workers.
  nb_Index_All <- length(all_Index)
  owner <- ceiling(seq_len(nb_Index_All) * nb_Core / nb_Index_All)
  index_To_Extract <- all_Index[owner == core_Id]

  # Local helper (defined inside fn_Par so it is shipped to the worker with
  # it): load the URL and parse the JSON shown in the page's <pre> element.
  # A missing element or a parse failure yields NA; navigation errors still
  # propagate.
  fetch_Page <- function(url) {
    remDr$navigate(url)
    Sys.sleep(0.5)
    tryCatch(
      {
        web_Obj <- remDr$findElement("css selector", "body > pre")
        data.frame(fromJSON(web_Obj$getElementText()[[1]]))
      },
      error = function(e) NA
    )
  }

  for (i in index_To_Extract) {
    # format() keeps large offsets such as 100000 from being written in
    # scientific notation ("1e+05") inside the URL.
    url_i <- paste0(
      part1, format(i + 1, scientific = FALSE, trim = TRUE),
      part2, format(i, scientific = FALSE, trim = TRUE),
      part3
    )

    r_i <- fetch_Page(url_i)
    # A failed fetch is NA, which has no dim; wait and retry once in that
    # case only.
    if (is.null(dim(r_i))) {
      Sys.sleep(10)
      r_i <- fetch_Page(url_i)
    }

    # The object name r_i is what load() will restore from the file.
    save(r_i, file = paste0(i, "_core_Id_", core_Id, ".RData"))
    Sys.sleep(0.5)
  }

  invisible(NULL)
}
# Driver script: start one worker process and one Selenium/Chrome driver per
# CPU, run fn_Par on every worker, then shut everything down again.
nb_CPU <- 4
cluster <- parallel::makeCluster(nb_CPU)
list_remDr <- list()
list_rd <- list()

tryCatch(
  {
    # Only needed for foreach-style loops; parLapply() below receives the
    # cluster explicitly.
    doParallel::registerDoParallel(cl = cluster)

    # Draw the ports without replacement so that no two drivers can be
    # given the same port.
    ports <- 4444L + sample.int(2000L, nb_CPU)

    for (i in seq_len(nb_CPU)) {
      print(i)
      list_rd[[i]] <- rsDriver(
        chromever = "105.0.5195.52",
        browser = "chrome",
        port = ports[i]
      )
      list_remDr[[i]] <- list_rd[[i]]$client
    }

    parLapply(
      cluster,
      X = seq_len(nb_CPU),
      fun = fn_Par,
      all_Index = 1:2000,
      list_remDr = list_remDr,
      nb_Core = nb_CPU
    )
  },
  finally = {
    # Always release the browsers, the Selenium servers and the worker
    # processes, whether the scrape finished or failed part-way.
    for (rd in list_rd) {
      try(rd$client$close(), silent = TRUE)
      try(rd$server$stop(), silent = TRUE)
    }
    parallel::stopCluster(cluster)
  }
)