假设您的数据如下所示
library(dplyr)
library(sparklyr)
# Local example data: two rows, each a comma-separated string of three numbers
df <- data.frame(
  text = c("1.0,2.0,3.0", "4.0,5.0,6.0")
)
# Upload the data to Spark as table "df", replacing any existing table of
# that name; `sc` must be an already-open spark_connection
sdf <- copy_to(dest = sc, df = df, name = "df", overwrite = TRUE)
并且您已经创建了一个 spark_connection（即上面代码中的 sc），
那么您可以执行以下操作：
# Number of comma-separated fields expected in each `text` value
n <- 3
# There is no function syntax for array access in Hive,
# so we have to build [] expressions as SQL strings:
#   "CAST(bits[0] AS double) AS x0", "CAST(bits[1] AS double) AS x1", ...
# CAST(... AS double) could be handled in sparklyr / dplyr with as.numeric
# seq_len(n) - 1 gives the zero-based indices 0, ..., n - 1 and, unlike
# 0:(n - 1), produces an empty sequence (not c(0, -1)) when n is 0.
# The result is deliberately kept as a list (one string per element).
exprs <- lapply(
  seq_len(n) - 1,
  function(i) paste0("CAST(bits[", i, "] AS double) AS x", i)
)
# Get the Spark DataFrame reference behind the tbl so that its methods
# can be called directly through invoke()
spark_df <- spark_dataframe(sdf)
# Split each comma-separated string into an array column named "bits"
bits_df <- invoke(spark_df, "selectExpr", list("split(text, ',') AS bits"))
# Pull the individual array elements out into their own columns,
# using the SQL expressions prepared in `exprs`
columns_df <- invoke(bits_df, "selectExpr", exprs)
# Register the result as a temporary view named "exploded_df"
# ("registerTempTable" in Spark 1.x)
invoke(columns_df, "createOrReplaceTempView", "exploded_df")
然后使用 dplyr::tbl 取回 sparklyr 对象：
# Look up the temporary view by name and return it as a dplyr table reference
dplyr::tbl(sc, "exploded_df")
Source: query [2 x 3]
Database: spark connection master=local[8] app=sparklyr local=TRUE
x0 x1 x2
<dbl> <dbl> <dbl>
1 1 2 3
2 4 5 6
在最新版本中，您还可以使用 sdf_separate_column：
# Turn `text` into an array column, then spread its elements into x0, x1, x2.
# NOTE(review): split() here is presumably translated to Spark SQL's split
# rather than base R's split() - confirm against the sparklyr docs.
# The printed output shows the new columns as <chr>, i.e. no numeric cast.
sdf %>%
  mutate(text = split(text, ",")) %>%
  sdf_separate_column(column = "text", into = paste0("x", 0:2))
# Source: table<sparklyr_tmp_87125f13b89> [?? x 4]
# Database: spark_connection
text x0 x1 x2
<list> <chr> <chr> <chr>
1 <list [3]> 1.0 2.0 3.0
2 <list [3]> 4.0 5.0 6.0