如果您遇到 pivot(透视)操作的性能问题,下面的方法是同一问题的另一种解决方案:它使用 for 循环把作业按类别拆分成多个阶段,从而让您获得更多控制。每次迭代都会把 category_x 的新数据通过 join 合并到 acc_df 中,acc_df 用于保存累积结果。
# Return type of the UDF: an array of (p_date, d_warranty) string structs.
schema = ArrayType(
    StructType((
        StructField("p_date", StringType(), False),
        StructField("d_warranty", StringType(), False)
    ))
)
# NOTE: tuple_list is referenced here, so it has to be defined before this
# statement runs.
tuple_list_udf = udf(tuple_list, schema)

buf_size = 5  # if you get OOM error decrease this to persist more often
categories = df.select("category").distinct().collect()

# Empty DataFrame with df's schema; acc_df keeps this value only when
# `categories` is empty, otherwise the first iteration replaces it.
acc_df = spark.createDataFrame(sc.emptyRDD(), df.schema)

for idx, c in enumerate(categories):
    col_name = c[0].replace(" ", "_")  # spark complains for columns containing space

    # One row per product_id for this category, with a single column named
    # after the category holding the list of (purchase_date, days_warranty).
    cat_df = df.where(df["category"] == c[0]) \
        .groupBy("product_id") \
        .agg(
            F.collect_list(F.col("purchase_date")).alias("p_date"),
            F.collect_list(F.col("days_warranty")).alias("d_warranty")) \
        .withColumn(col_name, tuple_list_udf(F.col("p_date"), F.col("d_warranty"))) \
        .drop("p_date", "d_warranty")

    if idx == 0:
        acc_df = cat_df
    else:
        # NOTE(review): no join type is given, so this is Spark's default
        # (inner) join; a product_id missing from any category is dropped
        # from the result -- confirm this is intended.
        acc_df = acc_df \
            .join(cat_df.alias("cat_df"), "product_id") \
            .drop(F.col("cat_df.product_id"))

    # Persist every buf_size iterations. The parentheses are required:
    # `idx + 1 % buf_size` parses as `idx + (1 % buf_size)`, which made the
    # original condition fail on every iteration for any buf_size > 1.
    if (idx + 1) % buf_size == 0:
        acc_df = acc_df.persist()
函数 tuple_list 负责生成一个列表,其中每个元素是由 purchase_date 和 days_warranty 两列中对应位置的值组成的元组。
def tuple_list(pdl, dwl):
    """Pair up the elements of two sequences position by position.

    Args:
        pdl: First sequence; its items become the first tuple element.
        dwl: Second sequence; its items become the second tuple element.

    Returns:
        A list of ``(pdl[i], dwl[i])`` tuples. If the inputs differ in
        length, the result is truncated to the shorter one (``zip``
        semantics).
    """
    return list(zip(pdl, dwl))
其输出将是:
+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|product_id |CATEGORY_B |CATEGORY_A |
+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|02147465400|[[2017-04-16 00:00:00, 90], [2018-09-16 00:00:00, 90], [2017-10-09 00:00:00, 90], [2018-01-12 00:00:00, 90], [2018-07-11 00:00:00, 90], [2017-01-21 00:00:00, 90], [2018-04-14 00:00:00, 90], [2017-01-05 00:00:00, 90], [2017-07-15 00:00:00, 90]]|[[2017-06-14 00:00:00, 30], [2018-08-14 00:00:00, 30], [2018-01-11 00:00:00, 30], [2018-04-12 00:00:00, 30], [2017-10-11 00:00:00, 30], [2017-05-16 00:00:00, 30], [2018-05-15 00:00:00, 30], [2017-04-15 00:00:00, 30], [2017-02-15 00:00:00, 30], [2018-02-12 00:00:00, 30], [2017-01-21 00:00:00, 30], [2018-07-11 00:00:00, 30], [2018-06-14 00:00:00, 30], [2017-03-16 00:00:00, 30], [2017-07-20 00:00:00, 30], [2018-08-23 00:00:00, 30], [2017-09-12 00:00:00, 30], [2018-03-12 00:00:00, 30], [2017-12-12 00:00:00, 30], [2017-08-14 00:00:00, 30], [2017-11-11 00:00:00, 30]]|
+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+