在 JavaScript 中有几种不相交集并集(DSU)的实现,这里this https://gist.github.com/KSoto/3300322fc2fb9b270dce2bf1e3d80cf3用于 BigQuery 中的用户定义函数 (UDF)。主要思想是使用查找和联合函数并将链接保存在树中,请表示为数组详情请参阅此处 https://en.wikipedia.org/wiki/Disjoint-set_data_structure.
create temp function DSU(A array<struct<a string,b string>>)
returns array<struct<a string,b string>>
language js as
"""
// https://gist.github.com/KSoto/3300322fc2fb9b270dce2bf1e3d80cf3
// Disjoint-set bigquery
class DSU {
constructor() {
this.parents = [];
}
find(x) {
if(typeof this.parents[x] != "undefined") {
if(this.parents[x]<0) {
return x;
} else {
if(this.parents[x]!=x) {
this.parents[x]=this.find(this.parents[x]);
}
return (this.parents[x]);
}
} else {
this.parents[x]=-1;
return x;
}
}
union(x,y) {
var xpar = this.find(x);
var ypar = this.find(y);
if(xpar != ypar) {
this.parents[xpar]+=this.parents[ypar];
this.parents[ypar]=xpar;
}
}
console_print() {
// console.log(this.parents);
}
}
var dsu = new DSU();
for(var i in A){
dsu.union(A[i].a,A[i].b);
}
var out=[]
for(var i in A){
out[i]={b:dsu.find(A[i].a),a:A[i].a};
}
return out;
""";
with #recursive
your_table as (
SELECT 1 as user, "cat food" as item
UNION ALL SELECT 1, "cat toy"
UNION ALL SELECT 2, "cat snacks"
UNION ALL SELECT 2, "cat toy"
UNION ALL SELECT 10, "dog food"
union all select 10, "dog collar"
union all select 11, "dog food"
union all select 11, "candy"
union all select 12, "candy"
union all select 12, "apples"
union all select 15, "paper"
), helper as (
select distinct a, b
from (
Select user,min(item) as b, array_agg(item) as a_list
from your_table
group by 1
), unnest(a_list) as a
)
Select * except(tmp_count),
first_value(item) over(partition by b order by tmp_count desc,b) as item_most_common
from
(
select * ,
count(item) over(partition by b,item) as tmp_count
from your_table
left join (select X.a, min(X.b) as b from (select DSU(array_agg(struct(''||a,''||b))) as X from helper),unnest(X) X group by 1 order by 1) as combinder
on ''||item=combinder.a
)
数据在表中your_table
. A helper
表用于构建任何用户带来的所有两个项目对。组合为一个数组,这将提供给 UDFDSU
。该函数返回列中的所有项目a
并在列中b
群组。我们希望组中最常见的项目显示为组名称,因此我们使用一些窗口函数来确定它。