不久前我遇到过类似的问题。我不想写一个完整的 UDAF，
所以只是把 Brickhouse 的 collect（https://github.com/klout/brickhouse/blob/master/src/main/java/brickhouse/udf/collect/CollectUDAF.java）和我自己写的 UDF 组合起来使用。
假设你有如下数据：
id value
1 A
1 A
1 A
1 B
1 B
1 A
1 C
1 C
1 D
2 D
2 D
2 D
2 D
2 F
2 F
2 F
2 A
2 W
2 A
我的 UDF 如下：
package com.something;
import java.util.ArrayList;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
/**
 * Hive UDF that collapses runs of consecutive equal values in an array.
 *
 * <p>Only adjacent repeats are removed; a value that reappears later, after a
 * different value, is kept. For example {@code [A, A, B, A]} becomes
 * {@code [A, B, A]}.
 */
public class RemoveSequentialDuplicates extends UDF {

    /**
     * Returns a copy of {@code arr} with consecutive duplicate entries removed.
     *
     * @param arr the values to scan, in order; may be {@code null} or empty
     * @return {@code null} when {@code arr} is {@code null}, otherwise a new list
     *         holding the first element of every run of equal values
     */
    public ArrayList<Text> evaluate(ArrayList<Text> arr) {
        // A null array yields null rather than throwing.
        if (arr == null) {
            return null;
        }

        ArrayList<Text> newList = new ArrayList<Text>();
        // Guard the empty case: the unconditional get(0) below would throw.
        if (arr.isEmpty()) {
            return newList;
        }

        newList.add(arr.get(0));
        for (int i = 1; i < arr.size(); i++) {
            Text current = arr.get(i);
            Text previous = arr.get(i - 1);
            if (!sameValue(previous, current)) {
                newList.add(current);
            }
        }
        return newList;
    }

    /**
     * Null-safe comparison of two entries by their string form.
     * Two nulls count as equal; a null never equals a non-null value.
     */
    private static boolean sameValue(Text left, Text right) {
        if (left == null || right == null) {
            return left == right;
        }
        return left.toString().equals(right.toString());
    }
}
然后我的查询是
-- Load the Brickhouse jar and a second jar (presumably the one holding the custom UDF).
ADD JAR /path/to/jar/brickhouse-0.7.1.jar;
ADD JAR /path/to/other/jar/duplicates.jar;

-- Bind both classes to short function names.
CREATE TEMPORARY FUNCTION remove_seq_dups AS 'com.something.RemoveSequentialDuplicates';
CREATE TEMPORARY FUNCTION collect AS 'brickhouse.udf.collect.CollectUDAF';

-- The inner query gathers each id's values into one array; the outer query
-- passes that array through remove_seq_dups.
SELECT
    id,
    remove_seq_dups(value_array) AS no_dups
FROM (
    SELECT
        id,
        collect(value) AS value_array
    FROM db.table
    GROUP BY id
) grouped
输出：
1 ["A","B","A","C","D"]
2 ["D","F","A","W","A"]
顺便说一句，内置的 collect_list
不一定会按照元素被分组时的顺序保留列表中的元素；而 Brickhouse 的 collect
会保留。希望这对你有帮助。