Digest
你使用 itertools.combinations() 的思路是正确的。进一步的关键步骤:
- 对每一种可能的分组维度个数(从 1 到 n_dim-1)应用 itertools.combinations(),即对 range(1, 1+n_dim) 中的每个 i 调用 itertools.combinations(range(1, 1+n_dim), i)。
- 使用 df.groupby(by=column_combinations).sum(),自动得到各类别组合对应的汇总结果。
Code
该程序由 3 个逻辑部分组成。
- 从每个维度按类聚合。这部分基本上与您所做的相同,但是通过DFS方法重新设计以减少正在处理的数据总量。当需要处理数百万行时,这非常有用。后续步骤也是基于此中间数据集而不是原始数据集来计算的。
- 一个生成器,用于遍历上文摘要(Digest)第 1 点中提到的维度组合,无需预先显式列出全部组合。
- 执行上文摘要(Digest)第 2 点中提到的分组求和,并输出一个结果数据帧列表,这些数据帧在程序末尾被拼接成最终结果。
注意:在用于生产环境之前,请务必测试性能和内存占用。
import pandas as pd
import numpy as np
import itertools
# Sample input: one row per combination of dimension classes with its spend.
df = pd.DataFrame(
    data={
        "Dim1": ["A", "A", "B", "B"],
        "Dim2": ["X", "Y", "X", "Y"],
        "Dim3": ["Z", "Z", "Z", "Z"],
        "Spend": [100, 200, 300, 400],
    },
)

# Constants: the number of dimensions and the column names derived from it.
n_dim = 3
dim_cols = [f"Dim{k}" for k in range(1, n_dim + 1)]
cols = [*dim_cols, "Spend"]
# 1. compute sums with every dimension
def dfs(df, ls_out, dim_now=1, ls_classes=None):
    """Recursively sum ``Spend`` for every combination of dimension classes.

    Walks the columns ``Dim1`` .. ``Dim{n_dim}`` depth-first. At each level
    the frame is filtered down to one class of the current dimension, so the
    deeper levels only see the matching subset of rows.

    Args:
        df: Frame holding the ``Dim*`` columns and a ``Spend`` column.
        ls_out: List that receives one row ``[class_1, ..., class_n, total]``
            per class combination found in ``df``; it is extended in place.
        dim_now: 1-based index of the dimension handled by this call.
        ls_classes: Classes fixed on the path so far (used by the recursion).
            It is copied, never modified.

    Returns:
        None. Results are appended to ``ls_out``.
    """
    # Work on a private copy: a mutable default (or the caller's own list)
    # must not be appended to, otherwise state leaks between calls.
    path = [] if ls_classes is None else list(ls_classes)

    # Termination condition: every dimension has been traversed, so the
    # remaining rows are exactly one class combination -- aggregate them.
    if dim_now == n_dim + 1:
        total = df["Spend"].sum()
        ls_out.append(path + [total])
        return

    # Proceed with the current dimension.
    col = f"Dim{dim_now}"
    # Distinct classes, sorted in place for a deterministic output order.
    classes = df[col].unique()
    classes.sort()
    for c in classes:
        # Recurse into the next dimension with only the rows of class ``c``.
        dfs(
            df[df[col] == c],
            ls_out,
            dim_now=dim_now + 1,
            ls_classes=path + [c],
        )
# Run the depth-first aggregation over the full input.
ls_out = []  # the output container, filled in place by dfs()
dfs(df, ls_out)
# Convert to a dataframe. Every row is [Dim1 class, ..., Dim{n_dim} class,
# Spend total], i.e. it follows `cols`. Labelling with `cols` (rather than
# `df.columns`) stays correct when `df` has extra or reordered columns.
df_every_dim = pd.DataFrame(data=ls_out, columns=cols)
del ls_out  # the intermediate rows are no longer needed
print(df_every_dim)
# 2. generate combinations of groupby-dimensions
def multinomial_combinations(n_dim):
    """Yield every non-empty combination of the dimension indices 1..n_dim.

    Combinations are produced lazily as tuples, shortest first: all 1-tuples,
    then all 2-tuples, and so on up to the single tuple of length ``n_dim``.
    """
    dim_indices = range(1, n_dim + 1)
    for size in dim_indices:
        yield from itertools.combinations(dim_indices, size)
# Sanity check: print every combination generated for four dimensions.
print("Check multinomial_combinations(4):")
for dims in multinomial_combinations(4):
    print(dims)
# 3. Sum based on from df_every_dim
def aggr_by_dims(df, by_dims):
    """Sum ``df`` grouped by a subset of its dimensions.

    Args:
        df: Frame with the columns ``Dim1`` .. ``Dim{n_dim}`` plus the value
            column(s) to sum.
        by_dims: 1-based indices of the dimensions to group by, e.g.
            ``(1, 3)`` groups by ``Dim1`` and ``Dim3``.

    Returns:
        A new frame with its columns in ``cols`` order; every dimension that
        was not grouped by is filled with ``None``.

    Raises:
        ValueError: If ``by_dims`` is empty or holds ``n_dim`` or more
            entries.
    """
    # guard
    if not (0 < len(by_dims) < n_dim):
        raise ValueError(f"Wrong n_dim={n_dim}, len(by_dims)={len(by_dims)}")

    grouped = set(by_dims)
    # by-columns, and the dimensions that are collapsed by this aggregation
    by_cols = [f"Dim{i}" for i in by_dims]
    none_cols = [f"Dim{i}" for i in range(1, 1 + n_dim) if i not in grouped]

    # groupby-sum. The collapsed dimensions are dropped first: otherwise
    # sum() would aggregate them too (needless work, and an error for dtypes
    # that cannot be summed) only for the result to be overwritten below.
    df_grouped = (
        df.drop(columns=none_cols, errors="ignore")
        .groupby(by=by_cols)
        .sum()
        .reset_index()
    )

    # create none-columns (cannot be empty here because of the guard)
    for col in none_cols:
        df_grouped[col] = None  # or np.nan as you wish

    # reorder columns
    return df_grouped[cols]
# Spot check: group by Dim1 and Dim3 only.
print("\nCheck aggr_by_dims(df_every_dim, [1, 3]):")
print(aggr_by_dims(df_every_dim, [1, 3]))

# Combine 2. and 3.: one aggregated frame per proper subset of dimensions.
ls = [
    aggr_by_dims(df_every_dim, by_dims)
    for by_dims in multinomial_combinations(n_dim)
    if len(by_dims) < n_dim
]
# No none-dimensions: the full combination is df_every_dim itself.
ls.append(df_every_dim)

# Final result: stack all frames and renumber the rows from 0.
df_ans = pd.concat(ls, axis=0, ignore_index=True)
print(df_ans)
Output
(省略中间输出)
Dim1 Dim2 Dim3 Spend
0 A None None 300
1 B None None 700
2 None X None 400
3 None Y None 600
4 None None Z 1000
5 A X None 100
6 A Y None 200
7 B X None 300
8 B Y None 400
9 A None Z 300
10 B None Z 700
11 None X Z 400
12 None Y Z 600
13 A X Z 100
14 A Y Z 200
15 B X Z 300
16 B Y Z 400