按月份和 Name 分组
取消堆叠(unstack)并按月重新采样,这样所有月份都会出现,并且每个 Name 各占一列
# Collect the 'data' values of every (month, Name) group into a set, pivot
# Name into columns, then resample monthly so every month gets a row.
# pd.Grouper(freq='M') is the supported spelling of the deprecated
# pd.TimeGrouper('M'), which later pandas releases no longer provide.
df2 = df.groupby([pd.Grouper(freq='M'), 'Name'])['data'].apply(set).unstack().resample('M').sum()
df2
Name Alpha Beta
Date
2017-01-31 {A} {A, D}
2017-02-28 {A} None
2017-03-31 {B} {C}
2017-04-30 None {C}
2017-05-31 None {B}
多个迭代器
一些 itertools 魔法可以在同一列上多次迭代
def multiple_iterator(iterable, r=2):
    """Return ``r`` independent iterators over ``iterable``, staggered by position.

    The iterator at position ``k`` has already been advanced ``k`` items, so
    zipping the returned iterators yields consecutive windows of length ``r``.

    Returns ``None`` instead of the iterators if the input runs out while one
    of them is still being advanced to its starting offset.
    """
    clones = itertools.tee(iterable, r)
    for offset, clone in enumerate(clones):
        for _ in range(offset):
            try:
                next(clone)
            except StopIteration:
                # The input is too short to give this clone its full offset.
                return None
    return clones
真正的工作
def get_unique_items_rolling(df, period):
    """Yield, for each column of ``df``, the union of its sets over a rolling window.

    Args:
        df: DataFrame whose cells are sets (null or empty where there is no data).
        period: Number of consecutive rows combined into each window.

    Yields:
        ``(column name, Series)`` pairs. Each Series is labelled with the index
        value of the last row of every window and holds the set of all items
        found in that window; null and empty cells are skipped. A column too
        short for ``multiple_iterator`` to build its iterators yields an empty
        Series.
    """
    # Iterate the frame that was passed in: the previous version read the
    # global ``df2`` and silently ignored its ``df`` argument.
    # ``items()`` is used because ``iteritems()`` was removed in pandas 2.0.
    for col_name, col in df.items():
        windows = multiple_iterator(col, period)
        if windows is None:
            # Unpacking None would raise TypeError; a column with no complete
            # window simply has no rolling results.
            yield col_name, pd.Series(dtype=object)
            continue
        # The most-advanced iterator has exactly len(labels) items left, so
        # zip(*windows) produces one window per label.
        labels = col.index[period - 1:]
        unions = [
            set(itertools.chain.from_iterable(
                cell for cell in window if pd.notnull(cell) and cell
            ))
            for window in zip(*windows)
        ]
        # Build the Series in one step with an explicit object dtype instead of
        # growing a dtype-less Series one label at a time.
        yield col_name, pd.Series(unions, index=labels, dtype=object)
# Assemble the per-column rolling results into a single frame.
# DataFrame.from_items was removed in pandas 1.0; from_dict(dict(...)) is the
# documented replacement. Column order follows the order the generator yields
# (dict insertion order) on Python 3.7+ with pandas >= 0.23.
df3 = pd.DataFrame.from_dict(dict(get_unique_items_rolling(df2, period)))
Alpha Beta
2017-03-31 {A, B} {A, D, C}
2017-04-30 {A, B} {C}
2017-05-31 {B} {B, C}
# Count the unique items in each (Date, Name) rolling window.
(
    df3
    .stack()
    .apply(len)
)
Date Name
2017-03-31 Alpha 2
Beta 3
2017-04-30 Alpha 2
Beta 1
2017-05-31 Alpha 1
Beta 2
dtype: int64