我们将使用defaultdict
默认值是-np.inf
因为我将采用最大值,并且我想要一个所有值都大于的默认值。
solution
给定一组组g
和累积最大值的值v
def pir1(g, v):
d = defaultdict(lambda: -np.inf)
result = np.empty(len(g))
def d_(g, v):
d[g] = max(d[g], v)
return d[g]
for i in range(len(g)):
result[i] = d_(g[i], v[i])
return result
示范
LENGTH = 100000
g = np.random.randint(low=0, high=LENGTH/2, size=LENGTH)
v = np.random.rand(LENGTH)
accuracy
vm = pd.DataFrame(dict(group=g, value=v)).groupby('group').value.cummax()
vm.eq(pir1(g, v)).all()
True
深潜
与 Divakar 的答案进行比较
headline chart
code
我对 Divakar 的函数进行了一些修改,以使其准确。
%%cython
import numpy as np
from collections import defaultdict
# this is a cythonized version of the next function
def pir1(g, v):
d = defaultdict(lambda: -np.inf)
result = np.empty(len(g))
def d_(g, v):
d[g] = max(d[g], v)
return d[g]
for i in range(len(g)):
result[i] = d_(g[i], v[i])
return result
def pir2(g, v):
d = defaultdict(lambda: -np.inf)
result = np.empty(len(g))
def d_(g, v):
d[g] = max(d[g], v)
return d[g]
for i in range(len(g)):
result[i] = d_(g[i], v[i])
return result
def argsort_unique(idx):
# Original idea : http://stackoverflow.com/a/41242285/3293881
n = idx.size
sidx = np.empty(n,dtype=int)
sidx[idx] = np.arange(n)
return sidx
def div1(groupby, value):
sidx = np.argsort(groupby,kind='mergesort')
sorted_groupby, sorted_value = groupby[sidx], value[sidx]
# Get shifts to be used for shifting each group
mx = sorted_value.max() + 1
shifts = sorted_groupby * mx
# Shift and get max accumlate along value col.
# Those shifts helping out in keeping cummulative max within each group.
group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
return group_cummaxed[argsort_unique(sidx)]
def div2(groupby, value):
sidx = np.argsort(groupby, kind='mergesort')
sorted_groupby, sorted_value = groupby[sidx], value[sidx]
# factorize groups to integers
sorted_groupby = np.append(
0, sorted_groupby[1:] != sorted_groupby[:-1]).cumsum()
# Get shifts to be used for shifting each group
mx = sorted_value.max() + 1
shifts = (sorted_groupby - sorted_groupby.min()) * mx
# Shift and get max accumlate along value col.
# Those shifts helping out in keeping cummulative max within each group.
group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
return group_cummaxed[argsort_unique(sidx)]
NOTES:
- 有必要对迪瓦卡解决方案中的群进行因式分解以对其进行概括
accuracy
整数群
在基于整数的组上,两者div1
and div2
产生相同的结果
np.isclose(div1(g, v), pir1(g, v)).all()
True
np.isclose(div2(g, v), pir1(g, v)).all()
True
一般团体
基于字符串和浮点的组div1
变得不准确但很容易修复
g = g / 1000000
np.isclose(div1(g, v), pir1(g, v)).all()
False
np.isclose(div2(g, v), pir1(g, v)).all()
True
时间测试
results = pd.DataFrame(
index=pd.Index(['pir1', 'pir2', 'div1', 'div2'], name='method'),
columns=pd.Index(['Large', 'Medium', 'Small'], name='group size'))
size_map = dict(Large=100, Medium=10, Small=1)
from timeit import timeit
for i in results.index:
for j in results.columns:
results.set_value(
i, j,
timeit(
'{}(g // size_map[j], v)'.format(i),
'from __main__ import {}, size_map, g, v, j'.format(i),
number=100
)
)
results
results.T.plot.barh()