Option 1
我会使用 groupby 配合 transform('first')。
transform 会把每组中第一个遇到的值广播到该组的所有行上。
# Broadcast each group's first Dummy_Account onto every row of that group,
# then use it to overwrite Account_Number.
first_per_group = df.groupby('Account_Number')['Dummy_Account'].transform('first')
df.assign(Account_Number=first_per_group)
Account_Number Dummy_Account
0 ACC0000000000001 ACC0000000000001
1 ACC0000000000002 ACC0000000000002
2 ACC0000000000001 ACC0000000169532
3 ACC0000000123005 ACC0000000123005
4 ACC0000000000004 ACC0000000000004
5 ACC0000000000005 ACC0000000000005
6 ACC0000000123005 ACC0000000000040
Option 2
使用 NumPy 的 np.unique,同时取得首次出现的索引和逆向索引。
索引数组 idx 标识每个唯一的 'Account_Number' 首次出现的位置,
我用它对 'Dummy_Account' 进行切片;然后用逆向数组 inv——其本意是把唯一值放回原位——
把这些首次出现的伴随值映射回相同的位置。
# idx: position of the first occurrence of each unique account;
# inv: for every row, which unique account it maps to.
u, idx, inv = np.unique(
    df['Account_Number'].values, return_index=True, return_inverse=True
)
# Slice the companion column at the first occurrences, then scatter back with inv.
df.assign(Account_Number=df['Dummy_Account'].values[idx][inv])
Account_Number Dummy_Account
0 ACC0000000000001 ACC0000000000001
1 ACC0000000000002 ACC0000000000002
2 ACC0000000000001 ACC0000000169532
3 ACC0000000123005 ACC0000000123005
4 ACC0000000000004 ACC0000000000004
5 ACC0000000000005 ACC0000000000005
6 ACC0000000123005 ACC0000000000040
Option 3
或者使用 pd.factorize 配合 pd.Series.duplicated。
与选项 2 类似,但这里由 duplicated 来识别每个值首次出现的位置。
我先用得到的布尔数组对伴随列切片,再用 pd.factorize 的结果把它映射回去;
其中 f 的作用与选项 2 中的 inv 相同。
# d is True exactly at the first occurrence of each account.
d = ~df['Account_Number'].duplicated().values
# f holds integer codes in order of first appearance, mirroring inv from Option 2.
f, u = pd.factorize(df['Account_Number'].values)
df.assign(Account_Number=df['Dummy_Account'].values[d][f])
Account_Number Dummy_Account
0 ACC0000000000001 ACC0000000000001
1 ACC0000000000002 ACC0000000000002
2 ACC0000000000001 ACC0000000169532
3 ACC0000000123005 ACC0000000123005
4 ACC0000000000004 ACC0000000000004
5 ACC0000000000005 ACC0000000000005
6 ACC0000000123005 ACC0000000000040
时间测试
Results
# Log-log plot of raw timings, then each row normalized by its fastest entry.
res.plot(loglog=True)
res.div(res.min(axis=1), axis=0)
jez cold pir1 pir2 pir3
10 4.524811 3.819322 2.870916 1.000000 1.238144
30 4.833144 4.093932 3.310285 1.000000 1.382189
100 4.863337 4.048008 3.146154 1.000000 1.320060
300 5.144460 3.894850 3.157636 1.000000 1.357779
1000 4.870499 3.742524 2.348021 1.000000 1.069559
3000 5.375105 3.432398 1.852771 1.126024 1.000000
10000 7.100372 4.335100 1.890134 1.551161 1.000000
30000 7.227139 3.993985 1.530002 1.594531 1.000000
100000 8.052324 3.811728 1.380440 1.708170 1.000000
300000 8.690613 4.204664 1.539624 1.942090 1.000000
1000000 7.787494 3.668117 1.498758 2.129085 1.000000
Setup
def jez(d):
    """Sort so duplicate accounts are adjacent, blank the repeats, forward-fill."""
    ordered = d.sort_values('Account_Number')
    firsts = ordered['Dummy_Account'].mask(ordered.duplicated('Account_Number'))
    ordered['Account_Number'] = firsts.ffill()
    # Restore the caller's original row order.
    return ordered.sort_index()
def cold(d):
    """Build a first-occurrence lookup and map every account through it."""
    first_rows = d.drop_duplicates('Account_Number', keep='first')
    lookup = first_rows.set_index('Account_Number')['Dummy_Account']
    return d.assign(Account_Number=d['Account_Number'].map(lookup))
def pir1(d):
    """Broadcast each group's first Dummy_Account over the whole group."""
    first_vals = d.groupby('Account_Number')['Dummy_Account'].transform('first')
    return d.assign(Account_Number=first_vals)
def pir2(d):
    """Use np.unique's first-occurrence indices (idx) and inverse map (inv)."""
    accounts = d['Account_Number'].values
    _, idx, inv = np.unique(accounts, return_index=True, return_inverse=True)
    # Dummy values at each account's first occurrence, scattered back via inv.
    replacement = d['Dummy_Account'].values[idx][inv]
    return d.assign(Account_Number=replacement)
def pir3(d):
    """duplicated() marks first occurrences; factorize codes map them back."""
    keep_first = ~d['Account_Number'].duplicated().values
    codes, _ = pd.factorize(d['Account_Number'].values)
    # Both keep_first and factorize order by first appearance, so they align.
    replacement = d['Dummy_Account'].values[keep_first][codes]
    return d.assign(Account_Number=replacement)
# Benchmark frame: one row per input size, one column per contender.
res = pd.DataFrame(
    index=[10, 30, 100, 300, 1000, 3000, 10000,
           30000, 100000, 300000, 1000000],
    columns='jez cold pir1 pir2 pir3'.split(),
    dtype=float
)

np.random.seed([3, 1415])
for i in res.index:
    # Roughly two rows per distinct account, so every method has duplicates
    # to resolve at each size.
    d = pd.DataFrame(dict(
        Account_Number=np.random.randint(i // 2, size=i),
        Dummy_Account=range(i)
    ))
    # NOTE(fix): the original then re-assigned
    #   d = pd.concat([df] * i, ignore_index=True)
    # which discarded the freshly generated data in favor of i copies of an
    # unrelated `df` (quadratic blow-up). That line is removed.
    for j in res.columns:
        stmt = f'{j}(d)'
        setp = f'from __main__ import {j}, d'
        res.at[i, j] = timeit(stmt, setp, number=100)