您可以创建一个自定义函数,先找出两个数据帧之间所有匹配的索引对,然后按这些索引提取对应的行,最后用 pd.concat 将结果拼接起来(文档:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html)。
import re
def merge_regex(df1, df2):
    """Join two frames on "regex in ``df1`` matches string in ``df2``".

    Every pattern in ``df1.regex`` is tried against every value in
    ``df2.col2`` using ``re.match`` semantics (anchored at the start of the
    string). For each matching (pattern row, value row) pair, one output
    row is produced.

    Parameters
    ----------
    df1 : DataFrame
        Must expose a ``regex`` column holding the patterns.
    df2 : DataFrame
        Must expose a ``col2`` column holding the strings to test.

    Returns
    -------
    DataFrame
        Two columns: the first column of ``df1`` and the first column of
        ``df2`` (both selected by position), aligned pair by pair, with a
        fresh 0..n-1 index. Pairs are ordered by ``df1`` row, then by
        ``df2`` row. If nothing matches, an empty frame with those two
        columns is returned.
    """
    # Compile each pattern once instead of handing the raw pattern to
    # re.match for every (pattern, value) combination.
    patterns = [re.compile(pattern) for pattern in df1.regex]
    values = list(df2.col2)

    pairs = [
        (i, j)
        for i, pattern in enumerate(patterns)
        for j, value in enumerate(values)
        if pattern.match(value)
    ]

    # Build the two positional index lists separately: unpacking
    # ``zip(*pairs)`` into two names raises ValueError when there are no
    # matches at all.
    left_rows = [i for i, _ in pairs]
    right_rows = [j for _, j in pairs]

    # Positional selection of column 0 from each frame; dropping the old
    # index lets concat line the two Series up row by row.
    left = df1.iloc[left_rows, 0].reset_index(drop=True)
    right = df2.iloc[right_rows, 0].reset_index(drop=True)
    return pd.concat([left, right], axis=1)
merge_regex(df1, df2)
col1 col2
0 a ab
1 a a
2 b ab
3 c cd
4 d cd
计时结果
# My solution
In [292]: %timeit merge_regex(df1,df2)
1.21 ms ± 22.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
#Chris's solution
In [293]: %%timeit
...: df1['matches'] = df1.apply(lambda r: [x for x in df2['col2'].values if re.findall(r['regex'], x)], axis=1)
...:
...: df1.set_index('col1').explode('matches').reset_index().drop(columns=['regex'])
...:
...:
4.62 ms ± 25.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)