Summary
人们可以写出这个计算大约。速度提高一个数量级(对于您指定的输入):
import numpy as np
def biweight_midcorrelation(arr):
n, m = arr.shape
arr = arr - np.median(arr, axis=0, keepdims=True)
v = 1 - (arr / (9 * np.median(np.abs(arr), axis=0, keepdims=True))) ** 2
arr = arr * v ** 2 * (v > 0)
norms = np.sqrt(np.sum(arr ** 2, axis=0))
return np.einsum('mi,mj->ij', arr, arr) / norms[:, None] / norms[None, :]
通过以下方式桥接到 Pandas 数据框:
import pandas as pd
def corr_np2pd(df, func):
return pd.DataFrame(func(np.array(df)), index=df.columns, columns=df.columns)
其用法是:
corr_df = corr_np2pd(df, biweight_midcorrelation)
通过使用 Numba 实现最后一次计算,这可以变得更快。
介绍
我不太清楚为什么您期望广播在当前代码中有所帮助。
您可能指的是矢量化吗?
无论如何,我相信可以编写更快的代码,并且“旧”方法的矢量化版本将优于当前的方法。
使用 Numba 可以使这变得更快。
有两种实用方法可以解决您的问题:
- 手动计算相关矩阵
- 生成要传递给的相关函数
pd.DataFrame.corr()
当执行(1)时,如果不计算相关矩阵的不必要部分,则可能无法避免显式循环。
执行 (2) 时,需要计算每对(对称)一维输入的计算辅助值(2 * comb(n, 2) https://docs.python.org/3/library/math.html#math.comb次),而不是为每个 1D 输入仅计算一次辅助值(n
次)。例如,对于问题中指定的输入,需要执行n == 4
预先计算,但是,如果以成对方式完成,这个数字就会变成2 * comb(4, 2) == 12
.
让我们看看如何提高这两种情况下的性能。
手动计算相关矩阵
让我们首先定义一个函数作为 Pandas 到 NumPy 的桥梁:
import numpy as np
import pandas as pd
def corr_np2pd(df, func):
return pd.DataFrame(func(np.array(df)), index=df.columns, columns=df.columns)
现在注释中具有显式循环的函数属于此类别的,如下所示biweight_midcorrelation_pd_OP()
:
def biweight_midcorrelation_pd_OP(X):
median = X.median()
mad = (X - median).abs().median()
U = (X - median) / (9 * mad)
adjacency = np.square(1 - np.square(U)) * ((1 - U.abs()) > 0)
estimator = (X - median) * adjacency
bicor_matrix = np.empty((X.shape[1], X.shape[1]), dtype=float)
for i, ac in enumerate(estimator):
for j, bc in enumerate(estimator):
a = estimator[ac]
b = estimator[bc]
c = (a * b).sum() / (
np.sqrt(np.square(a).sum()) * np.sqrt(np.square(b).sum()))
bicor_matrix[i, j] = c
bicor_matrix[j, i] = c
return pd.DataFrame(bicor_matrix, index=X.columns, columns=X.columns)
稍加修改的版本,其中计算完全在 NumPy 中完成,并且应该与corr_np2pd()
, reads:
def biweight_midcorrelation_OP(arr):
n, m = arr.shape
med = np.median(arr, axis=0, keepdims=True)
mad = np.median(np.abs(arr - med), axis=0, keepdims=True)
u = (arr - med) / (9 * mad)
adj = ((1 - u ** 2) ** 2) * ((1 - np.abs(u)) > 0)
est = (arr - med) * adj
result = np.empty((m, m))
for i in range(m):
for j in range(m):
a = est[:, i]
b = est[:, j]
c = (a * b).sum() / (
np.sqrt(np.sum(a ** 2)) * np.sqrt(np.sum(b ** 2)))
result[i, j] = result[j, i] = c
return result
现在,这有一些改进点:
最后一点可以通过两种方式改进:
- 通过只计算对称索引一次,结果是
biweight_midcorrelation_np()
- 通过以向量化形式编写它,结果是
biweight_midcorrelation_npv()
def biweight_midcorrelation_np(arr):
n, m = arr.shape
arr = arr - np.median(arr, axis=0, keepdims=True)
v = 1 - (arr / (9 * np.median(np.abs(arr), axis=0, keepdims=True))) ** 2
arr = arr * v ** 2 * (v > 0)
norms = np.sqrt(np.sum(arr ** 2, axis=0))
result = np.empty((m, m))
np.fill_diagonal(result, 1.0)
for i, j in zip(*np.triu_indices(m, 1)):
result[i, j] = result[j, i] = \
np.sum(arr[:, i] * arr[:, j]) / norms[i] / norms[j]
return result
def biweight_midcorrelation_npv(arr):
n, m = arr.shape
arr = arr - np.median(arr, axis=0, keepdims=True)
v = 1 - (arr / (9 * np.median(np.abs(arr), axis=0, keepdims=True))) ** 2
arr = arr * v ** 2 * (v > 0)
norms = np.sqrt(np.sum(arr ** 2, axis=0))
return np.einsum('mi,mj->ij', arr, arr) / norms[:, None] / norms[None, :]
第一个会很快,只要m
很小,因为显式循环。
第二个通常会很快,但计算矩阵的某些条目两次似乎效率低下。
为了解决这两个问题,可以使用 Numba 重写最终循环:
import numba as nb
@nb.jit
def _biweight_midcorrelation_triu_nb(n, m, est, norms, result):
for i in range(m):
for j in range(i + 1, m):
x = 0
for k in range(n):
x += est[k, i] * est[k, j]
result[i, j] = result[j, i] = x / norms[i] / norms[j]
def biweight_midcorrelation_nb(arr):
n, m = arr.shape
arr = arr - np.median(arr, axis=0, keepdims=True)
v = 1 - (arr / (9 * np.median(np.abs(arr), axis=0, keepdims=True))) ** 2
arr = arr * v ** 2 * (v > 0)
norms = np.sqrt(np.sum(arr ** 2, axis=0))
result = np.empty((m, m))
np.fill_diagonal(result, 1.0)
_biweight_midcorrelation_triu_nb(n, m, arr, norms, result)
return result
成对相关函数
您现在提出的方法的稍微修改版本属于此类别的:
def pairwise_biweight_midcorrelation_OP(a, b):
a_median = np.median(a)
b_median = np.median(b)
a_mad = np.median(np.abs(a - a_median))
b_mad = np.median(np.abs(b - b_median))
u_a = (a - a_median) / (9 * a_mad)
u_b = (b - b_median) / (9 * b_mad)
adj_a = (1 - u_a ** 2) ** 2 * ((1 - np.abs(u_a)) > 0)
adj_b = (1 - u_b ** 2) ** 2 * ((1 - np.abs(u_b)) > 0)
a = (a - a_median) * adj_a
b = (b - b_median) * adj_b
return np.sum(a * b) / (np.sqrt(np.sum(a ** 2)) * np.sqrt(np.sum(b ** 2)))
使用与上面类似的简化,可以写得更简洁一些,结果是:
def pairwise_biweight_midcorrelation_opt(a, b):
a = a - np.median(a)
b = b - np.median(b)
v_a = 1 - (a / (9 * np.median(np.abs(a)))) ** 2
v_b = 1 - (b / (9 * np.median(np.abs(b)))) ** 2
a = a * v_a ** 2 * (v_a > 0)
b = b * v_b ** 2 * (v_b > 0)
return np.sum(a * b) / (np.sqrt(np.sum(a ** 2)) * np.sqrt(np.sum(b ** 2)))
最后一个操作是求和a
and b
三次,但实际上可以在一个循环中完成,这可以使用 Numba 再次快速完成:
@nb.jit
def pairwise_biweight_midcorrelation_nb(a, b):
n = a.size
a = a - np.median(a)
b = b - np.median(b)
v_a = 1 - (a / (9 * np.median(np.abs(a)))) ** 2
v_b = 1 - (b / (9 * np.median(np.abs(b)))) ** 2
a = (v_a > 0) * a * v_a ** 2
b = (v_b > 0) * b * v_b ** 2
s_ab = s_aa = s_bb = 0
for i in range(n):
s_ab += a[i] * b[i]
s_aa += a[i] * a[i]
s_bb += b[i] * b[i]
return s_ab / np.sqrt(s_aa) / np.sqrt(s_bb)
但没有简单的方法可以避免执行预计算2 * comb(n, 2) https://docs.python.org/3/library/math.html#math.comb次而不是n
次。
故事的另一面是,这类方法需要更少的内存,因为每次迭代只考虑两个一维数组。
Testing
对于建议的输入:
import pandas as pd
df = pd.DataFrame({'sepal_length': {'iris_0': 5.1, 'iris_1': 4.9, 'iris_2': 4.7, 'iris_3': 4.6, 'iris_4': 5.0, 'iris_5': 5.4, 'iris_6': 4.6, 'iris_7': 5.0, 'iris_8': 4.4, 'iris_9': 4.9, 'iris_10': 5.4, 'iris_11': 4.8, 'iris_12': 4.8, 'iris_13': 4.3, 'iris_14': 5.8, 'iris_15': 5.7, 'iris_16': 5.4, 'iris_17': 5.1, 'iris_18': 5.7, 'iris_19': 5.1, 'iris_20': 5.4, 'iris_21': 5.1, 'iris_22': 4.6, 'iris_23': 5.1, 'iris_24': 4.8, 'iris_25': 5.0, 'iris_26': 5.0, 'iris_27': 5.2, 'iris_28': 5.2, 'iris_29': 4.7, 'iris_30': 4.8, 'iris_31': 5.4, 'iris_32': 5.2, 'iris_33': 5.5, 'iris_34': 4.9, 'iris_35': 5.0, 'iris_36': 5.5, 'iris_37': 4.9, 'iris_38': 4.4, 'iris_39': 5.1, 'iris_40': 5.0, 'iris_41': 4.5, 'iris_42': 4.4, 'iris_43': 5.0, 'iris_44': 5.1, 'iris_45': 4.8, 'iris_46': 5.1, 'iris_47': 4.6, 'iris_48': 5.3, 'iris_49': 5.0, 'iris_50': 7.0, 'iris_51': 6.4, 'iris_52': 6.9, 'iris_53': 5.5, 'iris_54': 6.5, 'iris_55': 5.7, 'iris_56': 6.3, 'iris_57': 4.9, 'iris_58': 6.6, 'iris_59': 5.2, 'iris_60': 5.0, 'iris_61': 5.9, 'iris_62': 6.0, 'iris_63': 6.1, 'iris_64': 5.6, 'iris_65': 6.7, 'iris_66': 5.6, 'iris_67': 5.8, 'iris_68': 6.2, 'iris_69': 5.6, 'iris_70': 5.9, 'iris_71': 6.1, 'iris_72': 6.3, 'iris_73': 6.1, 'iris_74': 6.4, 'iris_75': 6.6, 'iris_76': 6.8, 'iris_77': 6.7, 'iris_78': 6.0, 'iris_79': 5.7, 'iris_80': 5.5, 'iris_81': 5.5, 'iris_82': 5.8, 'iris_83': 6.0, 'iris_84': 5.4, 'iris_85': 6.0, 'iris_86': 6.7, 'iris_87': 6.3, 'iris_88': 5.6, 'iris_89': 5.5, 'iris_90': 5.5, 'iris_91': 6.1, 'iris_92': 5.8, 'iris_93': 5.0, 'iris_94': 5.6, 'iris_95': 5.7, 'iris_96': 5.7, 'iris_97': 6.2, 'iris_98': 5.1, 'iris_99': 5.7, 'iris_100': 6.3, 'iris_101': 5.8, 'iris_102': 7.1, 'iris_103': 6.3, 'iris_104': 6.5, 'iris_105': 7.6, 'iris_106': 4.9, 'iris_107': 7.3, 'iris_108': 6.7, 'iris_109': 7.2, 'iris_110': 6.5, 'iris_111': 6.4, 'iris_112': 6.8, 'iris_113': 5.7, 'iris_114': 5.8, 'iris_115': 6.4, 'iris_116': 6.5, 'iris_117': 7.7, 'iris_118': 7.7, 'iris_119': 6.0, 'iris_120': 6.9, 'iris_121': 5.6, 'iris_122': 7.7, 'iris_123': 6.3, 'iris_124': 6.7, 'iris_125': 7.2, 'iris_126': 6.2, 'iris_127': 6.1, 'iris_128': 6.4, 'iris_129': 7.2, 'iris_130': 7.4, 'iris_131': 7.9, 'iris_132': 6.4, 'iris_133': 6.3, 'iris_134': 6.1, 'iris_135': 7.7, 'iris_136': 6.3, 'iris_137': 6.4, 'iris_138': 6.0, 'iris_139': 6.9, 'iris_140': 6.7, 'iris_141': 6.9, 'iris_142': 5.8, 'iris_143': 6.8, 'iris_144': 6.7, 'iris_145': 6.7, 'iris_146': 6.3, 'iris_147': 6.5, 'iris_148': 6.2, 'iris_149': 5.9}, 'sepal_width': {'iris_0': 3.5, 'iris_1': 3.0, 'iris_2': 3.2, 'iris_3': 3.1, 'iris_4': 3.6, 'iris_5': 3.9, 'iris_6': 3.4, 'iris_7': 3.4, 'iris_8': 2.9, 'iris_9': 3.1, 'iris_10': 3.7, 'iris_11': 3.4, 'iris_12': 3.0, 'iris_13': 3.0, 'iris_14': 4.0, 'iris_15': 4.4, 'iris_16': 3.9, 'iris_17': 3.5, 'iris_18': 3.8, 'iris_19': 3.8, 'iris_20': 3.4, 'iris_21': 3.7, 'iris_22': 3.6, 'iris_23': 3.3, 'iris_24': 3.4, 'iris_25': 3.0, 'iris_26': 3.4, 'iris_27': 3.5, 'iris_28': 3.4, 'iris_29': 3.2, 'iris_30': 3.1, 'iris_31': 3.4, 'iris_32': 4.1, 'iris_33': 4.2, 'iris_34': 3.1, 'iris_35': 3.2, 'iris_36': 3.5, 'iris_37': 3.6, 'iris_38': 3.0, 'iris_39': 3.4, 'iris_40': 3.5, 'iris_41': 2.3, 'iris_42': 3.2, 'iris_43': 3.5, 'iris_44': 3.8, 'iris_45': 3.0, 'iris_46': 3.8, 'iris_47': 3.2, 'iris_48': 3.7, 'iris_49': 3.3, 'iris_50': 3.2, 'iris_51': 3.2, 'iris_52': 3.1, 'iris_53': 2.3, 'iris_54': 2.8, 'iris_55': 2.8, 'iris_56': 3.3, 'iris_57': 2.4, 'iris_58': 2.9, 'iris_59': 2.7, 'iris_60': 2.0, 'iris_61': 3.0, 'iris_62': 2.2, 'iris_63': 2.9, 'iris_64': 2.9, 'iris_65': 3.1, 'iris_66': 3.0, 'iris_67': 2.7, 'iris_68': 2.2, 'iris_69': 2.5, 'iris_70': 3.2, 'iris_71': 2.8, 'iris_72': 2.5, 'iris_73': 2.8, 'iris_74': 2.9, 'iris_75': 3.0, 'iris_76': 2.8, 'iris_77': 3.0, 'iris_78': 2.9, 'iris_79': 2.6, 'iris_80': 2.4, 'iris_81': 2.4, 'iris_82': 2.7, 'iris_83': 2.7, 'iris_84': 3.0, 'iris_85': 3.4, 'iris_86': 3.1, 'iris_87': 2.3, 'iris_88': 3.0, 'iris_89': 2.5, 'iris_90': 2.6, 'iris_91': 3.0, 'iris_92': 2.6, 'iris_93': 2.3, 'iris_94': 2.7, 'iris_95': 3.0, 'iris_96': 2.9, 'iris_97': 2.9, 'iris_98': 2.5, 'iris_99': 2.8, 'iris_100': 3.3, 'iris_101': 2.7, 'iris_102': 3.0, 'iris_103': 2.9, 'iris_104': 3.0, 'iris_105': 3.0, 'iris_106': 2.5, 'iris_107': 2.9, 'iris_108': 2.5, 'iris_109': 3.6, 'iris_110': 3.2, 'iris_111': 2.7, 'iris_112': 3.0, 'iris_113': 2.5, 'iris_114': 2.8, 'iris_115': 3.2, 'iris_116': 3.0, 'iris_117': 3.8, 'iris_118': 2.6, 'iris_119': 2.2, 'iris_120': 3.2, 'iris_121': 2.8, 'iris_122': 2.8, 'iris_123': 2.7, 'iris_124': 3.3, 'iris_125': 3.2, 'iris_126': 2.8, 'iris_127': 3.0, 'iris_128': 2.8, 'iris_129': 3.0, 'iris_130': 2.8, 'iris_131': 3.8, 'iris_132': 2.8, 'iris_133': 2.8, 'iris_134': 2.6, 'iris_135': 3.0, 'iris_136': 3.4, 'iris_137': 3.1, 'iris_138': 3.0, 'iris_139': 3.1, 'iris_140': 3.1, 'iris_141': 3.1, 'iris_142': 2.7, 'iris_143': 3.2, 'iris_144': 3.3, 'iris_145': 3.0, 'iris_146': 2.5, 'iris_147': 3.0, 'iris_148': 3.4, 'iris_149': 3.0}, 'petal_length': {'iris_0': 1.4, 'iris_1': 1.4, 'iris_2': 1.3, 'iris_3': 1.5, 'iris_4': 1.4, 'iris_5': 1.7, 'iris_6': 1.4, 'iris_7': 1.5, 'iris_8': 1.4, 'iris_9': 1.5, 'iris_10': 1.5, 'iris_11': 1.6, 'iris_12': 1.4, 'iris_13': 1.1, 'iris_14': 1.2, 'iris_15': 1.5, 'iris_16': 1.3, 'iris_17': 1.4, 'iris_18': 1.7, 'iris_19': 1.5, 'iris_20': 1.7, 'iris_21': 1.5, 'iris_22': 1.0, 'iris_23': 1.7, 'iris_24': 1.9, 'iris_25': 1.6, 'iris_26': 1.6, 'iris_27': 1.5, 'iris_28': 1.4, 'iris_29': 1.6, 'iris_30': 1.6, 'iris_31': 1.5, 'iris_32': 1.5, 'iris_33': 1.4, 'iris_34': 1.5, 'iris_35': 1.2, 'iris_36': 1.3, 'iris_37': 1.4, 'iris_38': 1.3, 'iris_39': 1.5, 'iris_40': 1.3, 'iris_41': 1.3, 'iris_42': 1.3, 'iris_43': 1.6, 'iris_44': 1.9, 'iris_45': 1.4, 'iris_46': 1.6, 'iris_47': 1.4, 'iris_48': 1.5, 'iris_49': 1.4, 'iris_50': 4.7, 'iris_51': 4.5, 'iris_52': 4.9, 'iris_53': 4.0, 'iris_54': 4.6, 'iris_55': 4.5, 'iris_56': 4.7, 'iris_57': 3.3, 'iris_58': 4.6, 'iris_59': 3.9, 'iris_60': 3.5, 'iris_61': 4.2, 'iris_62': 4.0, 'iris_63': 4.7, 'iris_64': 3.6, 'iris_65': 4.4, 'iris_66': 4.5, 'iris_67': 4.1, 'iris_68': 4.5, 'iris_69': 3.9, 'iris_70': 4.8, 'iris_71': 4.0, 'iris_72': 4.9, 'iris_73': 4.7, 'iris_74': 4.3, 'iris_75': 4.4, 'iris_76': 4.8, 'iris_77': 5.0, 'iris_78': 4.5, 'iris_79': 3.5, 'iris_80': 3.8, 'iris_81': 3.7, 'iris_82': 3.9, 'iris_83': 5.1, 'iris_84': 4.5, 'iris_85': 4.5, 'iris_86': 4.7, 'iris_87': 4.4, 'iris_88': 4.1, 'iris_89': 4.0, 'iris_90': 4.4, 'iris_91': 4.6, 'iris_92': 4.0, 'iris_93': 3.3, 'iris_94': 4.2, 'iris_95': 4.2, 'iris_96': 4.2, 'iris_97': 4.3, 'iris_98': 3.0, 'iris_99': 4.1, 'iris_100': 6.0, 'iris_101': 5.1, 'iris_102': 5.9, 'iris_103': 5.6, 'iris_104': 5.8, 'iris_105': 6.6, 'iris_106': 4.5, 'iris_107': 6.3, 'iris_108': 5.8, 'iris_109': 6.1, 'iris_110': 5.1, 'iris_111': 5.3, 'iris_112': 5.5, 'iris_113': 5.0, 'iris_114': 5.1, 'iris_115': 5.3, 'iris_116': 5.5, 'iris_117': 6.7, 'iris_118': 6.9, 'iris_119': 5.0, 'iris_120': 5.7, 'iris_121': 4.9, 'iris_122': 6.7, 'iris_123': 4.9, 'iris_124': 5.7, 'iris_125': 6.0, 'iris_126': 4.8, 'iris_127': 4.9, 'iris_128': 5.6, 'iris_129': 5.8, 'iris_130': 6.1, 'iris_131': 6.4, 'iris_132': 5.6, 'iris_133': 5.1, 'iris_134': 5.6, 'iris_135': 6.1, 'iris_136': 5.6, 'iris_137': 5.5, 'iris_138': 4.8, 'iris_139': 5.4, 'iris_140': 5.6, 'iris_141': 5.1, 'iris_142': 5.1, 'iris_143': 5.9, 'iris_144': 5.7, 'iris_145': 5.2, 'iris_146': 5.0, 'iris_147': 5.2, 'iris_148': 5.4, 'iris_149': 5.1}, 'petal_width': {'iris_0': 0.2, 'iris_1': 0.2, 'iris_2': 0.2, 'iris_3': 0.2, 'iris_4': 0.2, 'iris_5': 0.4, 'iris_6': 0.3, 'iris_7': 0.2, 'iris_8': 0.2, 'iris_9': 0.1, 'iris_10': 0.2, 'iris_11': 0.2, 'iris_12': 0.1, 'iris_13': 0.1, 'iris_14': 0.2, 'iris_15': 0.4, 'iris_16': 0.4, 'iris_17': 0.3, 'iris_18': 0.3, 'iris_19': 0.3, 'iris_20': 0.2, 'iris_21': 0.4, 'iris_22': 0.2, 'iris_23': 0.5, 'iris_24': 0.2, 'iris_25': 0.2, 'iris_26': 0.4, 'iris_27': 0.2, 'iris_28': 0.2, 'iris_29': 0.2, 'iris_30': 0.2, 'iris_31': 0.4, 'iris_32': 0.1, 'iris_33': 0.2, 'iris_34': 0.2, 'iris_35': 0.2, 'iris_36': 0.2, 'iris_37': 0.1, 'iris_38': 0.2, 'iris_39': 0.2, 'iris_40': 0.3, 'iris_41': 0.3, 'iris_42': 0.2, 'iris_43': 0.6, 'iris_44': 0.4, 'iris_45': 0.3, 'iris_46': 0.2, 'iris_47': 0.2, 'iris_48': 0.2, 'iris_49': 0.2, 'iris_50': 1.4, 'iris_51': 1.5, 'iris_52': 1.5, 'iris_53': 1.3, 'iris_54': 1.5, 'iris_55': 1.3, 'iris_56': 1.6, 'iris_57': 1.0, 'iris_58': 1.3, 'iris_59': 1.4, 'iris_60': 1.0, 'iris_61': 1.5, 'iris_62': 1.0, 'iris_63': 1.4, 'iris_64': 1.3, 'iris_65': 1.4, 'iris_66': 1.5, 'iris_67': 1.0, 'iris_68': 1.5, 'iris_69': 1.1, 'iris_70': 1.8, 'iris_71': 1.3, 'iris_72': 1.5, 'iris_73': 1.2, 'iris_74': 1.3, 'iris_75': 1.4, 'iris_76': 1.4, 'iris_77': 1.7, 'iris_78': 1.5, 'iris_79': 1.0, 'iris_80': 1.1, 'iris_81': 1.0, 'iris_82': 1.2, 'iris_83': 1.6, 'iris_84': 1.5, 'iris_85': 1.6, 'iris_86': 1.5, 'iris_87': 1.3, 'iris_88': 1.3, 'iris_89': 1.3, 'iris_90': 1.2, 'iris_91': 1.4, 'iris_92': 1.2, 'iris_93': 1.0, 'iris_94': 1.3, 'iris_95': 1.2, 'iris_96': 1.3, 'iris_97': 1.3, 'iris_98': 1.1, 'iris_99': 1.3, 'iris_100': 2.5, 'iris_101': 1.9, 'iris_102': 2.1, 'iris_103': 1.8, 'iris_104': 2.2, 'iris_105': 2.1, 'iris_106': 1.7, 'iris_107': 1.8, 'iris_108': 1.8, 'iris_109': 2.5, 'iris_110': 2.0, 'iris_111': 1.9, 'iris_112': 2.1, 'iris_113': 2.0, 'iris_114': 2.4, 'iris_115': 2.3, 'iris_116': 1.8, 'iris_117': 2.2, 'iris_118': 2.3, 'iris_119': 1.5, 'iris_120': 2.3, 'iris_121': 2.0, 'iris_122': 2.0, 'iris_123': 1.8, 'iris_124': 2.1, 'iris_125': 1.8, 'iris_126': 1.8, 'iris_127': 1.8, 'iris_128': 2.1, 'iris_129': 1.6, 'iris_130': 1.9, 'iris_131': 2.0, 'iris_132': 2.2, 'iris_133': 1.5, 'iris_134': 1.4, 'iris_135': 2.3, 'iris_136': 2.4, 'iris_137': 1.8, 'iris_138': 1.8, 'iris_139': 2.1, 'iris_140': 2.4, 'iris_141': 2.3, 'iris_142': 1.9, 'iris_143': 2.3, 'iris_144': 2.5, 'iris_145': 2.3, 'iris_146': 1.9, 'iris_147': 2.0, 'iris_148': 2.3, 'iris_149': 1.8}})
我们获得:
print(np.all(np.isclose(biweight_midcorrelation_pd_OP(df), result)))
# True
print(np.all(np.isclose(corr_np2pd(df, biweight_midcorrelation_OP), result)))
# True
print(np.all(np.isclose(corr_np2pd(df, biweight_midcorrelation_np), result)))
# True
print(np.all(np.isclose(corr_np2pd(df, biweight_midcorrelation_npv), result)))
# True
print(np.all(np.isclose(corr_np2pd(df, biweight_midcorrelation_nb), result)))
# True
print(np.all(np.isclose(df.corr(method=pairwise_biweight_midcorrelation_OP), result)))
# True
print(np.all(np.isclose(df.corr(method=pairwise_biweight_midcorrelation_opt), result)))
# True
print(np.all(np.isclose(df.corr(method=pairwise_biweight_midcorrelation_nb), result)))
# True
基准测试
%timeit biweight_midcorrelation_pd_OP(df)
# 10 loops, best of 3: 22.1 ms per loop
%timeit corr_np2pd(df, biweight_midcorrelation_OP)
# 1000 loops, best of 3: 682 µs per loop
%timeit corr_np2pd(df, biweight_midcorrelation_np)
# 1000 loops, best of 3: 422 µs per loop
%timeit corr_np2pd(df, biweight_midcorrelation_npv)
# 1000 loops, best of 3: 341 µs per loop
%timeit corr_np2pd(df, biweight_midcorrelation_nb)
# 1000 loops, best of 3: 325 µs per loop
%timeit df.corr(method=pairwise_biweight_midcorrelation_OP)
# 100 loops, best of 3: 1.96 ms per loop
%timeit df.corr(method=pairwise_biweight_midcorrelation_opt)
# 100 loops, best of 3: 1.83 ms per loop
%timeit df.corr(method=pairwise_biweight_midcorrelation_nb)
# 1000 loops, best of 3: 506 µs per loop
这些结果表明基于 Numba 的方法是最快的,紧随其后的是原始方法的 NumPy 向量化版本。
请注意,从基于 Pandas 的计算到纯粹基于 NumPy 的方法(即使使用显式循环),我们获得了几乎 30 倍的速度系数。
并将两者矢量化for
Loops 又给我们买了大约。 2 倍系数。
The pd.DataFrame.corr()
当不使用 Numba 时,基于方法大约是。比用 NumPy 重写的原始方法慢 4 倍,因此即使您没有看到显式循环,也要小心!
Numba 加速pairwise_biweight_midcorrelation_nb()
给这一系列方法带来了显着的提升,但它不可能避免预计算的开销。
最后警告:所有这些基准都应该持保留态度!
(EDITED包括基于 Numba 的方法来使用pd.DataFrame.corr()
).