我尝试使用高斯函数对这个数据集(dataset:https://drive.google.com/file/d/1Llmc0UunCqocAgCHOp6MDL8uKYUWpdDO/view)中的每个参数进行标准化,正数和负数各处理 2 次。数据集中还包含缺失数据。问题是:我想在散点图中使用 cmap='coolwarm' 突出显示异常值,
针对参数 A、B,特别是 T,并满足以下要求:
- 位于该区间之外的异常值可以用 (x) 或 (*) 标记,并使用 cmap='coolwarm' 着色。
- 图表右侧应当显示颜色条 cbar。
- 我的目标是在清洗数据之前以优雅的方式突出显示这些异常值,然后以同一页中的子图形式,对比原始数据与处理后数据的前后图表。
- 是否可以通过 `from sklearn.neighbors import LocalOutlierFactor` 来突出显示异常值?或者受这个回答(answer:https://stackoverflow.com/a/44652377/10452700)的启发,定义 Vmin 和 Vmax?还是应该在突出显示之前先用布尔掩码(Boolean masking)标记异常值(出于学习目的),或者定义函数来检测它们?
我目前用来给异常值着色的代码如下:
def normalize(value, min_value, max_value, min_norm, max_norm):
    """Linearly map ``value`` from one range onto another.

    Args:
        value: The number (or array-like supporting arithmetic) to rescale.
        min_value: Lower end of the source range.
        max_value: Upper end of the source range.
        min_norm: Lower end of the target range.
        max_norm: Upper end of the target range.

    Returns:
        ``value`` expressed on the ``[min_norm, max_norm]`` scale.

    Note:
        No guard is applied for ``max_value == min_value``; the division
        then fails (or yields inf/nan for NumPy operands).
    """
    target_span = max_norm - min_norm
    # Fractional position of `value` inside the source range.
    relative_position = (value - min_value) / (max_value - min_value)
    return (target_span * relative_position) + min_norm
def outlier_fix(data, _min, _max):
    """Clamp every entry of ``data`` into ``[_min, _max]`` in place.

    Args:
        data: A pandas Series; it is modified in place.
        _min: Lower bound; entries below it are replaced by ``_min``.
        _max: Upper bound; entries above it are replaced by ``_max``.

    Returns:
        The same ``data`` object that was passed in, after clamping.

    Note:
        NaN entries compare false against both bounds and are therefore
        left untouched.
    """
    # Vectorized boolean masks replace the per-element Python loop. The upper
    # bound is applied first and the lower bound second, which is the same
    # order the element-wise checks used.
    data[(data > _max).to_numpy()] = _max
    data[(data < _min).to_numpy()] = _min
    return data
def createpositiveandnegativelist(listtocreate):
    """Split an iterable of numbers into non-positive and positive values.

    Args:
        listtocreate: Iterable of values comparable with ``0``.

    Returns:
        A ``(l_negative, l_positive)`` tuple of lists. Zero is placed in the
        first list. Values that are neither ``<= 0`` nor ``> 0`` (such as
        NaN) appear in neither list. Input order is preserved.
    """
    # Materialise once so one-shot iterables can be scanned for both lists.
    items = list(listtocreate)
    non_positive = [item for item in items if item <= 0]
    strictly_positive = [item for item in items if item > 0]
    return non_positive, strictly_positive
def calculatemean(listtocalculate):
    """Return the arithmetic mean of a sized collection of numbers.

    Args:
        listtocalculate: A collection supporting ``sum()`` and ``len()``.

    Returns:
        The sum of the elements divided by their count.

    Raises:
        ZeroDivisionError: If the collection is empty.
    """
    total = sum(listtocalculate)
    count = len(listtocalculate)
    return total / count
def plotboundedCI(s, mu, sigma, lists, confidence=0.68):
    """Plot a histogram of ``s`` with a Gaussian overlay and filter ``lists``.

    A new figure is opened, ``s`` is drawn as a 30-bin density histogram, the
    normal density with mean ``mu`` and standard deviation ``sigma`` is
    overlaid in red, and the two bounds of the central interval are drawn as
    vertical line segments. The figure is not shown here; the caller decides
    when to display it.

    Args:
        s: Samples to draw in the histogram.
        mu: Mean of the Gaussian.
        sigma: Standard deviation of the Gaussian.
        lists: Iterable of values to filter against the interval.
        confidence: Probability mass of the central interval. Defaults to
            ``0.68``, the level that was previously hard-coded.

    Returns:
        A list of the values from ``lists`` lying strictly inside the
        interval, in their original order.
    """
    plt.figure()

    # Only the bin edges are needed; the counts and patches are discarded.
    _, bins, _ = plt.hist(s, 30, density=True)
    gaussian_pdf = (
        1 / (sigma * np.sqrt(2 * np.pi))
        * np.exp(-(bins - mu) ** 2 / (2 * sigma ** 2))
    )
    plt.plot(bins, gaussian_pdf, linewidth=2, color='r')

    # Central interval of the normal distribution at the requested level.
    lower, upper = scipy.stats.norm.interval(confidence, loc=mu, scale=sigma)

    plt.title(
        "Gaussian {:g}% Confidence Interval".format(confidence * 100),
        fontsize=12, color='black', loc='left', style='italic',
    )
    # Left and right interval bounds, drawn from y=0 to y=3.
    plt.plot([lower, lower], [0, 3], [upper, upper], [0, 3], marker='o')

    # Keep only values strictly inside the interval.
    return [value for value in lists if lower < value < upper]
# Work on a copy so that `df` itself is not modified.
df_orig = df.copy()
# Treat both +inf and -inf entries as missing values.
df_orig[(df_orig == np.inf) | (df_orig == -np.inf)] = np.nan
def miss_contain_cycles(data, cycle_length=480):
    """Return the indices of the cycles that contain missing values.

    ``data`` is split along its first axis into consecutive chunks of
    ``cycle_length`` rows. A chunk counts as containing missing data when any
    of its entries is ``+inf``, ``-inf`` or NaN.

    Args:
        data: Numeric array-like (NumPy array, pandas Series or DataFrame).
        cycle_length: Number of rows per cycle. Defaults to ``480``.

    Returns:
        A list of zero-based cycle indices, in ascending order.

    Raises:
        ValueError: If ``cycle_length`` is not positive.
    """
    if cycle_length <= 0:
        raise ValueError("cycle_length must be a positive integer")

    # Positional slicing on an ndarray works the same for every input type.
    values = np.asarray(data)
    # np.isfinite is False exactly for +inf, -inf and NaN.
    non_finite = ~np.isfinite(values)

    # True division before ceil, so a trailing partial cycle is inspected too
    # (floor division here would silently drop it).
    n_cycles = math.ceil(values.shape[0] / cycle_length)

    return [
        cycle
        for cycle in range(n_cycles)
        if non_finite[cycle * cycle_length:(cycle + 1) * cycle_length].any()
    ]
def missing_stats(data):
    """Summarise the missing or non-finite entries found in ``data``.

    Args:
        data: Numeric array-like accepted by ``np.isnan`` and by
            ``miss_contain_cycles``.

    Returns:
        A 4-tuple ``(inf_stats, minus_inf_stats, nan_stats, miss_cycles)``:
        the ``np.sum`` of entries equal to ``+inf``, of entries equal to
        ``-inf``, of NaN entries, and the result of
        ``miss_contain_cycles(data)``.
    """
    return (
        np.sum(data == np.inf),
        np.sum(data == -np.inf),
        np.sum(np.isnan(data)),
        miss_contain_cycles(data),
    )
# Load the measurements; the CSV file has no header row.
dft = pd.read_csv('me_300_SOF.csv', header=None)
# NOTE(review): `df_plot` is not defined in the visible code, while `dft` is
# what gets loaded above — presumably `df_plot` is derived from `dft`; confirm.
df_plot.columns = ['A', 'B', 'T', 'S', 'C', 'Cycle']

# One scatter panel per parameter, stacked vertically and coloured by value.
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(20, 10), squeeze=False)
for row, column in enumerate(['A', 'B', 'T']):
    panel = ax[row, 0]
    df_plot.plot.scatter(ax=panel, alpha=0.8, x='Cycle', y=column, colormap='coolwarm', c=column)
    # Title and y-label come from the plotted column, so the third panel now
    # reads 'T Vs Cycle' rather than the mismatched 'C Vs Cycle'.
    panel.set_title('{} Vs Cycle'.format(column), fontweight='bold', fontsize=14)
    panel.set_ylabel(column)

plt.suptitle('Exploratory Data Analysis (EDA) ', color='yellow', backgroundcolor='black', fontsize=15, fontweight='bold')
plt.subplots_adjust(top=0.9, bottom=0.07, left=0.06, right=0.96, hspace=0.4, wspace=0.2)
plt.show()
任何帮助将不胜感激!