要使用数字列的平均值和非数字列的最常见值,您可以执行类似的操作。您可以进一步区分整数和浮点数。我想使用整数列的中位数可能是有意义的。
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin
class DataFrameImputer(TransformerMixin):
def __init__(self):
"""Impute missing values.
Columns of dtype object are imputed with the most frequent value
in column.
Columns of other types are imputed with mean of column.
"""
def fit(self, X, y=None):
self.fill = pd.Series([X[c].value_counts().index[0]
if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
index=X.columns)
return self
def transform(self, X, y=None):
return X.fillna(self.fill)
data = [
['a', 1, 2],
['b', 1, 1],
['b', 2, 2],
[np.nan, np.nan, np.nan]
]
X = pd.DataFrame(data)
xt = DataFrameImputer().fit_transform(X)
print('before...')
print(X)
print('after...')
print(xt)
打印,
before...
0 1 2
0 a 1 2
1 b 1 1
2 b 2 2
3 NaN NaN NaN
after...
0 1 2
0 a 1.000000 2.000000
1 b 1.000000 1.000000
2 b 2.000000 2.000000
3 b 1.333333 1.666667