好的,既然所有可能的取值都是预先已知的,下面给出一种略显取巧(hacky)的做法。
import numpy as np
import pandas as pd
# One-time setup: list every value each categorical column can ever take.
# Further categorical variables with a known, closed set of values can be
# declared here in the same way.
# NOTE(review): 'MS' is listed twice below -- possibly a typo for another
# state code; confirm against the real data.
all_possible_states = [
    'AL', 'MS', 'MS', 'OK', 'VA', 'NJ', 'NM', 'CD', 'WY',
]
all_possible_cities = [
    'A', 'B', 'C', 'D', 'E', 'G', 'Z', 'F',
]
# Declare our transformer class
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
class MyOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode a single categorical column with a known vocabulary.

    The internal encoders are fitted once, at construction time, on the
    complete list of values the column can take. Batches that arrive later
    are therefore always mapped onto the same fixed set of output columns,
    even when a batch contains only some of the possible values.

    Parameters
    ----------
    all_possible_values : sequence
        Every value the column may contain.

    Attributes
    ----------
    le : LabelEncoder
        Maps each raw value to an integer code.
    ohe : OneHotEncoder
        Expands the integer codes into indicator columns.
    """

    def __init__(self, all_possible_values):
        # Store the constructor argument under its own name: BaseEstimator
        # derives get_params(), repr() and clone() from attributes named
        # after the __init__ parameters, and fails without this attribute.
        self.all_possible_values = all_possible_values
        self.le = LabelEncoder()
        self.ohe = OneHotEncoder()
        # Fitting happens here (not in fit) so that existing callers can
        # keep calling transform() directly after construction.
        codes = self.le.fit_transform(all_possible_values)
        self.ohe.fit(codes.reshape(-1, 1))

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``.

        The encoder is already fitted in ``__init__``; this method exists so
        that the inherited ``fit_transform`` and pipeline-style callers,
        which invoke ``fit`` before ``transform``, work.
        """
        return self

    def transform(self, X, y=None):
        """Return the dense one-hot encoding of the values in ``X``.

        Parameters
        ----------
        X : 1-D array-like
            Values of the column; each must be one of the values given to
            the constructor.
        y : ignored

        Returns
        -------
        2-D array with one row per element of ``X``.
        """
        codes = self.le.transform(X)
        return self.ohe.transform(codes.reshape(-1, 1)).toarray()
# Build one encoder per categorical column, each constructed from the full
# list of values that column can take.
# Register every other categorical column here in the same way.
encoders = {
    'state': MyOneHotEncoder(all_possible_states),
    'city': MyOneHotEncoder(all_possible_cities),
}
# Entry point applied to every incoming batch of data.
def encode(df):
    """Encode one batch of rows into a single 2-D array.

    The 'state' and 'city' columns of ``df`` are run through their
    pre-built encoders from the module-level ``encoders`` dict; the 'age'
    column is appended unchanged. Columns appear in the result in that
    order: state indicators, city indicators, age.
    """
    state_onehot = encoders['state'].transform(df['state'])
    city_onehot = encoders['city'].transform(df['city'])
    # Columns that need no transformation are passed through as-is;
    # add any further such columns to this selection.
    untouched = df[['age']]
    return np.hstack((state_onehot, city_onehot, untouched))
# Testing: first batch of incoming data.
day1_data = pd.DataFrame(dict(
    state=['MS', 'OK', 'VA', 'NJ', 'NM'],
    city=['C', 'B', 'G', 'Z', 'F'],
    age=[27, 19, 63, 40, 93],
))
print(encode(day1_data))
[[ 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
0. 0. 27.]
[ 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0.
0. 0. 19.]
[ 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
1. 0. 63.]
[ 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 1. 40.]
[ 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1.
0. 0. 93.]]
# Second batch: contains values absent from the first batch, yet is encoded
# by the same pre-built encoders.
day2_data = pd.DataFrame(dict(
    state=['AL', 'WY', 'VA'],
    city=['A', 'B', 'E'],
    age=[42, 52, 73],
))
print(encode(day2_data))
[[ 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
0. 0. 42.]
[ 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0.
0. 0. 52.]
[ 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0.
0. 0. 73.]]
请仔细阅读代码中的注释,如果仍有任何问题,请随时问我。