MLOps
The problem is that every step (cleaning, imputation, encoding, feature engineering, etc.) is done separately, so if a new test sample is given, one cannot directly make a prediction and has to carry out every step all over again. To solve this, I am going to create a 'preprocessor' class with a transform method that does everything I have done until now, and make a pipeline with this preprocessor as the first step and the trained model clf as the second step.
Simply calling predict on the test sample with this pipeline will do all cleaning, imputing, encoding, etc, as well as prediction with the estimator. Thus, one can simply deploy this pipeline at the end.
class Preprocessor():
    """Turn raw Titanic passenger rows into the feature frame fed to the model.

    The class exposes the ``fit``/``transform`` pair that a scikit-learn
    ``Pipeline`` expects from an intermediate step.  It keeps no fitted state:
    every statistic it needs is either hard-coded below or recomputed from the
    frame passed to ``transform``.
    """

    # Honorifics that collapse to one canonical title regardless of sex.
    _TITLE_ALIASES = {
        'Mlle': 'Miss', 'Ms': 'Miss',
        'Lady': 'Mrs', 'Mme': 'Mrs', 'the Countess': 'Mrs', 'Dona': 'Mrs',
        'Sir': 'Mr', 'Don': 'Mr',
        'Jonkheer': 'Master',
    }
    # Professional/military titles: mapped to 'Mr' or 'Mrs' from the Sex column.
    _SEX_DEPENDENT_TITLES = ('Dr', 'Rev', 'Major', 'Col', 'Capt')
    # Fill values for missing Fare / Age.
    # NOTE(review): presumably per-group statistics of the training set - confirm.
    _FARE_BY_PCLASS = {3: 13.3, 2: 21.2, 1: 87.5}
    _AGE_BY_TITLE = {'Mr': 32.8, 'Mrs': 37, 'Miss': 21.8, 'Master': 6.1}

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from ``X``.

        Present only so the object satisfies the fit/transform protocol that
        scikit-learn pipelines require of intermediate steps.
        """
        return self

    def transform(self, df):
        """Clean, impute, encode and feature-engineer a frame of passengers.

        Parameters
        ----------
        df : pandas.DataFrame
            Raw rows with at least the columns Name, Sex, Age, Pclass, Fare,
            Ticket and Embarked.  PassengerId and Cabin are dropped when
            present.  The frame is NOT modified; a copy is transformed.

        Returns
        -------
        pandas.DataFrame
            The remaining input columns plus ``Ticket_Group``, ``female``,
            ``male``, ``Sex_Age`` and ``Embarked_Pclass``.
        """
        # Work on a copy so the caller's frame keeps its original columns.
        df = df.copy()
        df = df.drop(columns=['PassengerId', 'Cabin'], errors='ignore')

        # --- cleaning: reduce the honorific in "Surname, Title. Given" to a
        # small set of canonical titles.
        title = df['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
        df = df.drop(columns=['Name'])
        title = title.replace(self._TITLE_ALIASES)
        ambiguous = title.isin(self._SEX_DEPENDENT_TITLES)
        title = title.mask(ambiguous & (df['Sex'] == 'male'), 'Mr')
        title = title.mask(ambiguous & (df['Sex'] == 'female'), 'Mrs')

        # --- imputation
        # ``mode()`` returns a Series; passing it to ``fillna`` would align on
        # the index and fill at most the row labelled 0, so use the scalar.
        embarked_mode = df['Embarked'].mode()
        if not embarked_mode.empty:
            df['Embarked'] = df['Embarked'].fillna(embarked_mode.iloc[0])
        df['Fare'] = df['Fare'].fillna(df['Pclass'].map(self._FARE_BY_PCLASS))
        df['Age'] = df['Age'].fillna(title.map(self._AGE_BY_TITLE))
        df['Age'] = df['Age'].fillna(df['Age'].mean())
        # Last resort for anything still missing: copy from neighbouring rows.
        df = df.bfill(axis='rows').ffill(axis='rows')

        # --- encoding
        # Number of passengers in this frame sharing the same ticket.
        df['Ticket_Group'] = df.groupby('Ticket')['Ticket'].transform('count')
        df = df.drop(columns=['Ticket'])
        # Reindex to a fixed column set so a batch lacking a category (for
        # example a single passenger) still yields the same feature columns.
        sex_flags = pd.get_dummies(df['Sex']).reindex(
            columns=['female', 'male'], fill_value=0)
        port_s = pd.get_dummies(df['Embarked']).reindex(columns=['S'], fill_value=0)
        df = pd.concat(
            [df.drop(columns=['Sex', 'Embarked']), sex_flags, port_s], axis=1)

        # --- feature engineering
        # NOTE(review): both KMeans models are refit on whatever frame is
        # passed in, so cluster ids depend on the batch and KMeans needs at
        # least 6 rows.  Kept as-is to preserve existing behaviour.
        sex_age = df[['male', 'female', 'Age']].copy()
        age = sex_age['Age']
        sex_age['Age'] = (age - age.min()) / (age.max() - age.min())  # min-max scale
        df['Sex_Age'] = KMeans(
            n_clusters=6, n_init=6, random_state=1).fit_predict(sex_age)
        df['Embarked_Pclass'] = KMeans(
            n_clusters=6, n_init=6, random_state=1).fit_predict(df[['Pclass', 'S']])
        # 'S' was only needed as clustering input.
        return df.drop(columns=['S'])
from sklearn.pipeline import make_pipeline
# Chain the preprocessing step with the classifier `clf` (described above as the
# already-trained model) so raw rows can be passed straight to `predict`.
pipe = make_pipeline(Preprocessor(), clf)
# Load the raw Kaggle Titanic test set.
test = pd.read_csv('/kaggle/input/titanic/test.csv')
# One call: the pipeline applies Preprocessor.transform, then clf.predict.
pred = pipe.predict(test)