MLOps

The problem is that every step (cleaning, imputation, encoding, feature engineering, etc.) is done separately, so if a new test sample is given, one cannot directly make a prediction and has to carry out every step all over again. To solve this, I am going to create a 'preprocessor' class with a transform method that does everything I have done until now, and build a pipeline with this preprocessor as the first step and the trained model clf as the second step.

Simply calling predict on the test sample with this pipeline will do all the cleaning, imputing, encoding, etc., as well as the prediction with the estimator. Thus, one can simply deploy this pipeline at the end.

class Preprocessor:
    """Turn raw Titanic passenger rows into the feature frame fed to the model.

    The object follows the scikit-learn transformer protocol (``fit`` +
    ``transform``) so it can be used as an intermediate pipeline step.
    ``transform`` never modifies the DataFrame it is given.

    NOTE(review): the Fare/Age fill values below are hard-coded constants,
    presumably statistics taken from the training data -- confirm.
    NOTE(review): the two KMeans models are re-fitted on every ``transform``
    call, so the cluster ids depend on the batch being transformed and a batch
    must contain at least as many rows as clusters (6). Making them consistent
    with training would require fitting them once on the training data.
    """

    # Honorifics folded into one of the four main titles regardless of sex.
    _TITLE_ALIASES = {
        'Mlle': 'Miss', 'Ms': 'Miss',
        'Lady': 'Mrs', 'Mme': 'Mrs', 'the Countess': 'Mrs', 'Dona': 'Mrs',
        'Sir': 'Mr', 'Don': 'Mr',
        'Jonkheer': 'Master',
    }
    # Honorifics mapped to 'Mr' for male and 'Mrs' for female passengers.
    _SEX_DEPENDENT_TITLES = ('Dr', 'Rev', 'Major', 'Col', 'Capt')
    # Fill values for a missing Fare, keyed by Pclass.
    _FARE_BY_PCLASS = {1: 87.5, 2: 21.2, 3: 13.3}
    # Fill values for a missing Age, keyed by normalised honorific.
    _AGE_BY_TITLE = {'Mr': 32.8, 'Mrs': 37.0, 'Miss': 21.8, 'Master': 6.1}

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``.

        Present only because pipeline steps are expected to expose ``fit``;
        this preprocessor learns nothing from the data.
        """
        return self

    def transform(self, df):
        """Clean, impute, encode and feature-engineer raw passenger rows.

        Parameters
        ----------
        df : pandas.DataFrame
            Raw rows with at least the columns Name, Sex, Age, Pclass, Fare,
            Ticket and Embarked. PassengerId and Cabin are dropped if present.

        Returns
        -------
        pandas.DataFrame
            A new frame; the input is left untouched.
        """
        df = self._clean(df)
        df = self._impute(df)
        df = self._encode(df)
        return self._engineer(df)

    def _clean(self, df):
        """Drop unused columns and derive a normalised ``Honorific`` from Name."""
        # drop() returns a new frame, so the caller's DataFrame is never mutated.
        df = df.drop(columns=['PassengerId', 'Cabin'], errors='ignore')

        # "Braund, Mr. Owen Harris" -> "Mr"
        title = df['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
        df['Honorific'] = title.replace(self._TITLE_ALIASES)

        ambiguous = df['Honorific'].isin(self._SEX_DEPENDENT_TITLES)
        df.loc[ambiguous & (df['Sex'] == 'male'), 'Honorific'] = 'Mr'
        df.loc[ambiguous & (df['Sex'] == 'female'), 'Honorific'] = 'Mrs'

        return df.drop(columns=['Name'])

    def _impute(self, df):
        """Fill missing Embarked, Fare and Age values, then drop ``Honorific``."""
        df = df.copy()

        # mode() returns a Series; passing it to fillna() directly would align
        # on the index and only fill row label 0, so take the scalar instead.
        # With no observed port at all the gap is left for the bfill/ffill below.
        port_mode = df['Embarked'].mode()
        if not port_mode.empty:
            df['Embarked'] = df['Embarked'].fillna(port_mode.iloc[0])

        df['Fare'] = df['Fare'].fillna(df['Pclass'].map(self._FARE_BY_PCLASS))

        df['Age'] = df['Age'].fillna(df['Honorific'].map(self._AGE_BY_TITLE))
        # Passengers with any other honorific fall back to the batch mean.
        df['Age'] = df['Age'].fillna(df['Age'].mean())

        # Last resort for anything still missing: copy from neighbouring rows.
        df = df.bfill(axis='rows').ffill(axis='rows')
        return df.drop(columns=['Honorific'])

    def _encode(self, df):
        """Add ``Ticket_Group`` and the ``female``/``male``/``S`` indicator columns."""
        df = df.copy()
        # Number of passengers in this batch sharing the same ticket.
        df['Ticket_Group'] = df.groupby('Ticket')['Ticket'].transform('count')

        # reindex() pins the indicator columns so a batch that lacks a category
        # (e.g. a single passenger) still yields the same columns; of the
        # Embarked indicators only 'S' is used downstream.
        sex_dummies = pd.get_dummies(df['Sex']).reindex(
            columns=['female', 'male'], fill_value=0)
        port_dummies = pd.get_dummies(df['Embarked']).reindex(
            columns=['S'], fill_value=0)

        df = df.join([sex_dummies, port_dummies])
        return df.drop(columns=['Ticket', 'Sex', 'Embarked'])

    def _engineer(self, df):
        """Add the ``Sex_Age`` and ``Embarked_Pclass`` cluster-id features."""
        df = df.copy()

        # Min-max scale Age within the batch; a constant Age would otherwise
        # divide by zero and hand NaNs to KMeans.
        age = df['Age']
        age_span = age.max() - age.min()
        if age_span > 0:
            scaled_age = (age - age.min()) / age_span
        else:
            scaled_age = age * 0.0

        sex_age = df[['male', 'female']].copy()
        sex_age['Age'] = scaled_age
        kmeans_1 = KMeans(n_clusters=6, n_init=6, random_state=1)
        df['Sex_Age'] = kmeans_1.fit_predict(sex_age)

        kmeans_2 = KMeans(n_clusters=6, n_init=6, random_state=1)
        df['Embarked_Pclass'] = kmeans_2.fit_predict(df[['Pclass', 'S']])

        return df.drop(columns=['S'])

from sklearn.pipeline import make_pipeline

# Chain the custom preprocessing step with the classifier `clf`, which is
# expected to be defined (and already trained) before this point, so that raw
# passenger rows can be passed straight to predict().
pipe = make_pipeline(Preprocessor(), clf)

# Load the raw Kaggle test set and predict on it through the pipeline.
test = pd.read_csv('/kaggle/input/titanic/test.csv')
pred = pipe.predict(test)