FindingData

CV for Multilabel classification

# ignore warnings
import warnings
warnings.filterwarnings('ignore')
# importing relevant packages
import pandas as pd
# !pip install iterative-stratification
# stratified kfold for multilabel classification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn import tree
from sklearn import metrics
def kfolds(df, n_splits=5, random_state=None):
    """Assign a stratified fold id to every row and save the result.

    The rows of ``df`` are shuffled, a ``kfold`` column is added, and each
    row receives the index of the fold in which it serves as validation
    data. Keeping the fold id in the data makes it easy to filter rows for
    training and validation, and keeps the splits consistent for any model
    that reads the saved file. The result is written to
    ``train_folds.csv``; the dataframe passed in is not modified.

    Args:
        df: Dataframe containing an ``MSSubClass`` column plus the
            column(s) used as stratification targets.
        n_splits: Number of folds to create.
        random_state: Optional seed for the shuffle. ``None`` (the
            default) gives a different row order on every call.
    """
    # Shuffle first: ``sample`` returns a new dataframe, so the "kfold"
    # column added below never touches the caller's object (which may
    # itself be a slice of a larger dataframe).
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Placeholder fold id; every row is overwritten in the loop below.
    df["kfold"] = -1

    # Everything except MSSubClass is handed to the stratifier as targets.
    # NOTE(review): this intentionally still includes the placeholder
    # "kfold" column; the stratifier appears to require a 2-D target with
    # more than one column, so confirm before removing it from `targets`.
    targets = df.drop("MSSubClass", axis=1).values

    mskf = MultilabelStratifiedKFold(n_splits=n_splits)

    # The index was reset above, so the positional validation indices
    # returned by split() are also valid labels for .loc.
    for fold, (trn_, val_) in enumerate(mskf.split(X=df, y=targets)):
        df.loc[val_, "kfold"] = fold

    # Save the dataframe together with its fold assignments.
    df.to_csv("train_folds.csv", index=False)
def run(fold):
    """Train and score a decision tree on one cross-validation fold.

    Reads ``train_folds.csv``, trains a ``DecisionTreeClassifier`` on all
    rows whose ``kfold`` value differs from ``fold``, predicts the rows
    whose ``kfold`` equals ``fold`` and prints the resulting accuracy.

    Args:
        fold: Fold id to hold out for validation.
    """
    # Read the data together with its fold assignments.
    df = pd.read_csv("train_folds.csv")

    # Training rows: every fold except the requested one.
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # Validation rows: only the requested fold.
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # "SaleCondition" is the target. "kfold" is only split bookkeeping and
    # must not be used as a feature: in the validation rows it always
    # equals `fold`, a value that never occurs in the training rows.
    non_feature_columns = ["SaleCondition", "kfold"]

    X_train = df_train.drop(non_feature_columns, axis=1).values
    y_train = df_train.SaleCondition.values

    X_valid = df_valid.drop(non_feature_columns, axis=1).values
    y_valid = df_valid.SaleCondition.values

    # Fit the model on the training data.
    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    # Predict the validation samples and report the accuracy.
    y_pred = clf.predict(X_valid)
    accuracy = metrics.accuracy_score(y_valid, y_pred)
    print(f"Fold={fold}, Accuracy={accuracy}")
    
if __name__ == '__main__':
    # For simplicity only two columns are kept:
    #   independent feature : "MSSubClass"
    #   dependent feature   : "SaleCondition"
    columns = ['MSSubClass', 'SaleCondition']
    df = pd.read_csv("train.csv")[columns]

    # Create the fold file once, then evaluate each of the five folds.
    kfolds(df)
    for fold in range(5):
        run(fold=fold)

Fold=0, Accuracy=0.821917808219178
Fold=1, Accuracy=0.797945205479452
Fold=2, Accuracy=0.8356164383561644
Fold=3, Accuracy=0.8184931506849316
Fold=4, Accuracy=0.8287671232876712

Edit this page on GitHub

CV for Multilabel classification

CV for Multilabel classification

Recent Posts