data<-read.csv("Dataset2.csv",header=T,sep=";",dec=',',row.names=NULL) head(data) str(data) #Excluding high cardinality attributes data<-data[,-c(1,11,12)] str(data) #Adjusting data type data$Pricelist<-as.factor(data$Pricelist) data$Churn<-as.factor(data$Churn) data$Experiment<-as.factor(data$Experiment) data$Sex<-as.factor(data$Sex) str(data) #Dealing with missing values na_percentage<-replicate(ncol(data),0) for (i in 1:ncol(data)) { na_percentage[i]<-sum(is.na(data[,i]))/nrow(data) } na_logical<-na_percentage>0.05 sum(na_logical) data<-data[-which(is.na(data),arr.ind=T)[,1],] sum(is.na(data)) rownames(data)<-c(1:nrow(data)) #Visualizing distributions of categorical variables and relationships with regards to the target variable data_cat<-data[,c(1,3,6,7)] #Selecting categorical variables (and the target variable) library('ggplot2') #Visualizing Pricelist ggplot(data_cat, aes(x=Pricelist)) + geom_bar() ggplot(data_cat, aes(x=Pricelist,color=Churn)) + geom_bar() #Calculating average Churn rates for individual Pricelist levels avg_churn_offline<-(nrow(data_cat[data_cat$Pricelist=='Offline'& data_cat$Churn==1,]))/(nrow(data_cat[data_cat$Pricelist=='Offline',])) avg_churn_online<-(nrow(data_cat[data_cat$Pricelist=='Online'& data_cat$Churn==1,]))/(nrow(data_cat[data_cat$Pricelist=='Online',])) Pricelist_avg_churn<-data.frame(name=c('Offline','Online'),value=c(avg_churn_offline,avg_churn_online)) ggplot(Pricelist_avg_churn,aes(name,value))+geom_bar(stat="identity")+geom_text(aes(label = signif(value)), nudge_y = 0.1)+labs(y= 'Average Churn Rate', x = 'Pricelist') #Visualizing Experiment ggplot(data_cat, aes(x=Experiment)) + geom_bar() ggplot(data_cat, aes(x=Experiment,color=Churn)) + geom_bar() #Calculating average Churn rates for individual Experiment levels avg_churn_CG<-(nrow(data_cat[data_cat$Experiment=='CG'& data_cat$Churn==1,]))/(nrow(data_cat[data_cat$Experiment=='CG',])) avg_churn_TG<-(nrow(data_cat[data_cat$Experiment=='TG'& data_cat$Churn==1,]))/(nrow(data_cat[data_cat$Experiment=='TG',])) Experiment_avg_churn<-data.frame(name=c('CG','TG'),value=c(avg_churn_CG,avg_churn_TG)) ggplot(Experiment_avg_churn,aes(name,value))+geom_bar(stat="identity")+geom_text(aes(label = signif(value)), nudge_y = 0.1)+labs(y= 'Average Churn Rate', x = 'Experiment') #Visualizing Sex ggplot(data_cat, aes(x=Sex)) + geom_bar() ggplot(data_cat, aes(x=Sex,color=Churn)) + geom_bar() #Calculating average Churn rates for individual Experiment levels avg_churn_M<-(nrow(data_cat[data_cat$Sex=='M'& data_cat$Churn==1,]))/(nrow(data_cat[data_cat$Sex=='M',])) avg_churn_F<-(nrow(data_cat[data_cat$Sex=='F'& data_cat$Churn==1,]))/(nrow(data_cat[data_cat$Sex=='F',])) avg_churn_L<-(nrow(data_cat[data_cat$Sex=='L'& data_cat$Churn==1,]))/(nrow(data_cat[data_cat$Sex=='L',])) Sex_avg_churn<-data.frame(name=c('M','F','L'),value=c(avg_churn_M,avg_churn_F,avg_churn_L)) ggplot(Sex_avg_churn,aes(name,value))+geom_bar(stat="identity")+geom_text(aes(label = signif(value)), nudge_y = 0.1)+labs(y= 'Average Churn Rate', x = 'Sex') #Visualizing distributions of quantitative variables and relationships with regards to the target variable data_num<-data[,c(2,3,4,5,6,8,9,10,11,12)] #Selecting quantitative variables (and the target variable) #Building correlation matrix data_num1<-data_num for (j in 1:ncol(data_num)) { data_num1[,j]<-as.numeric(data_num[,j]) } cor(data_num1) #Visualizing Length.of.contract hist(data_num$Length.of.contract) #Plotting Churn against Length.of.contract plot(data_num$Length.of.contract,data_num$Churn) 
#Visualizing Consumption
hist(data_num$Consumption)
#Plotting Churn against Consumption
plot(data_num$Consumption,data_num$Churn)

#Visualizing Advance.payments
hist(data_num$Advance.payments)
#Plotting Churn against Advance.payments
plot(data_num$Advance.payments,data_num$Churn)

#Visualizing No.of.cons.points
hist(data_num$No.of.cons.points)
#Plotting Churn against No.of.cons.points
plot(data_num$No.of.cons.points,data_num$Churn)

#Visualizing Time.w.the.comp
hist(data_num$Time.w.the.comp)
#Plotting Churn against Time.w.the.comp
plot(data_num$Time.w.the.comp,data_num$Churn)

#Visualizing Town.population
hist(data_num$Town.population)
#Plotting Churn against Town.population
plot(data_num$Town.population,data_num$Churn)

#Visualizing Avg.temp.in.reg
hist(data_num$Avg.temp.in.reg)
#Plotting Churn against Avg.temp.in.reg
plot(data_num$Avg.temp.in.reg,data_num$Churn)

#Visualizing Avg.wage.in.reg
hist(data_num$Avg.wage.in.reg)
#Plotting Churn against Avg.wage.in.reg
plot(data_num$Avg.wage.in.reg,data_num$Churn)

#Log-transformation
data$Advance.payments<-log(data$Advance.payments+0.05)
#hist(data$Advance.payments)
colnames(data)[5]="logAP"
data$No.of.cons.points<-log(data$No.of.cons.points+0.05)
#hist(data$No.of.cons.points)
colnames(data)[8]="logNCP"
data$Time.w.the.comp<-log(data$Time.w.the.comp+0.05)
#hist(data$Time.w.the.comp)
colnames(data)[9]="logTwC"
data$Town.population<-log(data$Town.population+0.05)
#hist(data$Town.population)
colnames(data)[10]="logTP"
data$Avg.wage.in.reg<-log(data$Avg.wage.in.reg)
#hist(data$Avg.wage.in.reg)
colnames(data)[12]="logAW"
str(data)

#Dummy encoding
library('caret')
dmy<-dummyVars(" ~ .", data = data, fullRank = T)
data <- data.frame(predict(dmy, newdata = data))
str(data)
colnames(data)[3]="Churn"
data$Churn<-as.factor(data$Churn)

library('tidymodels')

# Create data split object
set.seed(7)
metrics <- metric_set(roc_auc)
data_split <- initial_split(data, prop = 0.75, strata = Churn)
# Create the training data
data_training <- data_split %>% training()
# Create the test data
data_test <- data_split %>% testing()

# Check the average Churn rates
mean(as.numeric(data_training$Churn)-1)
mean(as.numeric(data_test$Churn)-1)

# Check for correlated predictors
data_training %>%
  # Select numeric columns
  select_if(is.numeric) %>%
  # Calculate correlation matrix
  cor()

# Specify a logistic regression model
logistic_model <- logistic_reg() %>%
  # Set the engine
  set_engine('glm') %>%
  # Set the mode
  set_mode('classification')
# Print the model specification
logistic_model

#Specify recipe
logistic_recipe<-recipe(Churn~., data = data_training)

# Create a workflow
logistic_wkfl <- workflow() %>%
  # Include the model object
  add_model(logistic_model) %>%
  # Include the recipe object
  add_recipe(logistic_recipe)
# View workflow specification
logistic_wkfl

# Train the workflow
logistic_wkfl_fit <- logistic_wkfl %>%
  last_fit(split = data_split, metrics = metrics)
# Calculate performance metrics on test data
logistic_wkfl_fit %>% collect_metrics()

#Obtaining predictions
logistic_wkfl_fit$.predictions

# Combine test set results
logit_results <- data_test %>%
  select(Churn) %>%
  bind_cols(logistic_wkfl_fit$.predictions[[1]][,1])
# View results tibble
logit_results

# Calculate metrics across thresholds
logistic_threshold_df <- logit_results %>%
  roc_curve(truth = Churn, .pred_0)
# View results
logistic_threshold_df
# Plot ROC curve
logistic_threshold_df %>% autoplot()
# Calculate ROC AUC
roc_auc(logit_results, truth = Churn, .pred_0)
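#--- Optional sketch (assumption, not part of the original analysis): class predictions at a
#0.50 cut-off derived from the .pred_0 probabilities already stored in logit_results, with a
#confusion matrix and accuracy from yardstick (loaded via tidymodels), as an extra sanity
#check alongside ROC AUC.
logit_check <- logit_results %>%
  mutate(.pred_class = factor(ifelse(.pred_0 >= 0.5, 0, 1), levels = levels(Churn)))
conf_mat(logit_check, truth = Churn, estimate = .pred_class)
accuracy(logit_check, truth = Churn, estimate = .pred_class)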
#Calculate Top Decile Lift
library('lift')
TopDecileLift(1-logit_results$.pred_0,logit_results$Churn)

#Obtaining parameters
logit_params<-tidy(logistic_wkfl_fit$.workflow[[1]])

library('tune')
# Specify a decision tree model
dt_model <- decision_tree(cost_complexity = tune(), tree_depth = tune(), min_n = tune()) %>%
  # Set the engine
  set_engine('rpart') %>%
  # Set the mode
  set_mode('classification')
# Print the model specification
dt_model

#Specify recipe
dt_recipe<-recipe(Churn~., data = data_training)

# Create a workflow
dt_wkfl <- workflow() %>%
  # Include the model object
  add_model(dt_model) %>%
  # Include the recipe object
  add_recipe(dt_recipe)
# View workflow specification
dt_wkfl

# Hyperparameter tuning with grid search
set.seed(6)
dt_grid <- grid_random(parameters(dt_model), size = 5)
fold <- validation_split(data_training, strata = Churn)
# Hyperparameter tuning
dt_tuning <- dt_wkfl %>%
  tune_grid(resamples = fold, grid = dt_grid, metrics = metrics)
# View results
dt_tuning %>% collect_metrics()

# Select based on best performance
best_dt_model <- dt_tuning %>%
  # Choose the best model based on roc_auc
  select_best(metric = 'roc_auc')

# Finalize your workflow
final_dt_wkfl <- dt_wkfl %>%
  finalize_workflow(best_dt_model)
final_dt_wkfl

# Train the workflow
dt_wkfl_fit <- final_dt_wkfl %>%
  last_fit(split = data_split, metrics = metrics)
# Calculate performance metrics on test data
dt_wkfl_fit %>% collect_metrics()

#Visualising the tree
library('rpart.plot')
dt_wkfl_fit %>% extract_fit_engine() %>% prp()

#Obtaining predictions
dt_wkfl_fit$.predictions

# Combine test set results
dt_results <- data_test %>%
  select(Churn) %>%
  bind_cols(dt_wkfl_fit$.predictions[[1]][,1])
# View results tibble
dt_results

# Calculate metrics across thresholds
dt_threshold_df <- dt_results %>%
  roc_curve(truth = Churn, .pred_0)
# View results
dt_threshold_df
# Plot ROC curve
dt_threshold_df %>% autoplot()
# Calculate ROC AUC
roc_auc(dt_results, truth = Churn, .pred_0)
#Calculate Top Decile Lift
TopDecileLift(1-dt_results$.pred_0,dt_results$Churn)

#####LOGIT LEAF MODEL#####
# Specify a decision tree model
llm_dt_model <- decision_tree(cost_complexity = 0.0000000000002, tree_depth = 3, min_n = 20) %>%
  # Set the engine
  set_engine('rpart') %>%
  # Set the mode
  set_mode('classification')
# Print the model specification
llm_dt_model

# Fit to training data
llm_dt_fit <- llm_dt_model %>%
  fit(Churn~., data = data_training)
# Print model fit object
llm_dt_fit

# Predict outcome categories
class_preds <- predict(llm_dt_fit, new_data = data_training, type = 'class')
# Obtain estimated probabilities for each outcome value
prob_preds <- predict(llm_dt_fit, new_data = data_training, type = 'prob')

#Results
results <- data_training %>%
  select(Churn) %>%
  bind_cols(class_preds, prob_preds)
# View results tibble
results

#Combining data with predictions
data_pred<-cbind(data_training, results$.pred_0)

#Creating groups
training_data_grouped<-data_pred %>% group_by(results$.pred_0)
groups<-group_split(training_data_grouped)
n_groups<-length(groups)
groups_filtered<-vector(mode = "list", length = n_groups)
for(l in 1:n_groups){
  groups_filtered[[l]]<-Filter(function(x)(length(unique(x))>1), groups[[l]])
}

# Specify a logistic regression model
llm_logit_model <- logistic_reg() %>%
  # Set the engine
  set_engine('glm') %>%
  # Set the mode
  set_mode('classification')
# Print the model specification
llm_logit_model

#Fitting to groups
group_fitts<-vector(mode = "list", length = n_groups)
for(m in 1:n_groups) {
  group_fitts[[m]] <- llm_logit_model %>% fit(Churn~., data = groups_filtered[[m]])
}
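#--- Optional sketch (not part of the original analysis): a quick look at the leaf groups
#produced by the tree (number of training observations and average churn rate per group),
#which helps judge how much data each per-leaf logistic regression is fitted on.
sapply(groups, nrow)
sapply(groups, function(g) mean(g$Churn == "1"))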
#Extracting parameters
param_list<-vector(mode = "list", length = n_groups)
for(n in 1:n_groups) {
  param_list[[n]] <- na.omit(data.frame(group_fitts[[n]]$fit$coefficients))
}
param_list

###Applying the model to the test set###
test_prob_preds<-predict(llm_dt_fit, new_data = data_test, type = 'prob')

#Results
results_test <- data_test %>%
  select(Churn) %>%
  bind_cols(test_prob_preds)
# View results tibble
results_test

#Combining data with predictions
test_data_pred<-cbind(data_test, results_test$.pred_0)

#Creating groups
test_data_grouped<-test_data_pred %>% group_by(results_test$.pred_0)
test_groups<-group_split(test_data_grouped)

#Predicting probabilities for test groups
LLM_class_pred<-vector(mode="list", length = n_groups)
LLM_prob_pred<-vector(mode="list", length = n_groups)
for(o in 1:n_groups){
  LLM_prob_pred[[o]]<-predict(group_fitts[[o]], new_data = test_groups[[o]], type = 'prob')
}

#For groups too small for a stable regression (fewer than 50 training observations),
#falling back to the decision tree's leaf probability instead of the regression prediction
for(p in 1:n_groups){
  if(nrow(groups[[p]])<50){
    LLM_prob_pred[[p]]$.pred_0<-vector('numeric',nrow(test_groups[[p]]))+groups[[p]]$`results$.pred_0`[1]
  }
}

#Creating result groups
LLM_res<-vector(mode="list", length = n_groups)
for(q in 1:n_groups){
  LLM_res[[q]] <- test_groups[[q]] %>%
    select(Churn) %>%
    bind_cols(LLM_prob_pred[[q]]$.pred_0)
}

#Combining result groups
LLM_results<-LLM_res[[1]]
for(r in 2:n_groups){
  LLM_results<-rbind(LLM_results, LLM_res[[r]])
}

# Calculate metrics across thresholds (the combined probability column is named ...2 by bind_cols)
LLM_threshold_df <- LLM_results %>%
  roc_curve(truth = Churn, ...2)
# View results
LLM_threshold_df
# Calculate ROC AUC
ROC_AUC<-roc_auc(LLM_results, truth = Churn, ...2)
#Calculate Top Decile Lift
TDL<-TopDecileLift(1-LLM_results$...2,LLM_results$Churn)
# Plot ROC curve
ROC<-LLM_threshold_df %>% autoplot()

#Collecting outputs
metr<-list(ROC_AUC,TDL,ROC,llm_dt_fit)
metr[[1]]
metr[[2]]
metr[[3]]
#Depicting the DT part
metr[[4]] %>% extract_fit_engine() %>% prp()
#Regression parameters
tidy(group_fitts[[2]])
tidy(group_fitts[[4]])
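#--- Optional sketch (not part of the original analysis): collecting the test-set ROC AUC of the
#three models into one tibble for a side-by-side comparison; it only reuses objects created above.
model_comparison <- tibble(
  model = c('Logistic regression', 'Decision tree', 'Logit leaf model'),
  roc_auc = c(roc_auc(logit_results, truth = Churn, .pred_0)$.estimate,
              roc_auc(dt_results, truth = Churn, .pred_0)$.estimate,
              ROC_AUC$.estimate)
)
model_comparison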