data<-read.csv("Dataset2.csv",header=T,sep=";",dec=',',row.names=NULL) head(data) str(data) #Excluding high cardinality attributes data<-data[,-c(1,11,12)] str(data) #Adjusting data type data$Pricelist<-as.factor(data$Pricelist) data$Churn<-as.factor(data$Churn) data$Experiment<-as.factor(data$Experiment) data$Sex<-as.factor(data$Sex) str(data) #Dealing with missing values na_percentage<-replicate(ncol(data),0) for (i in 1:ncol(data)) { na_percentage[i]<-sum(is.na(data[,i]))/nrow(data) } na_logical<-na_percentage>0.05 sum(na_logical) data<-data[-which(is.na(data),arr.ind=T)[,1],] sum(is.na(data)) rownames(data)<-c(1:nrow(data)) #Visualizing distributions of categorical variables and relationships with regards to the target variable data_cat<-data[,c(1,3,6,7)] #Selecting categorical variables (and the target variable) library('ggplot2') #Visualizing Pricelist ggplot(data_cat, aes(x=Pricelist)) + geom_bar() ggplot(data_cat, aes(x=Pricelist,color=Churn)) + geom_bar() #Calculating average Churn rates for individual Pricelist levels avg_churn_offline<-(nrow(data_cat[data_cat$Pricelist=='Offline'& data_cat$Churn==1,]))/(nrow(data_cat[data_cat$Pricelist=='Offline',])) avg_churn_online<-(nrow(data_cat[data_cat$Pricelist=='Online'& data_cat$Churn==1,]))/(nrow(data_cat[data_cat$Pricelist=='Online',])) Pricelist_avg_churn<-data.frame(name=c('Offline','Online'),value=c(avg_churn_offline,avg_churn_online)) ggplot(Pricelist_avg_churn,aes(name,value))+geom_bar(stat="identity")+geom_text(aes(label = signif(value)), nudge_y = 0.1)+labs(y= 'Average Churn Rate', x = 'Pricelist') #Visualizing Experiment ggplot(data_cat, aes(x=Experiment)) + geom_bar() ggplot(data_cat, aes(x=Experiment,color=Churn)) + geom_bar() #Calculating average Churn rates for individual Experiment levels avg_churn_CG<-(nrow(data_cat[data_cat$Experiment=='CG'& data_cat$Churn==1,]))/(nrow(data_cat[data_cat$Experiment=='CG',])) avg_churn_TG<-(nrow(data_cat[data_cat$Experiment=='TG'& data_cat$Churn==1,]))/(nrow(data_cat[data_cat$Experiment=='TG',])) Experiment_avg_churn<-data.frame(name=c('CG','TG'),value=c(avg_churn_CG,avg_churn_TG)) ggplot(Experiment_avg_churn,aes(name,value))+geom_bar(stat="identity")+geom_text(aes(label = signif(value)), nudge_y = 0.1)+labs(y= 'Average Churn Rate', x = 'Experiment') #Visualizing Sex ggplot(data_cat, aes(x=Sex)) + geom_bar() ggplot(data_cat, aes(x=Sex,color=Churn)) + geom_bar() #Calculating average Churn rates for individual Experiment levels avg_churn_M<-(nrow(data_cat[data_cat$Sex=='M'& data_cat$Churn==1,]))/(nrow(data_cat[data_cat$Sex=='M',])) avg_churn_F<-(nrow(data_cat[data_cat$Sex=='F'& data_cat$Churn==1,]))/(nrow(data_cat[data_cat$Sex=='F',])) avg_churn_L<-(nrow(data_cat[data_cat$Sex=='L'& data_cat$Churn==1,]))/(nrow(data_cat[data_cat$Sex=='L',])) Sex_avg_churn<-data.frame(name=c('M','F','L'),value=c(avg_churn_M,avg_churn_F,avg_churn_L)) ggplot(Sex_avg_churn,aes(name,value))+geom_bar(stat="identity")+geom_text(aes(label = signif(value)), nudge_y = 0.1)+labs(y= 'Average Churn Rate', x = 'Sex') #Visualizing distributions of quantitative variables and relationships with regards to the target variable data_num<-data[,c(2,3,4,5,6,8,9,10,11,12)] #Selecting quantitative variables (and the target variable) #Building correlation matrix data_num1<-data_num for (j in 1:ncol(data_num)) { data_num1[,j]<-as.numeric(data_num[,j]) } cor(data_num1) #Visualizing Length.of.contract hist(data_num$Length.of.contract) #Plotting Churn against Length.of.contract plot(data_num$Length.of.contract,data_num$Churn) 
#Visualizing Consumption
hist(data_num$Consumption)
#Plotting Churn against Consumption
plot(data_num$Consumption,data_num$Churn)

#Visualizing Advance.payments
hist(data_num$Advance.payments)
#Plotting Churn against Advance.payments
plot(data_num$Advance.payments,data_num$Churn)

#Visualizing No.of.cons.points
hist(data_num$No.of.cons.points)
#Plotting Churn against No.of.cons.points
plot(data_num$No.of.cons.points,data_num$Churn)

#Visualizing Time.w.the.comp
hist(data_num$Time.w.the.comp)
#Plotting Churn against Time.w.the.comp
plot(data_num$Time.w.the.comp,data_num$Churn)

#Visualizing Town.population
hist(data_num$Town.population)
#Plotting Churn against Town.population
plot(data_num$Town.population,data_num$Churn)

#Visualizing Avg.temp.in.reg
hist(data_num$Avg.temp.in.reg)
#Plotting Churn against Avg.temp.in.reg
plot(data_num$Avg.temp.in.reg,data_num$Churn)

#Visualizing Avg.wage.in.reg
hist(data_num$Avg.wage.in.reg)
#Plotting Churn against Avg.wage.in.reg
plot(data_num$Avg.wage.in.reg,data_num$Churn)

#Log-transformation
data$Advance.payments<-log(data$Advance.payments+0.05)
#hist(data$Advance.payments)
colnames(data)[5]="logAP"
data$No.of.cons.points<-log(data$No.of.cons.points+0.05)
#hist(data$No.of.cons.points)
colnames(data)[8]="logNCP"
data$Time.w.the.comp<-log(data$Time.w.the.comp+0.05)
#hist(data$Time.w.the.comp)
colnames(data)[9]="logTwC"
data$Town.population<-log(data$Town.population+0.05)
#hist(data$Town.population)
colnames(data)[10]="logTP"
data$Avg.wage.in.reg<-log(data$Avg.wage.in.reg)
#hist(data$Avg.wage.in.reg)
colnames(data)[12]="logAW"
str(data)

#Dummy encoding
library('caret')
dmy<-dummyVars(" ~ .", data = data, fullRank = T)
data <- data.frame(predict(dmy, newdata = data))
str(data)
colnames(data)[3]="Churn"
data$Churn<-as.factor(data$Churn)

library('tidymodels')

# Create data split object
set.seed(7)
metrics <- metric_set(roc_auc)
data_split <- initial_split(data, prop = 0.75, strata = Churn)
# Create the training data
data_training <- data_split %>% training()
# Create the test data
data_test <- data_split %>% testing()

# Check the average Churn rates
mean(as.numeric(data_training$Churn)-1)
mean(as.numeric(data_test$Churn)-1)

# Check for correlated predictors
data_training %>%
  # Select numeric columns
  select_if(is.numeric) %>%
  # Calculate correlation matrix
  cor()

# Specify a logistic regression model
logistic_model <- logistic_reg() %>%
  # Set the engine
  set_engine('glm') %>%
  # Set the mode
  set_mode('classification')
# Print the model specification
logistic_model

#Specify recipe
logistic_recipe<-recipe(Churn~., data = data_training)

# Create a workflow
logistic_wkfl <- workflow() %>%
  # Include the model object
  add_model(logistic_model) %>%
  # Include the recipe object
  add_recipe(logistic_recipe)
# View workflow specification
logistic_wkfl

# Train the workflow
logistic_wkfl_fit <- logistic_wkfl %>%
  last_fit(split = data_split, metrics = metrics)
# Calculate performance metrics on test data
logistic_wkfl_fit %>% collect_metrics()

#Obtaining predictions
logistic_wkfl_fit$.predictions

# Combine test set results
logit_results <- data_test %>%
  select(Churn) %>%
  bind_cols(logistic_wkfl_fit$.predictions[[1]][,1])
# View results tibble
logit_results

# Calculate metrics across thresholds
logistic_threshold_df <- logit_results %>%
  roc_curve(truth = Churn, .pred_0)
# View results
logistic_threshold_df
# Plot ROC curve
logistic_threshold_df %>% autoplot()
# Calculate ROC AUC
roc_auc(logit_results, truth = Churn, .pred_0)
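#--- Optional sketch (assumption, not part of the original analysis): class predictions at a
#0.50 cut-off derived from the .pred_0 probabilities already stored in logit_results, with a
#confusion matrix and accuracy from yardstick (loaded via tidymodels), as an extra sanity
#check alongside ROC AUC.
logit_check <- logit_results %>%
  mutate(.pred_class = factor(ifelse(.pred_0 >= 0.5, 0, 1), levels = levels(Churn)))
conf_mat(logit_check, truth = Churn, estimate = .pred_class)
accuracy(logit_check, truth = Churn, estimate = .pred_class)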
#Calculate Top Decile Lift
library('lift')
TopDecileLift(1-logit_results$.pred_0,logit_results$Churn)

#Obtaining parameters
logit_params<-tidy(logistic_wkfl_fit$.workflow[[1]])

library('tune')
# Specify a decision tree model
dt_model <- decision_tree(cost_complexity = tune(), tree_depth = tune(), min_n = tune()) %>%
  # Set the engine
  set_engine('rpart') %>%
  # Set the mode
  set_mode('classification')
# Print the model specification
dt_model

#Specify recipe
dt_recipe<-recipe(Churn~., data = data_training)

# Create a workflow
dt_wkfl <- workflow() %>%
  # Include the model object
  add_model(dt_model) %>%
  # Include the recipe object
  add_recipe(dt_recipe)
# View workflow specification
dt_wkfl

# Hyperparameter tuning with grid search
set.seed(6)
dt_grid <- grid_random(parameters(dt_model), size = 5)
fold <- validation_split(data_training, strata = Churn)
# Hyperparameter tuning
dt_tuning <- dt_wkfl %>%
  tune_grid(resamples = fold, grid = dt_grid, metrics = metrics)
# View results
dt_tuning %>% collect_metrics()

# Select based on best performance
best_dt_model <- dt_tuning %>%
  # Choose the best model based on roc_auc
  select_best(metric = 'roc_auc')

# Finalize your workflow
final_dt_wkfl <- dt_wkfl %>%
  finalize_workflow(best_dt_model)
final_dt_wkfl

# Train the workflow
dt_wkfl_fit <- final_dt_wkfl %>%
  last_fit(split = data_split, metrics = metrics)
# Calculate performance metrics on test data
dt_wkfl_fit %>% collect_metrics()

#Visualising the tree
library('rpart.plot')
dt_wkfl_fit %>% extract_fit_engine() %>% prp()

#Obtaining predictions
dt_wkfl_fit$.predictions

# Combine test set results
dt_results <- data_test %>%
  select(Churn) %>%
  bind_cols(dt_wkfl_fit$.predictions[[1]][,1])
# View results tibble
dt_results

# Calculate metrics across thresholds
dt_threshold_df <- dt_results %>%
  roc_curve(truth = Churn, .pred_0)
# View results
dt_threshold_df
# Plot ROC curve
dt_threshold_df %>% autoplot()
# Calculate ROC AUC
roc_auc(dt_results, truth = Churn, .pred_0)
#Calculate Top Decile Lift
TopDecileLift(1-dt_results$.pred_0,dt_results$Churn)

#####LOGIT LEAF MODEL#####
# Specify a decision tree model
llm_dt_model <- decision_tree(cost_complexity = 0.0000000000002, tree_depth = 3, min_n = 20) %>%
  # Set the engine
  set_engine('rpart') %>%
  # Set the mode
  set_mode('classification')
# Print the model specification
llm_dt_model

# Fit to training data
llm_dt_fit <- llm_dt_model %>%
  fit(Churn~., data = data_training)
# Print model fit object
llm_dt_fit

# Predict outcome categories
class_preds <- predict(llm_dt_fit, new_data = data_training, type = 'class')
# Obtain estimated probabilities for each outcome value
prob_preds <- predict(llm_dt_fit, new_data = data_training, type = 'prob')

#Results
results <- data_training %>%
  select(Churn) %>%
  bind_cols(class_preds, prob_preds)
# View results tibble
results

#Combining data with predictions
data_pred<-cbind(data_training, results$.pred_0)

#Creating groups
training_data_grouped<-data_pred %>% group_by(results$.pred_0)
groups<-group_split(training_data_grouped)
n_groups<-length(groups)
groups_filtered<-vector(mode = "list", length = n_groups)
for(l in 1:n_groups){
  groups_filtered[[l]]<-Filter(function(x)(length(unique(x))>1), groups[[l]])
}

# Specify a logistic regression model
llm_logit_model <- logistic_reg() %>%
  # Set the engine
  set_engine('glm') %>%
  # Set the mode
  set_mode('classification')
# Print the model specification
llm_logit_model

#Fitting to groups
group_fitts<-vector(mode = "list", length = n_groups)
for(m in 1:n_groups) {
  group_fitts[[m]] <- llm_logit_model %>% fit(Churn~., data = groups_filtered[[m]])
}
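#--- Optional sketch (not part of the original analysis): a quick look at the leaf groups
#produced by the tree (number of training observations and average churn rate per group),
#which helps judge how much data each per-leaf logistic regression is fitted on.
sapply(groups, nrow)
sapply(groups, function(g) mean(g$Churn == "1"))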
#Extracting parameters
param_list<-vector(mode = "list", length = n_groups)
for(n in 1:n_groups) {
  param_list[[n]] <- na.omit(data.frame(group_fitts[[n]]$fit$coefficients))
}
param_list

###Applying the model to the test set###
test_prob_preds<-predict(llm_dt_fit, new_data = data_test, type = 'prob')

#Results
results_test <- data_test %>%
  select(Churn) %>%
  bind_cols(test_prob_preds)
# View results tibble
results_test

#Combining data with predictions
test_data_pred<-cbind(data_test, results_test$.pred_0)

#Creating groups
test_data_grouped<-test_data_pred %>% group_by(results_test$.pred_0)
test_groups<-group_split(test_data_grouped)

#Predicting probabilities for test groups
LLM_class_pred<-vector(mode="list", length = n_groups)
LLM_prob_pred<-vector(mode="list", length = n_groups)
for(o in 1:n_groups){
  LLM_prob_pred[[o]]<-predict(group_fitts[[o]], new_data = test_groups[[o]], type = 'prob')
}

#For groups too small for a stable regression (fewer than 50 training observations),
#falling back to the decision tree's leaf probability instead of the regression prediction
for(p in 1:n_groups){
  if(nrow(groups[[p]])<50){
    LLM_prob_pred[[p]]$.pred_0<-vector('numeric',nrow(test_groups[[p]]))+groups[[p]]$`results$.pred_0`[1]
  }
}

#Creating result groups
LLM_res<-vector(mode="list", length = n_groups)
for(q in 1:n_groups){
  LLM_res[[q]] <- test_groups[[q]] %>%
    select(Churn) %>%
    bind_cols(LLM_prob_pred[[q]]$.pred_0)
}

#Combining result groups
LLM_results<-LLM_res[[1]]
for(r in 2:n_groups){
  LLM_results<-rbind(LLM_results, LLM_res[[r]])
}

# Calculate metrics across thresholds (the combined probability column is named ...2 by bind_cols)
LLM_threshold_df <- LLM_results %>%
  roc_curve(truth = Churn, ...2)
# View results
LLM_threshold_df
# Calculate ROC AUC
ROC_AUC<-roc_auc(LLM_results, truth = Churn, ...2)
#Calculate Top Decile Lift
TDL<-TopDecileLift(1-LLM_results$...2,LLM_results$Churn)
# Plot ROC curve
ROC<-LLM_threshold_df %>% autoplot()

#Collecting outputs
metr<-list(ROC_AUC,TDL,ROC,llm_dt_fit)
metr[[1]]
metr[[2]]
metr[[3]]
#Depicting the DT part
metr[[4]] %>% extract_fit_engine() %>% prp()
#Regression parameters
tidy(group_fitts[[2]])
tidy(group_fitts[[4]])
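#--- Optional sketch (not part of the original analysis): collecting the test-set ROC AUC of the
#three models into one tibble for a side-by-side comparison; it only reuses objects created above.
model_comparison <- tibble(
  model = c('Logistic regression', 'Decision tree', 'Logit leaf model'),
  roc_auc = c(roc_auc(logit_results, truth = Churn, .pred_0)$.estimate,
              roc_auc(dt_results, truth = Churn, .pred_0)$.estimate,
              ROC_AUC$.estimate)
)
model_comparison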