In [None]:
# This R environment comes with many helpful analytics packages installed
# It is defined by the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats
# For example, here's a helpful package to load

library(tidyverse) # metapackage of all tidyverse packages
library(ggplot2) # plots
library(patchwork) # two plots side by side
library(lubridate) # dates
library(janitor) 
library(hms) # time of day
library(performanceEstimation) #smote
library(sampling)
library(caret) #ml
library(embed)
library(fastDummies) # 1hot
library(corrplot) #correlation plot
library(neuralnet) #neural network
library(caTools) # train/test split
library(pscl) #McFaden R2
library(car) # VIF values
library(pROC) # ROC AUC
library(survey) # Wald Test for logit
library(randomForest)
library(recipes)
library(xgboost)
library(ParBayesianOptimization)
library(nnet)
library(doParallel)
library(h2o)
library(MLmetrics)
library(stargazer)
library(plotROC)
library(knitr)
library(kableExtra)
library(microbenchmark)
library(broom)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

list.files(path = "../input")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# IMPORT DATA
#
# Raw files come from the Kaggle dataset
# https://www.kaggle.com/datasets/ealtman2019/credit-card-transactions
# Three tables are read: the transactions, the cards and the users.

users <- read_csv(file = "/kaggle/input/credit-card-transactions/sd254_users.csv")
cards <- read_csv(file = "/kaggle/input/credit-card-transactions/sd254_cards.csv")
transactions <- read_csv(
    file = "/kaggle/input/credit-card-transactions/credit_card_transactions-ibm_v2.csv"
)

# Optional quick inspection of the raw transactions:
# str(transactions)
# summary(transactions)

In [None]:
#DATA CLEANING AND JOINING

#TRANSACTIONS
# Keep transactions from 2018 onwards with a positive amount, turn the
# "$"-prefixed amount text into numbers, convert the categorical columns to
# factors and drop the columns that are not used downstream.
transprep <- transactions %>%
    clean_names() %>%
    rename('fraud' = 'is_fraud') %>%
    filter(year >= 2018) %>%
    mutate(across(starts_with("amount"), ~ as.numeric(gsub("\\$", "", .x)))) %>%
    mutate(across(all_of(c('use_chip', 'fraud', 'mcc')), as.factor)) %>%
    mutate(zip = NULL, 'merchant_city' = NULL, errors = NULL, 'merchant_name' = NULL) %>%
    filter(amount > 0)

transprep %>%
    summary()


#CARDS
# Only the join keys (user, card) and the two card attributes are kept.
cardsprep <- cards %>%
    clean_names() %>%
    mutate(across(all_of(c('card_type', 'has_chip')), as.factor)) %>%
    rename('card' = 'card_index') %>%
    select(user, card, 'card_type', 'has_chip')

cardsprep %>%
    head()

#INNER JOIN
trncrdprep <- transprep %>%
    inner_join(cardsprep, by = c("user", "card"))

trncrdprep %>%
    head()

#USERS
# The users table is given a zero-based id taken from the row position, which
# is then used as the join key. Deriving it from the row number (instead of a
# fixed 0..1999 sequence) yields the same ids for a 2000-row table and does
# not fail when the table has a different number of rows.
usersprep <- users %>%
    clean_names() %>%
    select('birth_year', gender, state) %>%
    mutate(user = row_number() - 1) %>%
    mutate(across(all_of('gender'), as.factor))

#INNER JOIN
data_joined <- usersprep %>%
    inner_join(trncrdprep, by = c("user"))

# Save only after the join: data_joined does not exist before this point.
save(data_joined, file = "dataset_joined.RData")

data_joined %>%
    str()

#PREPARE DATA_JOINED
# Derived features:
#   same_state - 1 when the user's state equals the merchant's state
#                (a missing comparison counts as "not the same")
#   age        - transaction year minus birth year
#   daytime    - 1 for transactions strictly between 10:00 and 20:00
#   date       - calendar date assembled from year/month/day
data <-
data_joined %>%
    mutate(user = NULL, card = NULL) %>%
    mutate(same_state = state == merchant_state) %>%
    replace_na(list(same_state = FALSE)) %>%
    mutate(age = year - birth_year) %>%
    mutate(birth_year = NULL, state = NULL) %>%
    #mutate(across(all_of('merchant_state'), as.factor)) %>%
    mutate(daytime = ifelse(time > as_hms("10:00:00") &  time < as_hms("20:00:00"), 1, 0)) %>%
    mutate(date = make_date(year, month, day)) %>%
    mutate(year = NULL, month = NULL, day = NULL, 'merchant_state' = NULL)

data$same_state <- as.numeric(data$same_state)


# Recode the two has_chip factor levels (in their existing level order) to
# 0/1 and store the column as numeric.
levels(data$has_chip) <- c(0, 1)
data$has_chip <- as.numeric(as.character(data$has_chip))    

In [None]:
#UNDERSAMPLING
# Stratified simple random sampling without replacement on `fraud`.
# The `size` vector is positional: strata() applies it in the order in which
# the strata occur in the data, and its documentation asks for the data to be
# sorted by the stratification variable. Sorting first pins 120000 to the
# first factor level and 4450 to the second.
# NOTE(review): assumes the levels of `fraud` are ordered genuine, fraud
# (e.g. "No", "Yes") - confirm with levels(data$fraud).
set.seed(23)
data_sorted <- data %>%
    arrange(fraud)

sample_strata <- sampling::strata(data_sorted, "fraud", size = c(120000, 4450), method = "srswor")

# getdata() appends ID_unit, Prob and Stratum; they are dropped again.
sample <- (getdata(data_sorted, sample_strata)) %>%
    mutate(ID_unit = NULL, Prob = NULL, Stratum = NULL)


#LEARNED EMBEDDING
# Replace the mcc factor by a single learned embedding term, supervised by
# the fraud outcome.
set.seed(23)
fraud_rec <- recipe(fraud ~ ., data = sample) %>%
  step_embed(mcc, outcome = vars(fraud), num_terms = 1) %>%
    prep(training = sample)

sample_embedd <- bake(fraud_rec, new_data = NULL)

#1 HOT ENCODING
# The first dummy of every encoded factor is dropped (reference level).
data_hot <- dummy_cols(sample_embedd, select_columns = c('gender', 'card_type', 'use_chip'),
                   remove_selected_columns = TRUE,
                   remove_first_dummy = TRUE)

data_hot <- data_hot  %>% 
    mutate(date = NULL, time = NULL)

#SMOTE
set.seed(23)
data_smote <- smote(fraud ~ ., data_hot, perc.over = 6, k = 5, perc.under = 5)

# Recode the two fraud levels (in their existing order) to numeric 0/1.
levels(data_smote$fraud) <- c(0, 1)
data_smote$fraud <- as.numeric(as.character(data_smote$fraud))

save(data_smote, file = "data_smote.RData")

In [None]:
# DATA DESCRIPTION
# Load the previously stored objects and rebuild the descriptive version of
# data_joined: identifiers removed, same_state / age / date derived, the
# helper columns dropped, has_chip recoded to numeric 0/1 and fraud to logical.
load("/kaggle/input/diplomka-data-upravena/dataset_joined.RData")
load("/kaggle/input/diplomka-data-upravena/smote_data.RData")

data_joined <- data_joined %>%
    mutate(user = NULL, card = NULL) %>%
    mutate(
        # a missing comparison is treated as "different state"
        same_state = as.numeric(coalesce(state == merchant_state, FALSE)),
        age = year - birth_year,
        date = make_date(year, month, day)
    ) %>%
    mutate(
        birth_year = NULL, state = NULL,
        year = NULL, month = NULL, day = NULL, 'merchant_state' = NULL
    ) %>%
    mutate(
        # relabel the two existing levels in place, then convert the labels
        has_chip = as.numeric(as.character(`levels<-`(has_chip, c(0, 1)))),
        fraud = as.logical(`levels<-`(fraud, c(FALSE, TRUE)))
    )

In [None]:
#HISTOGRAMS OF AMOUNT
# One histogram of the transaction amount per class on a log10 x axis, with
# vertical lines and text labels for the mean (red) and median (black).
# NOTE(review): the filters compare fraud with the labels "Yes"/"No"; the
# preprocessing cell stores fraud as numeric 0/1 before saving, so confirm
# that the data_smote loaded here still carries the labels.
amount_fraud <- data_smote %>%
    filter(fraud == "Yes") %>%
    ggplot(aes(amount)) +
    geom_histogram(bins = 10) + 
    scale_x_log10() +
    #coord_cartesian(xlim = c(-2.5, 7.5)) +
    geom_vline(aes(xintercept= mean(amount)), color="red") +
    geom_text(aes(x=mean(amount) + 1000, label=paste0("Mean\n",round(mean(amount), 1)), y=8000)) +
    geom_vline(aes(xintercept= median(amount)), color="black") +
    geom_text(aes(x=median(amount) + 1000, label=paste0("Median\n",round(median(amount), 1)), y=4000)) +
    labs(x = 'Log of Amount', y="", title = "Fraudulent Transactions") +
    theme_bw()

#ggsave(filename = "amount_fraud.png")

amount_legit <- data_smote %>%
    filter(fraud == "No") %>%
    ggplot(aes(amount)) +
    geom_histogram(bins = 10) + 
    scale_x_log10() +
    #coord_cartesian(xlim = c(-2.5, 7.5)) +
    geom_vline(aes(xintercept= mean(amount)), color="red") +
    geom_text(aes(x=mean(amount) + 1000, label=paste0("Mean\n",round(mean(amount), 1)), y=30000)) +
    geom_vline(aes(xintercept= median(amount)), color="black") +
    geom_text(aes(x=median(amount) + 1000, label=paste0("Median\n",round(median(amount), 1)), y=18000)) +
    labs(x = 'Log of Amount', y="", title = "Genuine Transactions") +
    theme_bw()

amount_fraud
amount_legit

# plot_grid() and save_plot() come from cowplot, which is not attached by any
# library() call in this notebook, so they are called through the namespace.
amount <- cowplot::plot_grid(amount_fraud, amount_legit)
cowplot::save_plot(filename = "amount.png", amount, ncol = 2)

#ggsave(filename = "amount_no_fraud.png")

data_smote %>%
    select(amount) %>%
    summary()

In [None]:
#TRANSACTIONS BY TIME
# Work on a copy whose fraud column is logical; relabelling the levels
# requires fraud to be a two-level factor in the loaded data_smote.
data_smote_graph <- data_smote
levels(data_smote_graph$fraud) <- c(FALSE, TRUE)
data_smote_graph$fraud <- as.logical(data_smote_graph$fraud)
#summary(data_smote)

# Number of transactions per distinct time value, one scatter plot per class.
time_fraud <- data_smote_graph %>%
    filter(fraud == TRUE) %>%
    group_by(time) %>%
    summarise(n = n()) %>%
    ggplot(aes(time, n)) +
    geom_point() +
    labs(x = 'Time', y = 'Number of Transactions', title = 'Fraudulent Transactions by Time') +
    theme_bw()

time_legit <- data_smote_graph %>%
    filter(fraud == FALSE) %>%
    group_by(time) %>%
    summarise(n = n()) %>%
    ggplot(aes(time, n)) +
    geom_point() +
    labs(x = 'Time', y = 'Number of Transactions', title = 'Genuine Transactions by Time') +
    theme_bw()

time_fraud
time_legit
# cowplot is not attached in this notebook, so its functions are called
# through the namespace. The stray empty argument of the original
# save_plot() call has been removed.
time <- cowplot::plot_grid(time_legit, time_fraud)
cowplot::save_plot(filename = "time.png", time, ncol = 2)

#TRANSACTIONS BY AGE

age <- data_smote_graph %>%
    #filter(fraud == TRUE) %>%
    ggplot(aes(age)) +
    geom_histogram(bins = 15) +
    theme_bw() +
    labs(x = 'Age', y = 'Number of Transactions', title = 'Distribution of Age')

age
         
cowplot::save_plot(filename = "age.png", age, ncol = 1)

In [None]:
#NUMBER OF FRAUDS IN DATA
data_smote %>%
    filter(fraud == 'Yes') %>%
    summarise(n=n())

# Collapse data_smote_all to a single row of column sums (the counts of the
# 0/1 indicator columns), rounded to whole numbers, plus the total row count.
# NOTE(review): data_smote_all is not created anywhere in the visible part of
# this notebook; it must already exist (with all dummy columns) before this
# cell runs - confirm where it is built or loaded.
data_smote_all <- data_smote_all %>%
    mutate(fraud = NULL, time = NULL, amount = NULL, age = NULL) %>%
    # across() replaces the deprecated summarise_all(funs(sum))
    summarise(across(everything(), sum)) %>%
    # extra arguments to across() are deprecated, hence the explicit lambda
    mutate(across(where(is.numeric), ~ round(.x, 0))) %>%
    mutate(total = 164650) #add column with total number of rows in dataset
head(data_smote_all)

In [None]:
#HAS CHIP
# Bar chart of the number of rows with / without a chip, derived from the
# has_chip column sum and the total row count.
data_smote_all %>%
    select('has_chip', 'total') %>%
    mutate('no_chip' = total - has_chip) %>%
    mutate(total = NULL) %>%
    pivot_longer(cols = everything(), names_to = "chip", values_to = "count") %>%
    as.data.frame() %>%
    ggplot(aes(x = chip, y = count)) +
    geom_col() +
    theme_bw()

# The same split as proportions of the total, rounded to 4 decimals.
# The rounding uses a lambda because passing extra arguments through
# across() is deprecated.
data_smote_all %>%
    select('has_chip', 'total') %>%
    mutate('no_chip' = total - has_chip) %>%
    mutate(has_chip = has_chip / total, no_chip = no_chip / total, total = NULL) %>%
    mutate(across(everything(), ~ round(.x, 4)))

#GENDERS
# Bar chart of the row counts per gender, taken from the gender_* column sums.
gender <- data_smote_all %>%
    select(starts_with('gender')) %>%
    rename(male = gender_Male, female = gender_Female) %>%
    pivot_longer(cols = everything(), names_to = "gender", values_to = "count") %>%
    as.data.frame() %>%
    ggplot(aes(x = gender, y = count)) +
    geom_bar(stat = "identity") +
    theme_bw()

# The plot was assigned but never shown; display it like the other charts.
gender

# Gender shares of the total, rounded to 4 decimals. The rounding uses a
# lambda because passing extra arguments through across() is deprecated.
data_smote_all %>%
    select(starts_with('gender'), total) %>%
    rename(male = gender_Male, female = gender_Female) %>%
    mutate(male = male / total, female = female / total, total = NULL) %>%
    mutate(across(everything(), ~ round(.x, 4)))

#CARD TYPES
# Bar chart of the row counts per card type, taken from the card_type_*
# column sums.
cardtype <- data_smote_all %>%
    select(starts_with('card_type')) %>%
    pivot_longer(cols = everything(), names_to = "type", values_to = "count") %>%
    as.data.frame() %>%
    ggplot(aes(x = type, y = count)) +
    geom_col() +
    theme_bw()

cardtype
# save_plot() comes from cowplot, which is not attached in this notebook.
cowplot::save_plot(filename = "cardtype.png", cardtype, ncol = 1)

# Card type shares of the total, rounded to 4 decimals. The rounding uses a
# lambda because passing extra arguments through across() is deprecated.
data_smote_all %>%
    select(starts_with('card_type'), total) %>%
    rename(credit = 'card_type_Credit', debit = 'card_type_Debit', prepaid = 'card_type_Debit (Prepaid)') %>%
    mutate(credit = credit / total, debit = debit / total, prepaid = prepaid / total, total = NULL) %>%
    mutate(across(everything(), ~ round(.x, 4)))

#USE CHIP
# Bar chart of the row counts per transaction channel (chip / online /
# swipe), taken from the use_chip_* column sums.
chip <- data_smote_all %>%
    select(starts_with('use_chip')) %>%
    rename(chip = 'use_chip_Chip Transaction', online = 'use_chip_Online Transaction', swipe = 'use_chip_Swipe Transaction') %>%
    pivot_longer(cols = everything(), names_to = "chip", values_to = "count") %>%
    as.data.frame() %>%
    ggplot(aes(x = chip, y = count)) +
    geom_col() +
    theme_bw()

chip
# save_plot() comes from cowplot, which is not attached in this notebook.
cowplot::save_plot(filename = "chip.png", chip, ncol = 1)


# Channel shares of the total, rounded to 4 decimals. The rounding uses a
# lambda because passing extra arguments through across() is deprecated.
data_smote_all %>%
    select(starts_with('use_chip'), total) %>%
    rename(chip = 'use_chip_Chip Transaction', online = 'use_chip_Online Transaction', swipe = 'use_chip_Swipe Transaction') %>%
    mutate(chip = chip / total, online = online / total, swipe = swipe / total, total = NULL) %>%
    mutate(across(everything(), ~ round(.x, 4)))

#SAME STATE
# Bar chart of the number of rows where the user's and the merchant's state
# match ("yes") or differ ("no"), from the same_state column sum and total.
data_smote_all %>%
    select('same_state', 'total') %>%
    mutate('no' = total - same_state, total = NULL) %>%
    rename('yes' = 'same_state') %>%
    pivot_longer(cols = everything(), names_to = "same_state", values_to = "count") %>%
    as.data.frame() %>%
    ggplot(aes(x = same_state, y = count)) +
    geom_col() +
    theme_bw()

# The same split as proportions of the total, rounded to 4 decimals. The
# rounding uses a lambda because passing extra arguments through across() is
# deprecated.
data_smote_all %>%
    select('same_state', 'total') %>%
    mutate('no' = total - same_state) %>%
    mutate(no = no / total, yes = same_state / total, total = NULL, same_state = NULL) %>%
    mutate(across(everything(), ~ round(.x, 4)))

# FRAUD
# Class proportions of the fraud column, first for the SMOTE-balanced data and
# then for the original joined data, each shown as a table and as a bar chart.
fraudprop <- data_smote$fraud %>%
    table() %>%
    prop.table() %>%
    as.data.frame() %>%
    rename('Fraud' = 'Var1')

ggplot(data = fraudprop, mapping = aes(x = Fraud, y = Freq)) +
    geom_col() +
    labs(
        title = 'Proportion of Fraudulent Transactions',
        x = 'Fraud',
        y = 'Proportion'
    ) +
    theme_bw()

fraudprop

fraudprop_orig <- data_joined$fraud %>%
    table() %>%
    prop.table() %>%
    as.data.frame() %>%
    rename('Fraud' = 'Var1')

fraudprop_orig

ggplot(data = fraudprop_orig, mapping = aes(x = Fraud, y = Freq)) +
    geom_col() +
    labs(
        title = 'Proportion of Fraudulent Transactions',
        x = 'Fraud',
        y = 'Proportion'
    ) +
    theme_bw()

In [None]:
#MODELS

load("/kaggle/input/dataset-model/data.RData")
load("/kaggle/input/dataset-model/data_smote.RData")
#load("/kaggle/input/dataset-model/sample_all.RData")

corrplot(cor(data_smote), method = 'number')

#TRAIN/TEST SPLIT ON BALANCED DATA
# 70/30 split of the balanced data; sample.split() receives the fraud column
# as its label vector.
set.seed(23)

data_model <- data_smote %>%
    clean_names() %>%
    mutate(time = NULL)

split <- sample.split(data_model$fraud, SplitRatio = 0.7)
train  <- subset(data_model, split == TRUE)
test   <- subset(data_model, split == FALSE)

#DRAW ANOTHER TESTING SAMPLE FROM IMBALANCED DATA
# 50000 rows drawn at random from the full (imbalanced) data, then pushed
# through the same kind of preprocessing as the training data.
set.seed(23)
sample_all <- data[sample(nrow(data), size=50000), ]

# NOTE(review): the mcc embedding is re-learned here on the evaluation sample
# using its own fraud labels, instead of reusing the recipe fitted on the
# training data - confirm this is intended, as it can leak label information
# into the evaluation features.
fraud_all <- recipe(fraud ~ ., data = sample_all) %>%
  step_embed(mcc, outcome = vars(fraud), num_terms = 1) %>%
    prep(training = sample_all)

sample_all <- bake(fraud_all, new_data = NULL)

sample_all <- dummy_cols(sample_all, select_columns = c('gender', 'card_type', 'use_chip'),
                   remove_selected_columns = TRUE,
                   remove_first_dummy = TRUE)

# Recode the two fraud levels (in their existing order) to numeric 0/1.
levels(sample_all$fraud) <- c(0, 1)
sample_all$fraud <- as.numeric(as.character(sample_all$fraud))


# daytime must be defined exactly as in the training data (strictly between
# 10:00 and 20:00); the previous version only checked the lower bound, so the
# feature meant something different at evaluation time.
sample_all <- sample_all %>%
    clean_names() %>%
    mutate(daytime = ifelse(time > as_hms("10:00:00") & time < as_hms("20:00:00"), 1, 0)) %>%
    mutate(time = NULL, date = NULL)

save(sample_all, file = 'sample_all.RData')

In [None]:
#CONFUSION MATRIX PLOT FUNCTION
# Draws a 2x2 confusion matrix and a panel of summary statistics with base
# graphics on the current device.
#
# Arguments:
#   cm  object with the elements `table` (2x2 counts), `byClass` and
#       `overall` (named numeric vectors); in this notebook it is the result
#       of caret::confusionMatrix().
#
# Details:
#   The four counts are read from cm$table in column-major order and placed
#   so that columns are the actual class and rows the predicted class.
#   Elements 1, 2, 5 and 7 of cm$byClass and elements 1 and 2 of cm$overall
#   are printed under their own names (for a two-class caret result these are
#   presumably Sensitivity, Specificity, Precision, F1, Accuracy and Kappa -
#   verify against the caret version in use).
#
# Side effects:
#   Uses layout() and par(mar = ...) while drawing; both are reset when the
#   function exits, so later plots on the same device are not affected.
#
# Returns: `cm`, invisibly.
draw_confusion_matrix <- function(cm) {

  layout(matrix(c(1,1,2)))
  old_par <- par(mar=c(2,2,2,2))
  # restore the margins and go back to a single-figure layout on exit
  on.exit({
    par(old_par)
    layout(1)
  }, add = TRUE)

  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX', cex.main=2)

  # create the matrix 
  rect(150, 430, 240, 370, col='#3F97D0')
  text(195, 435, 'Genuine', cex=1.2)
  rect(250, 430, 340, 370, col='#F7AD50')
  text(295, 435, 'Fraudulent', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  rect(150, 305, 240, 365, col='#F7AD50')
  rect(250, 305, 340, 365, col='#3F97D0')
  text(140, 400, 'Genuine', cex=1.2, srt=90)
  text(140, 335, 'Fraudulent', cex=1.2, srt=90)

  # add in the cm results (column-major: first column top to bottom, then
  # the second column)
  res <- as.numeric(cm$table)
  text(195, 400, res[1], cex=1.6, font=2, col='white')
  text(195, 335, res[2], cex=1.6, font=2, col='white')
  text(295, 400, res[3], cex=1.6, font=2, col='white')
  text(295, 335, res[4], cex=1.6, font=2, col='white')

  # add in the specifics 
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  text(25, 85, names(cm$byClass[1]), cex=1.5, font=2)
  text(25, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.5)
  text(50, 85, names(cm$byClass[2]), cex=1.5, font=2)
  text(50, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.5)
  text(75, 85, names(cm$byClass[5]), cex=1.5, font=2)
  text(75, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.5)
 # text(80, 85, names(cm$byClass[6]), cex=1.5, font=2)
 # text(80, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.5)
  text(50, 35, names(cm$byClass[7]), cex=1.5, font=2)
  text(50, 20, round(as.numeric(cm$byClass[7]), 3), cex=1.5)

  # add in the accuracy information 
  text(25, 35, names(cm$overall[1]), cex=1.5, font=2)
  text(25, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
  text(75, 35, names(cm$overall[2]), cex=1.5, font=2)
  text(75, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)

  invisible(cm)
}  

In [None]:
#Logit
# Logistic regression of fraud on all remaining columns of the balanced
# training data.
set.seed(23)
logit_model <- glm(fraud ~ .,
                   family="binomial", data=train)
summary(logit_model)

stargazer(logit_model, type = 'latex', single.row = TRUE)


#Variable importance
logit_imp <- caret::varImp(logit_model)
ggplot(logit_imp, aes(x=reorder(rownames(logit_imp),Overall), y=Overall)) +
geom_point( color="blue", size=4, alpha=0.6)+
geom_segment( aes(x=rownames(logit_imp), xend=rownames(logit_imp), y=0, yend=Overall), 
color='skyblue') +
labs(title = 'Variable Importance in Logistic Regression', x = 'Variable', y = 'Overall Importance') +
theme_light() +
coord_flip() 

#ggsave('logit2_varimp.png')

pR2(logit_model)["McFadden"]

# Evaluate the logit model on the sample of the imbalanced dataset
# (result: low specificity). Predicted probabilities above 0.5 count as fraud.
logit_predict_all <- predict(logit_model, sample_all, type="response")
logitpred_all <- ifelse(logit_predict_all > 0.5, 1, 0)

logit_cm <- confusionMatrix(as.factor(logitpred_all), as.factor(sample_all$fraud), positive = '1')
png("logit2_cm.png", width = 590, height = 620, res = 110)
draw_confusion_matrix(logit_cm)
dev.off()

# Build the ROC curve once and derive the AUC from it. levels/direction are
# given explicitly (0 = control, 1 = case, higher score = case) so that pROC
# does not auto-detect the direction, which could hide an AUC below 0.5.
logit_roc <- roc(sample_all$fraud ~ logit_predict_all, levels = c(0, 1), direction = "<")
logit_auc <- round(auc(logit_roc), 4)

#create ROC plot (ggroc is exported by pROC, so `::` is sufficient)
pROC::ggroc(logit_roc) +
  ggtitle(paste0('Logistic Regression ROC Curve ', '(AUC = ', logit_auc, ')')) +
    geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="darkgrey", linetype="dashed") +
    theme_light()

#ggsave('logit2_auc.png')

# Prediction time benchmark: 100 repetitions, reported in seconds.
logit_benchmark <- microbenchmark(predict(logit_model, sample_all, type="response"), times = 100L, setup = set.seed(23))
summary(logit_benchmark, unit = 's')

In [None]:
# BOOTSTRAP LOGIT
# 1000 bootstrap replications of the logit fit; the summary, the confidence
# intervals and the histograms are shown, and the summary (without its first
# column) is rendered as a LaTeX table.
set.seed(23)
logit_bootstrap <- Boot(logit_model, R = 1000)

boot_summary <- summary(logit_bootstrap)
boot_summary
confint(logit_bootstrap)
hist(logit_bootstrap)

boot_table <- as.data.frame(boot_summary)
kable(boot_table[, -1], format = 'latex', booktabs = TRUE)
# save(logit_bootstrap, file = 'logit_bootstrap.RData')

In [None]:
#RANDOM FOREST

#RF h2o preparation
h2o.no_progress()
h2o.init(max_mem_size = "20g")

# convert the modelling data to h2o objects; the training response is turned
# into a factor so the forest is fitted as a classifier
train_h2o <- as.h2o(train)
train_h2o$fraud <- as.factor(train_h2o$fraud)
test_h2o <- as.h2o(test)
sample_h2o <- as.h2o(sample_all)

# 80/20 split of the H2O training frame into fitting and validation parts.
# They get their own names so that the data frame `train` is NOT overwritten
# by an H2OFrame: the neural network and XGBoost cells still use `train` as a
# regular data frame.
fraud_splits <- h2o.splitFrame(data =  train_h2o, ratios = 0.8, seed = 23)
rf_train <- fraud_splits[[1]]
rf_valid <- fraud_splits[[2]]

# set the response column
response <- "fraud"

# set the predictor names
predictors <- setdiff(colnames(rf_train), response)
#predictors <- c('same_state', 'mcc_embed_1', 'use_chip_online_transaction', 'gender_male', 'daytime', 'card_type_debit', 'amount', 'age')
n_features <- length(predictors)

#FINDING IDEAL PARAMETERS

# hyperparameter grid
h2o.rm('rf_grid')
hyper_grid <- list(
  mtries = seq(3, 7, by = 2),
  #min_rows = seq(1, 5, by = 2),
  max_depth = seq(30, 40, by = 5),
  sample_rate = seq(0.6, 1, by = 0.05)
)

# random grid search strategy
search_criteria <- list(
  strategy = "RandomDiscrete",
  stopping_metric = "aucpr",
  stopping_tolerance = 0.01,   # stop if improvement is < 1%
  stopping_rounds = 10,         # over the last 10 models
  max_runtime_secs = 60*15      # or stop search after 15 min.
)

# perform grid search 
random_grid <- h2o.grid(
  algorithm = "randomForest",
  grid_id = "rf_grid",
  x = predictors, 
  y = response, 
  training_frame = rf_train,
  validation_frame = rf_valid,
  hyper_params = hyper_grid,
  ntrees = 100,
  seed = 23,
  search_criteria = search_criteria
)

# collect the results and sort by our model performance metric 
# of choice
random_grid_perf <- h2o.getGrid(
  grid_id = "rf_grid", 
  sort_by = "aucpr", 
  decreasing = TRUE
)
print(random_grid_perf)

#FINAL RANDOM FOREST
h2o_rf <- h2o.randomForest(
    x = predictors, 
    y = response,
    training_frame = rf_train,
    validation_frame = rf_valid,
    nfolds = 5,
    ntrees = 1000,
    max_depth = 30,
    min_rows = 1,
    mtries = 3,
    sample_rate = 0.9,
    seed = 23
)

#h2o_rf

# Evaluate on the sample of the imbalanced data.
h2o_predict <- as.data.frame(h2o.predict(h2o_rf, sample_h2o)$predict)
h2o_predict <- h2o_predict[,1]
rf_cm <- confusionMatrix(h2o_predict, as.factor(sample_all$fraud), positive = '1')

png("rf2_cm.png", width = 590, height = 620, res = 110)
draw_confusion_matrix(rf_cm)
dev.off()

rf_perf <- h2o.performance(h2o_rf, sample_h2o)
plot(rf_perf, type = "roc")

#RANDOM FOREST VARIABLE IMPORTANCE
h2o_imp <- h2o.varimp(h2o_rf)[,-c(2,3)]
h2o_imp <- as.data.frame(h2o_imp)
rownames(h2o_imp) <- h2o_imp[,1]

ggplot2::ggplot(h2o_imp, aes(x=reorder(rownames(h2o_imp),percentage), y=percentage)) +
geom_point( color="blue", size=4, alpha=0.6)+
geom_segment( aes(x=rownames(h2o_imp), xend=rownames(h2o_imp), y=0, yend=percentage), 
color='skyblue') +
labs(title = 'Variable Importance in Random Forest', x = 'Variable', y = 'Overall Importance') +
theme_light() +
coord_flip() 

#TESTING RANDOM FOREST ON BALANCED DATA - WORKS BETTER THAN TEST ON IMBALANCED DATA
h2o_predict <- as.data.frame(h2o.predict(h2o_rf, test_h2o)$predict)
h2o_predict <- h2o_predict[,1]
rf2_cm <- confusionMatrix(h2o_predict, as.factor(test$fraud), positive = '1')

# Written to its own file: both confusion matrices used to go to
# "rf2_cm.png", so this one silently overwrote the imbalanced-sample plot.
png("rf2_cm_balanced.png", width = 590, height = 620, res = 110)
draw_confusion_matrix(rf2_cm)
dev.off()

In [None]:
#NEURAL NETWORK

#NNET TUNING
# Summary function for caret::trainControl(): returns the metrics of
# defaultSummary(), twoClassSummary() and prSummary() concatenated into one
# named vector, in that order.
#   data  - data frame of resampled observations/predictions supplied by caret
#   lev   - class levels of the outcome (passed through unchanged)
#   model - model name (passed through unchanged)
MySummary <- function(data, lev = NULL, model = NULL) {
  c(
    defaultSummary(data, lev, model),
    twoClassSummary(data, lev, model),
    prSummary(data, lev, model)
  )
}


# Copy of the training data with fraud as a No/Yes factor, as needed by caret
# when class probabilities are requested.
train_nn <- train
train_nn$fraud <- as.factor(train_nn$fraud)
levels(train_nn$fraud) <- c('No', 'Yes')
fitControl <- trainControl(method = "repeatedcv", 
                           number = 2, 
                           repeats = 2, 
                           classProbs = TRUE, 
                           summaryFunction = MySummary)

nnetGrid <-  expand.grid(decay = c(0.5, 0.1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7),
                         size = c(3, 5, 10, 20))

# Seed the resampling and the random initial weights so the tuning run is
# reproducible, like the other model cells. nnet() is silenced through its
# own `trace` argument; `verbose` is not an nnet argument.
set.seed(23)
nnetFit <- train(fraud ~ ., 
                 data = train_nn,
                 method = "nnet",
                 metric = "Kappa",
                 trControl = fitControl,
                 tuneGrid = nnetGrid,
                 trace = FALSE)
plot(nnetFit)

(nnetFit$results) %>%
    arrange(desc(Kappa))
plot(nnetFit, metric = "Kappa")


#FINAL NEURAL NETWORK
set.seed(23)
nn_model <- nnet(fraud~. , data = train, size = 10, decay = 0.001, trace = FALSE)

# Score the sample of the imbalanced data; outputs above 0.5 count as fraud.
nn_pred <- predict(nn_model, sample_all)
nn_pred <-as.data.frame(nn_pred)[,1]
nn_results <- ifelse(nn_pred > 0.5, 1, 0)
nnet_cm <- confusionMatrix(as.factor(nn_results), as.factor(sample_all$fraud), positive = '1')

png("nnet_cm.png", width = 590, height = 620, res = 110)
draw_confusion_matrix(nnet_cm)
dev.off()

# Build the ROC curve once and derive the AUC from it. levels/direction are
# given explicitly (0 = control, 1 = case, higher score = case) so that pROC
# does not auto-detect the direction.
nnet_roc <- roc(sample_all$fraud ~ nn_pred, levels = c(0, 1), direction = "<")
nnet_auc <- round(auc(nnet_roc), 4)

#create ROC plot (ggroc is exported by pROC, so `::` is sufficient)
pROC::ggroc(nnet_roc) +
  ggtitle(paste0('Neural Network ROC Curve ', '(AUC = ', nnet_auc, ')')) +
    geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="darkgrey", linetype="dashed") +
    theme_light()

#ggsave(file = 'nnet_auc.png')

#NEURAL NETWORK TIME BENCHMARK
nnet_benchmark <- microbenchmark(predict(nn_model, sample_all), times = 100L, setup = set.seed(23))
summary(nnet_benchmark, unit = 's')

#NEURAL NETWORK VARIABLE IMPORTANCE
nn_imp <- caret::varImp(nn_model)

ggplot2::ggplot(nn_imp, aes(x=reorder(rownames(nn_imp),Overall), y=Overall)) +
geom_point( color="blue", size=4, alpha=0.6)+
geom_segment( aes(x=rownames(nn_imp), xend=rownames(nn_imp), y=0, yend=Overall), 
color='skyblue') +
labs(title = 'Variable Importance in Neural Network', x = 'Variable', y = 'Overall Importance') +
theme_light() +
coord_flip() 

#ggsave(file = 'nnet_varimp.png')

In [None]:
#XGBOOST

#DATA PREPARATION
# Features and label are selected by name instead of by the fixed column
# position 6. train/test and sample_all are built by different pipelines, so
# a positional index is only correct if `fraud` happens to sit in the same
# column everywhere. Using the training feature names for every matrix also
# guarantees the same columns in the same order.
feature_cols <- setdiff(names(train), "fraud")

train_x <- data.matrix(train[, feature_cols])
train_y <- data.matrix(train[, "fraud", drop = FALSE])
xgb_train <- xgb.DMatrix(data = train_x, label = train_y)

test_x <- data.matrix(test[, feature_cols])
test_y <- data.matrix(test[, "fraud", drop = FALSE])
xgb_test <- xgb.DMatrix(data = test_x, label = test_y)

sample_x <- data.matrix(sample_all[, feature_cols])
sample_y <- data.matrix(sample_all[, "fraud", drop = FALSE])
xgb_sample <- xgb.DMatrix(data = sample_x, label = sample_y)

#XGBOOST
set.seed(23)
#watchlist <- list(train=xgb_train, test=xgb_sample)
#xgb_model_train <- xgb.train(data = xgb_train, max.depth = 3, watchlist=watchlist, nrounds = 10000)
xgb_model <- xgboost(data = xgb_train, max.depth = 10, eta = 0.15, nrounds = 200, verbose = 0, max_delta_step =1, lambda = 2,
                     objective = "binary:logistic")
# XGBoost on the test sample (imbalanced data); probabilities above 0.5 count
# as fraud.
xgb_pred <- predict(xgb_model, xgb_sample)
xgb_prediction <- as.numeric(xgb_pred > 0.5)

# confusionMatrix is exported by caret, so `::` is sufficient
xgb_cm <- caret::confusionMatrix(as.factor(xgb_prediction), as.factor(sample_all$fraud), positive = '1')

png("xgb_cm.png", width = 590, height = 620, res = 110)
draw_confusion_matrix(xgb_cm)
dev.off()

# Build the ROC curve once and derive the AUC from it. levels/direction are
# given explicitly (0 = control, 1 = case, higher score = case) so that pROC
# does not auto-detect the direction.
xgb_roc <- roc(sample_all$fraud ~ xgb_pred, levels = c(0, 1), direction = "<")
xgb_auc <- round(auc(xgb_roc), 4)

#create ROC plot (ggroc is exported by pROC, so `::` is sufficient)
pROC::ggroc(xgb_roc) +
  ggtitle(paste0('XGBoost ROC Curve ', '(AUC = ', xgb_auc, ')')) +
    geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="darkgrey", linetype="dashed") +
    theme_light()

# spell out `filename` instead of relying on partial matching of `file`
ggsave(filename = 'xgb_auc.png')

xgboost_benchmark <- microbenchmark(predict(xgb_model, xgb_sample), times = 100L, setup = set.seed(23))
summary(xgboost_benchmark, unit = 's')

#XGB VARIABLE IMPORTANCE
xgb_imp <- xgb.importance(colnames(xgb_train), model = xgb_model)
xgb_imp <- as.data.frame(xgb_imp[,c(1,2)])
rownames(xgb_imp) <- xgb_imp[,1]
#xgb_imp
ggplot2::ggplot(xgb_imp, aes(x=reorder(rownames(xgb_imp),Gain), y=Gain)) +
geom_point( color="blue", size=4, alpha=0.6) +
geom_segment( aes(x=rownames(xgb_imp), xend=rownames(xgb_imp), y=0, yend=Gain), 
color='skyblue') +
labs(title = 'Variable Importance in XGBoost', x = 'Variable', y = 'Overall Importance') +
theme_light() +
coord_flip() 

ggsave(filename = 'xgboost_varimp.png')