rm(list = ls()) library(glmnet) library(quantreg) library(robustbase) library(Metrics) library(robustHD) library("MultiRNG") library(MASS) library(latex2exp) library(tidyverse) library(dplyr) library(car) library(datasets) library(pracma) options(max.print = 1000000) ########################################## Realne data kapitola 3 ########################################## data <- read.csv("C:/Jakub/Karlova Univerzita/Diplomová práca/DataExtended.csv", header = T, dec = ",", sep = ";") #data[order(data[ ,17], decreasing=TRUE), ] # zoradenie dat podla odozvy (response premennej) X <- as.matrix(data[-71,3:16]) # dim 140*14 incpt <- rep(1, nrow(X)) X_incpt <- cbind(incpt, X) Y <- data[-71,17] boxplot(Y, ylab="[v miliónoch]", xlab="140 štátov",cex.lab=1.7,cex.main=1.5,cex.axis=1.4,main="Množstvo návštevníkov jednotlivých štátov za rok", col="tomato2", pch=20) ############################################ Tvorba podmodelu ################################################### (m_summary <- summary(lm(Y~X))) X_sub <- subset(X, select = -c(TT)) (m_summary <- summary(lm(Y~X_sub))) X_sub <- subset(X_sub, select = -c(NR)) (m_summary <- summary(lm(Y~X_sub))) X_sub <- subset(X_sub, select = -c(BE)) (m_summary <- summary(lm(Y~X_sub))) X_sub <- subset(X_sub, select = -c(HH)) (m_summary <- summary(lm(Y~X_sub))) X_sub <- subset(X_sub, select = -c(HRLM)) (m_summary <- summary(lm(Y~X_sub))) X_sub <- subset(X_sub, select = -c(SS)) (m_summary <- summary(lm(Y~X_sub))) X_sub <- subset(X_sub, select = -c(PC)) (m_summary <- summary(lm(Y~X_sub))) X_sub <- subset(X_sub, select = -c(ATI)) (m_summary <- summary(lm(Y~X_sub))) X_sub <- subset(X_sub, select = -c(IO)) (m_summary <- summary(lm(Y~X_sub))) X_sub <- subset(X_sub, select = -c(ES)) (m_summary <- summary(lm(Y~X_sub))) ################################################################################################################# # Dostavame zoznam signifikantnych premennych: ICT, GPI, TSI, CRBT ICT_s <- c() GPI_s <- c() TSI_s <- c() CRBT_s <- c() for (i in 1:11){ coef <- as.matrix(coef(glmnet(X_sub, Y, family = "gaussian", lambda = (i-1)/10, alpha = 1))) # LASSO ICT_s[i] <- coef[2,1] GPI_s[i] <- coef[3,1] TSI_s[i] <- coef[4,1] CRBT_s[i] <- coef[5,1] } plot(0,0, type = "n", ylim = c(-2.5,2.5), xlim = c(0,1), xlab = TeX("Postupnosť hodnôť $\\lambda_l$"), ylab = "Odhad signifikantných koeficientov", cex.lab=1.1) title(TeX("Vplyv parametru $\\lambda$"), cex.main=1.5) axis(1, at = seq(0, 1, by = 0.1), las=1) cl <- rainbow(3) lines(seq(0, 1, length.out = 11), ICT_s, col = cl[1], lwd =2) lines(seq(0, 1, length.out = 11), GPI_s, col = "#339900", lwd =2) lines(seq(0, 1, length.out = 11), TSI_s, col = cl[3], lwd =2) abline(h=0, lwd=3, lty=2) legend("topright", c("ICT", "GPI", "TSI"), col = c(cl[1], "#339900", cl[3]), lty=1, lwd=3, cex=1.1) ############################################ Inicializacia ################################################### BE <- c() SS <- c() HH <- c() HRLM <- c() ICT <- c() TT <- c() IO <- c() PC <- c() ES <- c() ATI <- c() GPI <- c() TSI <- c() NR <- c() CRBT <- c() ################################################################################################################# for (i in 1:11){ coef <- as.matrix(coef(glmnet(X, Y, family = "gaussian", lambda = (i-1)/10, alpha = 1))) # LASSO BE[i] <- coef[2,1] SS[i] <- coef[3,1] HH[i] <- coef[4,1] HRLM[i] <- coef[5,1] ICT[i] <- coef[6,1] TT[i] <- coef[7,1] IO[i] <- coef[8,1] PC[i] <- coef[9,1] ES[i] <- coef[10,1] ATI[i] <- coef[11,1] GPI[i] <- coef[12,1] TSI[i] <- coef[13,1] NR[i] <- coef[14,1] CRBT[i] <- coef[15,1] } plot(0,0, type = "n", ylim = c(-2.5,2.5), xlim = c(0,1), xlab = TeX("Postupnosť hodnôť $\\lambda_l$"), ylab = "Odhad koeficientov", cex.lab=1.2) title(TeX("Vplyv parametru $\\lambda$"), cex.main=1.5) axis(1, at = seq(0, 1, by = 0.1), las=1) cl <- rainbow(14) lines(seq(0, 1, length.out = 11), BE, col = cl[1], lwd=2) lines(seq(0, 1, length.out = 11), SS, col = cl[2], lwd=2) lines(seq(0, 1, length.out = 11), HH, col = "#FFCC33", lwd=2) lines(seq(0, 1, length.out = 11), HRLM, col = "#339900", lwd=2) lines(seq(0, 1, length.out = 11), ICT, col = "#990000", lwd=2) lines(seq(0, 1, length.out = 11), TT, col = "#336600", lwd=2) lines(seq(0, 1, length.out = 11), IO, col = cl[7], lwd=2) lines(seq(0, 1, length.out = 11), PC, col = cl[8], lwd=2) lines(seq(0, 1, length.out = 11), ES, col = cl[9], lwd=2) lines(seq(0, 1, length.out = 11), ATI, col = cl[10], lwd=2) lines(seq(0, 1, length.out = 11), GPI, col = "#FF0000", lwd=2) lines(seq(0, 1, length.out = 11), TSI, col = cl[12], lwd=2) lines(seq(0, 1, length.out = 11), NR, col = cl[13], lwd=2) #lines(seq(0, 1, length.out = 11), CRBT, col = cl[14]) abline(h=0, lwd=3, lty=2) summary(X_sub);summary(Y) sd(X_sub[ ,"ICT"]);sd(X_sub[ ,"GPI"]);sd(X_sub[ ,"TSI"]);sd(X_sub[ ,"CRBT"]);sd(Y) n <- nrow(X) # pocet pozorovani (140 riadkov modelovej matice X) p <- ncol(X) # pocet prediktorov (14 stlpcov modelovej matice X ) k <- 10 # pocet skupin v k-fold algoritme k_fold_opak <- 100 # pocet opakovani k-fold algoritmu, pouzivame len pri urceni optimalneho lambda lambdas <- seq(0.01, 1, length.out = 100) # postupnost testovanych regularizacnych parametrov pre realne data best_lambda <- 0 set.seed(6554) lambda_real <- k_fold_lambda(X, Y, k, k_fold_opak, lambdas) result_real <- k_fold_methods(X, Y, k, lambda_real) ########################################## Simulacna schema I kapitola 4 ################################################### set.seed(655434) n <- 60 # pocet pozorovani (riadkov modelovej matice X) p <- 5 # pocet prediktorov (sltpcov modelovej matice X) lambdas <- seq(0.01, 0.1, length.out = 10) # postupnost testovanych regularizacnych parametrov pre p=5 #p <- 20 # pocet prediktorov (sltpcov modelovej matice X) #lambdas <- seq(0.01, 0.2, length.out = 20) # postupnost testovanych regularizacnych parametrov pre p=20 opak <- 100 # pocet nahodnych datasetov k_fold_opak <- 10 # pocet opakovani k-fold algoritmu, pouzivame len pri urceni optimalneho lambda k <- 5 # pocet skupin v k-fold algoritme ############################################ Inicializacia ################################################### lambda_d0 <- c() lambda_d1 <- c() lambda_d2 <- c() best_lambda <- 0 final_result_d0 <- matrix(nrow = 7) final_result_d1 <- matrix(nrow = 7) final_result_d2 <- matrix(nrow = 7) final_result_d0_A <- matrix(nrow = 7) final_result_d0_A <- matrix(nrow = 7) final_result_d0_A <- matrix(nrow = 7) final_result_MSE_d0_A <- matrix(nrow = 7) final_result_MSE_d1_A <- matrix(nrow = 7) final_result_MSE_d2_A <- matrix(nrow = 7) final_result_TMSE_d0_A <- matrix(nrow = 7) final_result_TMSE_d1_A <- matrix(nrow = 7) final_result_TMSE_d2_A <- matrix(nrow = 7) ################################################################################################################# for (i in 1:opak){ cat("A: ", i, "\n") X <- draw.d.variate.uniform(no.row = n, d = p, cov.mat = diag(p)) incpt <- rep(1,nrow(X)) X_incpt <- cbind(incpt,X) beta_true <- rep(5, p) sigma <- 1/2 e <- rnorm(n, mean = 0, sd = sigma) Y <- 5 + X %*% beta_true + e # beta_0 = 5 lambda_d0[i] <- k_fold_lambda(X, Y, k, k_fold_opak, lambdas) # vybera najlepsiu lambdu z postupnosti lambdas a opakuje k_fold_opak krat, potom vyberie najlepsiu a ulozi result_d0 <- k_fold_methods(X, Y, k, lambda_d0[i]) colnames(result_d0) <- c(paste("MSE", i), paste("TMSE", i)) final_result_d0 <- cbind(final_result_d0, result_d0) # Prva kontaminacia vznikne nahradenim prvych d = 1/12 dat odlahlymi d <- 1/12 X[1:as.integer(n*d), ] <- mvrnorm(as.integer(n*d), mu = rep(0, p), Sigma = (diag(p)/10)) X_incpt <- cbind(incpt,X) Y <- 11 + X %*% beta_true + e # beta_0 = 11 lambda_d1[i] <- k_fold_lambda(X, Y, k, k_fold_opak, lambdas) result_d1 <- k_fold_methods(X, Y, k, lambda_d1[i]) colnames(result_d1) <- c(paste("MSE", i), paste("TMSE", i)) final_result_d1 <- cbind(final_result_d1, result_d1) # Druha kontaminacia vznikne nahradenim prvych d = 1/6 dat odlahlymi d <- 1/6 X[1:as.integer(n*d), ] <- mvrnorm(as.integer(n*d), mu = rep(0, p), Sigma = (diag(p)/10)) X_incpt <- cbind(incpt,X) Y <- 11 + X %*% beta_true + e # beta_0 = 11 lambda_d2[i] <- k_fold_lambda(X, Y, k, k_fold_opak, lambdas) result_d2 <- k_fold_methods(X, Y, k, lambda_d2[i]) colnames(result_d2) <- c(paste("MSE", i), paste("TMSE", i)) final_result_d2 <- cbind(final_result_d2, result_d2) } final_result_d0_A <- final_result_d0[ ,-c(1)] final_result_d1_A <- final_result_d1[ ,-c(1)] final_result_d2_A <- final_result_d2[ ,-c(1)] final_result_MSE_d0_A <- final_result_d0_A[ ,seq(1, ncol(final_result_d0_A), by = 2)] final_result_TMSE_d0_A <- final_result_d0_A[ ,seq(2, ncol(final_result_d0_A), by = 2)] final_result_MSE_d1_A <- final_result_d1_A[ ,seq(1, ncol(final_result_d0_A), by = 2)] final_result_TMSE_d1_A <- final_result_d1_A[ ,seq(2, ncol(final_result_d0_A), by = 2)] final_result_MSE_d2_A <- final_result_d2_A[ ,seq(1, ncol(final_result_d0_A), by = 2)] final_result_TMSE_d2_A <- final_result_d2_A[ ,seq(2, ncol(final_result_d0_A), by = 2)] (final_result_MSE_d0_mean_A <- apply(final_result_MSE_d0_A, 1, mean)) (final_result_TMSE_d0_mean_A <- apply(final_result_TMSE_d0_A, 1, mean)) (final_result_MSE_d1_mean_A <- apply(final_result_MSE_d1_A, 1, mean)) (final_result_TMSE_d1_mean_A <- apply(final_result_TMSE_d1_A, 1, mean)) (final_result_MSE_d2_mean_A <- apply(final_result_MSE_d2_A, 1, mean)) (final_result_TMSE_d2_mean_A <- apply(final_result_TMSE_d2_A, 1, mean)) ############################################ Simulacna schema II kapitola 4 ################################################### set.seed(655434) n <- 60 # pocet pozorovani (riadkov modelovej matice X) p <- 5 # pocet prediktorov (sltpcov modelovej matice X) lambdas <- seq(0.01, 0.1, length.out = 10) # postupnost testovanych regularizacnych parametrov pre p=5 #p <- 20 # pocet prediktorov (sltpcov modelovej matice X) #lambdas <- seq(0.01, 0.2, length.out = 20) # postupnost testovanych regularizacnych parametrov pre p=20 opak <- 100 # pocet nahodnych datasetov k_fold_opak <- 10 # pocet opakovani k-fold algoritmu, pouzivame len pri urceni optimalneho lambda k <- 5 # pocet skupin v k-fold algoritme ############################################ Inicializacia ################################################### lambda_d0 <- c() lambda_d1 <- c() lambda_d2 <- c() best_lambda <- 0 final_result_d0 <- matrix(nrow = 7) final_result_d1 <- matrix(nrow = 7) final_result_d2 <- matrix(nrow = 7) final_result_d0_B <- matrix(nrow = 7) final_result_d0_B <- matrix(nrow = 7) final_result_d0_B <- matrix(nrow = 7) final_result_MSE_d0_B <- matrix(nrow = 7) final_result_MSE_d1_B <- matrix(nrow = 7) final_result_MSE_d2_B <- matrix(nrow = 7) final_result_TMSE_d0_B <- matrix(nrow = 7) final_result_TMSE_d1_B <- matrix(nrow = 7) final_result_TMSE_d2_B <- matrix(nrow = 7) ################################################################################################################# for (i in 1:opak){ cat("B: ", i, "\n") Sigma <- diag(p)*0.4 + rep(1, p) %*% t(rep(1, p))*0.6 X <- mvrnorm(n, mu = rep(0, p), Sigma = Sigma) incpt <- rep(1,nrow(X)) X_incpt <- cbind(incpt,X) beta_true <- rep(1, p) e <- rt(n, 3)/5 Y <- 1 + X %*% beta_true + e # beta_0 = 1 lambda_d0[i] <- k_fold_lambda(X, Y, k, k_fold_opak, lambdas) result_d0 <- k_fold_methods(X, Y, k, lambda_d0[i]) colnames(result_d0) <- c(paste("MSE", i), paste("TMSE", i)) final_result_d0 <- cbind(final_result_d0, result_d0) # Prva kontaminacia vznikne nahradenim prvych d = 1/12 dat odlahlymi d <- 1/12 X[1:as.integer(n*d), ] <- mvrnorm(as.integer(n*d), mu = rep(-3, p), Sigma = (diag(p)/5)) X_incpt <- cbind(incpt,X) Y <- -10 + X %*% beta_true + e # beta_0 = -10 lambda_d1[i] <- k_fold_lambda(X, Y, k, k_fold_opak, lambdas) result_d1 <- k_fold_methods(X, Y, k, lambda_d1[i]) colnames(result_d1) <- c(paste("MSE", i), paste("TMSE", i)) final_result_d1 <- cbind(final_result_d1, result_d1) # Druha kontaminacia vznikne nahradenim prvych d = 1/6 dat odlahlymi d <- 1/6 X[1:as.integer(n*d), ] <- mvrnorm(as.integer(n*d), mu = rep(-3, p), Sigma = (diag(p)/5)) X_incpt <- cbind(incpt,X) Y <- -10 + X %*% beta_true + e # beta_0 = -10 lambda_d2[i] <- k_fold_lambda(X, Y, k, k_fold_opak, lambdas) result_d2 <- k_fold_methods(X, Y, k, lambda_d2[i]) colnames(result_d2) <- c(paste("MSE", i), paste("TMSE", i)) final_result_d2 <- cbind(final_result_d2, result_d2) } final_result_d0_B <- final_result_d0[ ,-c(1)] final_result_d1_B <- final_result_d1[ ,-c(1)] final_result_d2_B <- final_result_d2[ ,-c(1)] final_result_MSE_d0_B <- final_result_d0_B[ ,seq(1, ncol(final_result_d0_B), by = 2)] final_result_TMSE_d0_B <- final_result_d0_B[ ,seq(2, ncol(final_result_d0_B), by = 2)] final_result_MSE_d1_B <- final_result_d1_B[ ,seq(1, ncol(final_result_d0_B), by = 2)] final_result_TMSE_d1_B <- final_result_d1_B[ ,seq(2, ncol(final_result_d0_B), by = 2)] final_result_MSE_d2_B <- final_result_d2_B[ ,seq(1, ncol(final_result_d0_B), by = 2)] final_result_TMSE_d2_B <- final_result_d2_B[ ,seq(2, ncol(final_result_d0_B), by = 2)] (final_result_MSE_d0_mean_B <- apply(final_result_MSE_d0_B, 1, mean)) (final_result_TMSE_d0_mean_B <- apply(final_result_TMSE_d0_B, 1, mean)) (final_result_MSE_d1_mean_B <- apply(final_result_MSE_d1_B, 1, mean)) (final_result_TMSE_d1_mean_B <- apply(final_result_TMSE_d1_B, 1, mean)) (final_result_MSE_d2_mean_B <- apply(final_result_MSE_d2_B, 1, mean)) (final_result_TMSE_d2_mean_B <- apply(final_result_TMSE_d2_B, 1, mean)) ############################################## K Fold alg. for lambda ######################################################## k_fold_lambda <- function(X, Y, k, repet, lambdas){ #set.seed(6554) # pouzivame pre realne data #repet <- 100 # pre obrazok optimal lambda #k_results <- matrix(NA, nrow = repet, ncol = 3) # pre obrazok optimal lambda k_results <- matrix(NA, nrow = repet, ncol = 2) for (l in 1:repet){ results <- matrix(NA, nrow = length(lambdas), ncol = k) # Matica, ktora uchovava MSE vysledky algoritmu fold_idx <- sample(1:k, nrow(X), replace = TRUE, prob=rep(1/k, k)) # Nahodne rozdel data na k skupin for (i in 1:k){ # Rozdel indexy na treningovu a testovaciu mnozinu pre dany fold, vzdy je jeden fold validacna (testovacia) mnozina a ostatok trenovacia train_idx <- which(fold_idx != i) valid_idx <- which(fold_idx == i) X_train <- X[train_idx, ] Y_train <- Y[train_idx] X_valid <- X[valid_idx, ] Y_valid <- Y[valid_idx] if(is.matrix(X_valid) == TRUE){ # ak X_valid obsahuje 0 alebo 1 pozorovanie, tak nie je typu matrix a kadze to errory. Takuto situaciu chceme preskocit ale obdrzime tak NA hodnoty v stlpcoch pre niektore foldy, cize musime tieto stlpce odstranit v premennej results incpt <- rep(1, nrow(X_valid)) X_valid_incpt <- cbind(incpt, X_valid) for (j in 1:length(lambdas)){ coeff_Kfold <- as.matrix(coef(glmnet(X_train, Y_train, family = "gaussian", lambda = lambdas[j], alpha = 1))) Y_valid_LASSO <- X_valid_incpt %*% coeff_Kfold results[j, i] <- sum((Y_valid_LASSO - Y_valid)^2) / nrow(Y_valid_LASSO) } } } results <- results[, colSums(is.na(results)) < nrow(results)] # Odstranuje len ten stlpec, ktory obsahuje iba NA hodnoty MSE_final <- apply(results, 1, mean) # Pocita priemerne MSE cez vsetky k foldy pre kazde lambda best_lambda <- lambdas[which.min(MSE_final)] # Z MSE vyberieme minimum a chceme lambdu, ktora prislucha tomuto MSE, lambdy idu po poradi, cize staci vybrat prislusny index v postupnosti lambdas k_results[l, 1] <- best_lambda k_results[l, 2] <- min(MSE_final) # obsahuje najlepsiu lambdu a prislusne MSE, na zaver bude repet najlepsich lambd a MSE # k_results[l, 3] <- l # pre obrazok optimal lambda # if(l==11){break} # pre obrazok optimal lambda #} # pre obrazok optimal lambda best_lambda <- k_results[which.min(k_results[, 2]), 1] # Vyberame najlepsiu lambdu podla najmensieho MSE z repet lambd #k_results[order(k_results[ ,2], decreasing=FALSE), ] return(best_lambda) } ######################################## K-fold alg. for other regression methods ###################################### k_fold_methods <- function(X, Y, k, best_lambda){ h <- 0.6 # pre LTS a LTS LASSO metody # Vytvor matice uchovavajuce cross validacne MSE a TMSE vysledky pre dane metody results_mse <- matrix(NA, nrow = 7, ncol = k) # 7 metod rownames(results_mse) <- c("OLS","MM","LTS","LASSO","LTS_LASSO","LWS","LWS_LASSO") results_tmse <- matrix(NA, nrow = 7, ncol = k) rownames(results_tmse) <- c("OLS","MM","LTS","LASSO","LTS_LASSO","LWS","LWS_LASSO") coeff_methods <- matrix(NA, nrow = 7, ncol = ncol(X)+1) rownames(coeff_methods) <- c("OLS","MM","LTS","LASSO","LTS_LASSO","LWS","LWS_LASSO") # Vytvor k nahodnych skupin fold_idx <- sample(1:k, nrow(X), replace = TRUE, prob=rep(1/k, k)) for (i in 1:k){ # Rozdel indexy na treningovu a testovaciu mnozinu pre dany fold, vzdy je jeden fold validacna (testovacia) mnozina a ostatok trenovacia train_idx <- which(fold_idx != i) valid_idx <- which(fold_idx == i) X_train <- X[train_idx, ] Y_train <- Y[train_idx] X_valid <- X[valid_idx, ] Y_valid <- Y[valid_idx] if(is.matrix(X_valid) == TRUE){ incpt <- rep(1, nrow(X_valid)) X_valid_incpt <- cbind(incpt, X_valid) # V pripade simulacii pre p = 20 (a n = 60) je potrebne zakomentovat metody OLS a LTS, pretoze potrebujeme aspon dvojnasobne viac pozorovani ako regresorov a pri rozdelovani dat do k foldov nebyva tato podmienka vzdy splnena coeff_methods[1, ] <- as.matrix(lm(Y_train~X_train)$coeff) # OLS coeff_methods[2, ] <- as.matrix(lmrob(Y_train~X_train)$coeff) # MM coeff_methods[3, ] <- as.matrix(ltsReg(Y_train~X_train, alpha = h)$coeff) # LTS coeff_methods[4, ] <- as.matrix(coef(glmnet(X_train, Y_train, family = "gaussian", lambda = best_lambda, alpha = 1))) # LASSO coeff_methods[5, ] <- as.matrix(sparseLTS(X_train, Y_train, lambda = best_lambda, alpha = h)$coeff) # LTS LASSO for (j in 1:5){ # 5 lebo mame 5 metod, ktore volame pomocou balickov Y_methods_valid <- X_valid_incpt %*% coeff_methods[j, ] # Calculate the mean squared error for these folds results_mse[j, i] <- sum((Y_valid - Y_methods_valid)^2) / nrow(Y_methods_valid) results_tmse[j, i] <- sum(sort((Y_valid - Y_methods_valid)^2 / (nrow(Y_methods_valid)*0.8))[1:(nrow(Y_methods_valid)*0.8)]) } } } results_mse <- results_mse[, colSums(is.na(results_mse)) < nrow(results_mse)] # Odstranuje len ten stlpec, ktory obsahuje iba NA hodnoty MSE_methods_final <- matrix(NA, nrow = 7, ncol = 1) # Matica na ukladanie priemernych MSE hodnot pre vsetky metody rownames(MSE_methods_final) <- c("OLS","MM","LTS","LASSO","LTS_LASSO","LWS","LWS_LASSO") for (j in 1:5){ MSE_methods_final[j,1] <- mean(results_mse[j, ]) } LWS_errors <- LWS_LASSO(X, Y, n, p, 0) LWS_LASSO_errors <- LWS_LASSO(X, Y, n, p, best_lambda) MSE_methods_final[6,1] <- LWS_errors[1] # MSE je v prvom indexe returnu funkcie MSE_methods_final[7,1] <- LWS_LASSO_errors[1] results_tmse <- results_tmse[, colSums(is.na(results_tmse)) < nrow(results_tmse)] # Odstranuje len ten stlpec, ktory obsahuje iba NA hodnoty TMSE_methods_final <- matrix(NA, nrow = 7, ncol = 1) # Matica na ukladanie priemernych TMSE hodnot pre vsetky metody rownames(TMSE_methods_final) <- c("OLS","MM","LTS","LASSO","LTS_LASSO","LWS","LWS_LASSO") for (j in 1:5){ TMSE_methods_final[j,1] <- mean(results_tmse[j, ]) } TMSE_methods_final[6,1] <- LWS_errors[2] # TMSE je v druhom indexe returnu funkcie TMSE_methods_final[7,1] <- LWS_LASSO_errors[2] #(MSE_methods_final) #(TMSE_methods_final) #MSE_methods_final[order(MSE_methods_final[ ,1], decreasing=FALSE), ] #TMSE_methods_final[order(TMSE_methods_final[ ,1], decreasing=FALSE), ] MSE_TMSE <- cbind(MSE_methods_final, TMSE_methods_final) colnames(MSE_TMSE) <- c("MSE", "TMSE") return(MSE_TMSE) } ########################################## Sparse LWS alg for mse and tmse ############################################## LWS_LASSO <- function(X, Y, n, p, lambda){ mse <- c() tmse <- c() for(a in 1:20){ psi_1 <- function(x){ # Linearna vahova funkcia 1-x } tau <- 0.8 psi_2 <- function(x){ # Orezana linearna vahova funkcia (1 - x / tau) * (x < tau) } J <- 100 K <- 20 eps <- 10^(-6) #p <- 14 #n <- nrow(X) b_tilde <- matrix(nrow = p+1, ncol = J) w_tilde <- matrix(nrow = J, ncol = n) u_tilde <- matrix(nrow = J, ncol = n) u_tilde_ord <- matrix(nrow = J, ncol = n) for(j in 1:J){ b <- matrix(nrow = p+1, ncol = K) w <- matrix(nrow = K, ncol = n) R <- matrix(nrow = K, ncol = n) u <- matrix(nrow = K, ncol = n) u_ord <- matrix(nrow = K, ncol = n) m <- c() indices <- sample(1:n, p+1, replace = FALSE) X_sample <- X[indices, ] Y_sample <- Y[indices] k <- 1 b[, k] <- as.matrix(coef(glmnet(X_sample, Y_sample, family = "gaussian", lambda = lambda, alpha = 1))) w[k, ] <- rep(1, n) m[k] <- Inf repeat{ u[k, ] <- (Y - b[1, k] - X %*% b[-1,k])^2 R[k, ] <- rank(u[k, ]) w[k+1, ] <- psi_2((R[k, ] - 0.5) / n) b[ ,k+1] <- as.matrix(coef(glmnet(X, Y, family = "gaussian", weights = w[k+1, ], lambda = lambda, alpha = 1))) u[k+1, ] <- (Y - b[1, k+1] - X %*% b[-1,k+1])^2 u_ord[k+1, ] <- u[k+1, ][order(u[k+1, ])] m[k+1] <- w[k+1, ] %*% u_ord[k+1, ] + best_lambda * sum(abs(b[-1,k+1])) k <- k+1 if(m[k] > m[k-1] + eps | k >= K){ break } } b_tilde[ ,j] <- b[ ,k-1] w_tilde[j, ] <- w[k-1, ] } min <- Inf for(j in 1:J){ u_tilde[j, ] <- (Y - b_tilde[1, j] - X %*% b_tilde[-1,j])^2 u_tilde_ord[j, ] <- u_tilde[j, ][order(u_tilde[j, ])] m_tilde <- w_tilde[j, ] %*% u_tilde_ord[j, ] + lambda * sum(abs(b_tilde[-1,j])) if(m_tilde < min){ min <- m_tilde j_star <- j } } b_final <- b_tilde[ ,j_star] w_final <- w_tilde[j_star, ] coeff_LWS_LASSO <- as.matrix(b_final) Y_LWS_LASSO <- X_incpt %*% coeff_LWS_LASSO mse_LWS_LASSO <- sum((Y - Y_LWS_LASSO)^2) / nrow(X) mse[a] <- mse_LWS_LASSO # Defaultne nastavujeme pre TMSE orezavanie 20% hodnot, nie je to premenna, pretoze hodnotu fixujeme na celu pracu tmse_LWS_LASSO <- sum(sort((Y - Y_LWS_LASSO)^2 / (nrow(X)*0.8))[1:(nrow(X)*0.8)]) tmse[a] <- tmse_LWS_LASSO } min_MSE_TMSE <- c(min(mse), min(tmse)) return(min_MSE_TMSE) } ############################################# Sparse LWS alg for coeff #################################################### LWS_LASSO_coeff <- function(X, Y, n, p, lambda){ psi <- function(x){ # Linearna vahova funkcia 1-x } tau <- 0.8 psi_2 <- function(x){ # Orezana linearna vahova funkcia (1 - x / tau) * (x < tau) } J <- 100 K <- 20 eps <- 10^(-6) #p <- 14 #n <- nrow(X) b_tilde <- matrix(nrow = p+1, ncol = J) w_tilde <- matrix(nrow = J, ncol = n) u_tilde <- matrix(nrow = J, ncol = n) u_tilde_ord <- matrix(nrow = J, ncol = n) for(j in 1:J){ b <- matrix(nrow = p+1, ncol = K) w <- matrix(nrow = K, ncol = n) R <- matrix(nrow = K, ncol = n) u <- matrix(nrow = K, ncol = n) u_ord <- matrix(nrow = K, ncol = n) m <- c() indices <- sample(1:n, p+1, replace = FALSE) X_sample <- X[indices, ] Y_sample <- Y[indices] k <- 1 b[, k] <- as.matrix(coef(glmnet(X_sample, Y_sample, family = "gaussian", lambda = lambda, alpha = 1))) w[k, ] <- rep(1, n) m[k] <- Inf repeat{ u[k, ] <- (Y - b[1, k] - X %*% b[-1,k])^2 R[k, ] <- rank(u[k, ]) w[k+1, ] <- psi((R[k, ] - 0.5) / n) b[ ,k+1] <- as.matrix(coef(glmnet(X, Y, family = "gaussian", weights = w[k+1, ], lambda = lambda, alpha = 1))) u[k+1, ] <- (Y - b[1, k+1] - X %*% b[-1,k+1])^2 u_ord[k+1, ] <- u[k+1, ][order(u[k+1, ])] m[k+1] <- w[k+1, ] %*% u_ord[k+1, ] + best_lambda * sum(abs(b[-1,k+1])) k <- k+1 if(m[k] > m[k-1] + eps | k >= K){ break } } b_tilde[ ,j] <- b[ ,k-1] w_tilde[j, ] <- w[k-1, ] } min <- Inf for(j in 1:J){ u_tilde[j, ] <- (Y - b_tilde[1, j] - X %*% b_tilde[-1,j])^2 u_tilde_ord[j, ] <- u_tilde[j, ][order(u_tilde[j, ])] m_tilde <- w_tilde[j, ] %*% u_tilde_ord[j, ] + lambda * sum(abs(b_tilde[-1,j])) if(m_tilde < min){ min <- m_tilde j_star <- j } } (b_final <- b_tilde[ ,j_star]) w_final <- w_tilde[j_star, ] return(b_final) } #################################################### Bootstrap ######################################################### # Bootstrap pouzivame len na IS pre realne data, cize fixujeme h = 0.8 set.seed(6554) B <- 100 Boot_matrix_LWS <- matrix(nrow = B, ncol = ncol(X_incpt)) Boot_matrix_LTS <- matrix(nrow = B, ncol = ncol(X_incpt)) for (i in 1:B){ obs <- sample(1:nrow(X), replace=TRUE) X_boot <- X[obs, ] Y_boot <- Y[obs] Boot_matrix_LWS[i, ] <- LWS_LASSO_coeff(X_boot, Y_boot, nrow(X), ncol(X), lambda_real) Boot_matrix_LTS[i, ] <- as.matrix(sparseLTS(X_boot, Y_boot, lambda = lambda_real, alpha = 0.8)$coeff) } # IS asymptoticky IS_LWS <- matrix(NA, nrow = nrow(var(Boot_matrix_LWS))-1, ncol = 2) IS_LTS <- matrix(NA, nrow = nrow(var(Boot_matrix_LTS))-1, ncol = 2) alpha <- 0.05 # Pokrytie 95% u <- qnorm(1 - alpha / 2) # 1-alpha/2 kvantil N(0,1) hat_beta_LWS <- LWS_LASSO_coeff(X, Y, nrow(X), ncol(X), lambda_real) hat_beta_LTS <- as.matrix(sparseLTS(X, Y, lambda = lambda_real, alpha = 0.8)$coeff) for (i in 2:nrow(var(Boot_matrix_LWS))){ # od 2, lebo nechceme intercept hat_sigma_LWS <- sqrt(var(Boot_matrix_LWS)[i,i]) # alebo sd(Boot_matrix_LWS[ ,i]), alebo sqrt(sum((Boot_matrix_LWS[ ,i] - mean(Boot_matrix_LWS[ ,i]))^2) / (nrow(Boot_matrix_LWS)-1)). Ak chceme aj vedlajsie prvky, tak napr cov(Boot_matrix_LWS[ ,1],Boot_matrix_LWS[ ,2]) hat_sigma_LTS <- sqrt(var(Boot_matrix_LTS)[i,i]) IS_LWS[i-1, ] <- hat_beta_LWS[i] + c(-1,1)*u*hat_sigma_LWS / sqrt(nrow(X)) # LWS_LASSO IS_LTS[i-1, ] <- hat_beta_LTS[i] + c(-1,1)*u*hat_sigma_LTS / sqrt(nrow(X)) # LTS_LASSO } (IS_LWS) # LWS_LASOO (IS_LTS) # LTS_LASSO #################################################### Dalsie obrazky do prace ######################################################### par(mfrow=c(1,1)) # l=11 plot(lambdas, MSE_final, xaxt="n", xlab = "", cex=1.5, ylab = "MSE", cex.lab=1.3, cex.axis=1.3, pch=20, col= "#CC9900") axis(1, at = seq(0, 1, by = 0.05), las=2, cex.axis=1.35) title(xlab=TeX("Postupnosť hodnôť $\\lambda_l$"), line = 4, cex.lab=1.35) title(TeX("Graf MSE hodnôt zrátaných krížovou validáciou pre rôzne $\\lambda$"), cex.main=1.7) min_lambda <- lambdas[which.min(MSE_final)] min_MSE <- MSE_final[which.min(MSE_final)] points(min_lambda, min_MSE, pch=20, col="black", cex=1.5) OLS_1 <- c(0.289, 0.125, 0.295, 0.130, 0.287, 0.126) MM_1 <- c(0.291, 0.126, 0.298, 0.131, 0.291, 0.126) LTS8_1 <- c(0.299, 0.129, 0.315, 0.139, 0.301, 0.131) LTS6_1 <- c(0.309, 0.133, 0.320, 0.142, 0.310, 0.135) Lasso_1 <- c(0.290, 0.125, 0.295, 0.130, 0.288, 0.126) LTS_Lasso8_1 <- c(0.365, 0.160, 0.388, 0.169, 0.367, 0.158) LTS_Lasso6_1 <- c(0.512, 0.229, 0.550, 0.228, 0.516, 0.218) LWS1_1 <- c(0.232, 0.097, 0.231, 0.096, 0.231, 0.096) LWS2_1 <- c(0.235, 0.097, 0.235, 0.095, 0.235, 0.096) LWS_Lasso1_1 <- c(0.233, 0.097, 0.233, 0.096, 0.232, 0.096) LWS_Lasso2_1 <- c(0.239, 0.098, 0.239, 0.097, 0.239, 0.097) par(mfrow=c(1,2)) x_os <- c("d=0", "d=1/12", "d=1/6") plot(OLS_1[c(1,3,5)], ylim=c(0.2, 0.55), xaxt = "n", ylab='MSE', xlab='Úroveň kontaminácie extrémnymi pozorovaniami', col = 'white', cex.lab=1.5, cex.axis=1.5) title("Scenár I., p = 5", cex.main=1.7) axis(1, at=1:3, labels=x_os, cex.axis=1.5) cl <- rainbow(11) lines(OLS_1[c(1,3,5)], col = cl[1], lwd=2) lines(MM_1[c(1,3,5)], col = cl[7], lwd=2) lines(LTS8_1[c(1,3,5)], col = "#CCCC00", lwd=2) lines(LTS6_1[c(1,3,5)], col = "#339900", lwd=2) lines(Lasso_1[c(1,3,5)], col = cl[1], lwd=2) lines(LTS_Lasso8_1[c(1,3,5)], col = "#CC33CC", lwd=2) lines(LTS_Lasso6_1[c(1,3,5)], col = cl[2], lwd=2) lines(LWS1_1[c(1,3,5)], col = cl[8], lwd=2) lines(LWS2_1[c(1,3,5)], col = "#666666", lwd=2) lines(LWS_Lasso1_1[c(1,3,5)], col = cl[8], lwd=2) lines(LWS_Lasso2_1[c(1,3,5)], col = "#33CCCC", lwd=2) legend("topleft", c("OLS", "MM", "LTS(h=48)", "LTS(h=36)", "Lasso", "LTS-lasso(h=48)", "LTS-lasso(h=36)", TeX("LWS ($\\psi_1$)"), TeX("LWS ($\\psi_2$)"), TeX("LWS-lasso ($\\psi_1$)"), TeX("LWS-lasso ($\\psi_2$)")), col = c(cl[1], cl[7], "#CCCC00", "#339900", cl[1], "#CC33CC", cl[2], cl[8], "#666666", cl[8], "#33CCCC"), lty=1, lwd=3, cex=1.3) x_os <- c("d=0", "d=1/12", "d=1/6") plot(OLS_1[c(1,3,5)], ylim=c(0.095, 0.23), xaxt = "n", ylab='TMSE', xlab='Úroveň kontaminácie extrémnymi pozorovaniami', col = 'white', cex.lab=1.5, cex.axis=1.5) title("Scenár I., p = 5", cex.main=1.7) axis(1, at=1:3, labels=x_os, cex.axis=1.5) cl <- rainbow(11) lines(OLS_1[c(2,4,6)], col = cl[1], lwd=2) lines(MM_1[c(2,4,6)], col = cl[7], lwd=2) lines(LTS8_1[c(2,4,6)], col = "#CCCC00", lwd=2) lines(LTS6_1[c(2,4,6)], col = "#339900", lwd=2) lines(Lasso_1[c(2,4,6)], col = cl[1], lwd=2) lines(LTS_Lasso8_1[c(2,4,6)], col = "#CC33CC", lwd=2) lines(LTS_Lasso6_1[c(2,4,6)], col = cl[2], lwd=2) lines(LWS1_1[c(2,4,6)], col = cl[8], lwd=2) lines(LWS2_1[c(2,4,6)], col = "#666666", lwd=2) lines(LWS_Lasso1_1[c(2,4,6)], col = cl[8], lwd=2) lines(LWS_Lasso2_1[c(2,4,6)], col = "#33CCCC", lwd=2) legend("topleft", c("OLS", "MM", "LTS(h=48)", "LTS(h=36)", "Lasso", "LTS-lasso(h=48)", "LTS-lasso(h=36)", TeX("LWS ($\\psi_1$)"), TeX("LWS ($\\psi_2$)"), TeX("LWS-lasso ($\\psi_1$)"), TeX("LWS-lasso ($\\psi_2$)")), col = c(cl[1], cl[7], "#CCCC00", "#339900", cl[1], "#CC33CC", cl[2], cl[8], "#666666", cl[8], "#33CCCC"), lty=1, lwd=3, cex=1.3) MM_2 <- c(0.537, 0.230, 0.539, 0.227, 0.551, 0.238) Lasso_2 <- c(0.458, 0.197, 0.454, 0.191, 0.449, 0.195) LTS_Lasso8_2 <- c(8.396, 3.651, 14.599, 2.922, 9.321, 2.293) LTS_Lasso6_2 <- c(27.63, 12.03, 99.96, 15.05, 141.14, 25.14) LWS1_2 <- c(0.180, 0.050, 0.178, 0.051, 0.175, 0.053) LWS2_2 <- c(0.208, 0.042, 0.203, 0.042, 0.198, 0.042) LWS_Lasso1_2 <- c(0.185, 0.054, 0.181, 0.051, 0.178, 0.052) LWS_Lasso2_2 <- c(0.220, 0.052, 0.214, 0.044, 0.212, 0.044) par(mfrow=c(1,2)) x_os <- c("d=0", "d=1/12", "d=1/6") plot(OLS_1[c(1,3,5)], ylim=c(0.21, 140), xaxt = "n", ylab='MSE', xlab='Úroveň kontaminácie extrémnymi pozorovaniami', col = 'white', cex.lab=1.5, cex.axis=1.5) title("Scenár I., p = 20", cex.main=1.7) axis(1, at=1:3, labels=x_os, cex.axis=1.5) cl <- rainbow(11) lines(MM_2[c(1,3,5)], col = "#33CC33", lwd=2) lines(Lasso_2[c(1,3,5)], col = "#33CC33", lwd=2) lines(LTS_Lasso8_2[c(1,3,5)], col = cl[1], lwd=2) lines(LTS_Lasso6_2[c(1,3,5)], col = cl[8], lwd=2) lines(LWS1_2[c(1,3,5)], col = "#33CC33", lwd=2) lines(LWS2_2[c(1,3,5)], col = "#33CC33", lwd=2) lines(LWS_Lasso1_2[c(1,3,5)], col = "#33CC33", lwd=2) lines(LWS_Lasso2_2[c(1,3,5)], col = "#33CC33", lwd=2) legend("topleft", c("MM", "Lasso", "LTS-lasso(h=48)", "LTS-lasso(h=36)", TeX("LWS ($\\psi_1$)"), TeX("LWS ($\\psi_2$)"), TeX("LWS-lasso ($\\psi_1$)"), TeX("LWS-lasso ($\\psi_2$)")), col = c("#33CC33", "#33CC33", cl[1], cl[8], "#33CC33", "#33CC33", "#33CC33", "#33CC33"), lty=1, lwd=3, cex=1.3) x_os <- c("d=0", "d=1/12", "d=1/6") plot(OLS_1[c(1,3,5)], ylim=c(0.04, 25), xaxt = "n", ylab='TMSE', xlab='Úroveň kontaminácie extrémnymi pozorovaniami', col = 'white', cex.lab=1.5, cex.axis=1.5) title("Scenár I., p = 20", cex.main=1.7) axis(1, at=1:3, labels=x_os, cex.axis=1.5) cl <- rainbow(11) lines(MM_2[c(2,4,6)], col = cl[1], lwd=2) lines(Lasso_2[c(2,4,6)], col = cl[1], lwd=2) lines(LTS_Lasso8_2[c(2,4,6)], col = "#CC33CC", lwd=2) lines(LTS_Lasso6_2[c(2,4,6)], col = cl[8], lwd=2) lines(LWS1_2[c(2,4,6)], col = "#33CC33", lwd=2) lines(LWS2_2[c(2,4,6)], col = "#33CC33", lwd=2) lines(LWS_Lasso1_2[c(2,4,6)], col = "#33CC33", lwd=2) lines(LWS_Lasso2_2[c(2,4,6)], col = "#33CC33", lwd=2) legend("topleft", c("MM", "Lasso", "LTS-lasso(h=48)", "LTS-lasso(h=36)", TeX("LWS ($\\psi_1$)"), TeX("LWS ($\\psi_2$)"), TeX("LWS-lasso ($\\psi_1$)"), TeX("LWS-lasso ($\\psi_2$)")), col = c(cl[1], cl[1], "#CC33CC", cl[8], "#33CC33", "#33CC33", "#33CC33", "#33CC33"), lty=1, lwd=3, cex=1.3) OLS_3 <- c(0.115, 0.032, 0.117, 0.034, 0.118, 0.034) MM_3 <- c(0.110, 0.029, 0.113, 0.031, 0.112, 0.031) LTS8_3 <- c(0.123, 0.030, 0.130, 0.032, 0.128, 0.031) LTS6_3 <- c(0.115, 0.031, 0.118, 0.033, 0.115, 0.033) Lasso_3 <- c(0.115, 0.032, 0.118, 0.034, 0.118, 0.034) LTS_Lasso8_3 <- c(0.133, 0.035, 0.147, 0.039, 0.138, 0.037) LTS_Lasso6_3 <- c(0.137, 0.043, 0.154, 0.049, 0.149, 0.050) LWS1_3 <- c(0.106, 0.023, 0.106, 0.022, 0.107, 0.022) LWS2_3 <- c(0.096, 0.023, 0.096, 0.023, 0.096, 0.023) LWS_Lasso1_3 <- c(0.107, 0.023, 0.107, 0.022, 0.108, 0.023) LWS_Lasso2_3 <- c(0.098, 0.023, 0.098, 0.024, 0.099, 0.024) par(mfrow=c(1,2)) x_os <- c("d=0", "d=1/12", "d=1/6") plot(OLS_1[c(1,3,5)], ylim=c(0.09, 0.160), xaxt = "n", ylab='MSE', xlab='Úroveň kontaminácie extrémnymi pozorovaniami', col = 'white', cex.lab=1.5, cex.axis=1.5) title("Scenár II., p = 5", cex.main=1.7) axis(1, at=1:3, labels=x_os, cex.axis=1.5) cl <- rainbow(11) lines(OLS_3[c(1,3,5)], col = cl[1], lwd=2) lines(MM_3[c(1,3,5)], col = cl[2], lwd=2) lines(LTS8_3[c(1,3,5)], col = "#FC33CC", lwd=2) lines(LTS6_3[c(1,3,5)], col = "#33CC33", lwd=2) lines(Lasso_3[c(1,3,5)], col = cl[3], lwd=2) lines(LTS_Lasso8_3[c(1,3,5)], col = cl[6], lwd=2) lines(LTS_Lasso6_3[c(1,3,5)], col = cl[7], lwd=2) lines(LWS1_3[c(1,3,5)], col = "#339900", lwd=2) lines(LWS2_3[c(1,3,5)], col = cl[9], lwd=2) lines(LWS_Lasso1_3[c(1,3,5)], col = cl[10], lwd=2) lines(LWS_Lasso2_3[c(1,3,5)], col = "#666666", lwd=2) legend("topleft", c("OLS", "MM", "LTS(h=48)", "LTS(h=36)", "Lasso", "LTS-lasso(h=48)", "LTS-lasso(h=36)", TeX("LWS ($\\psi_1$)"), TeX("LWS ($\\psi_2$)"), TeX("LWS-lasso ($\\psi_1$)"), TeX("LWS-lasso ($\\psi_2$)")), col = c(cl[1], cl[2], "#FC33CC", "#33CC33", cl[3], cl[6], cl[7], "#339900", cl[9], cl[10], "#666666"), lty=1, lwd=3, cex=1.2) x_os <- c("d=0", "d=1/12", "d=1/6") plot(OLS_1[c(1,3,5)], ylim=c(0.02, 0.05), xaxt = "n", ylab='TMSE', xlab='Úroveň kontaminácie extrémnymi pozorovaniami', col = 'white', cex.lab=1.5, cex.axis=1.5) title("Scenár II., p = 5", cex.main=1.7) axis(1, at=1:3, labels=x_os, cex.axis=1.5) cl <- rainbow(11) lines(OLS_3[c(2,4,6)], col = cl[1], lwd=2) lines(MM_3[c(2,4,6)], col = cl[2], lwd=2) lines(LTS8_3[c(2,4,6)], col = "#FC33CC", lwd=2) lines(LTS6_3[c(2,4,6)], col = "#33CC33", lwd=2) lines(Lasso_3[c(2,4,6)], col = cl[3], lwd=2) lines(LTS_Lasso8_3[c(2,4,6)], col = cl[6], lwd=2) lines(LTS_Lasso6_3[c(2,4,6)], col = cl[7], lwd=2) lines(LWS1_3[c(2,4,6)], col = "#339900", lwd=2) lines(LWS2_3[c(2,4,6)], col = cl[9], lwd=2) lines(LWS_Lasso1_3[c(2,4,6)], col = cl[10], lwd=2) lines(LWS_Lasso2_3[c(2,4,6)], col = "#666666", lwd=2) legend("topleft", c("OLS", "MM", "LTS(h=48)", "LTS(h=36)", "Lasso", "LTS-lasso(h=48)", "LTS-lasso(h=36)", TeX("LWS ($\\psi_1$)"), TeX("LWS ($\\psi_2$)"), TeX("LWS-lasso ($\\psi_1$)"), TeX("LWS-lasso ($\\psi_2$)")), col = c(cl[1], cl[2], "#FC33CC", "#33CC33", cl[3], cl[6], cl[7], "#339900", cl[9], cl[10], "#666666"), lty=1, lwd=3, cex=1.2) MM_4 <- c(0.237, 0.067, 0.248, 0.071, 0.234, 0.068) Lasso_4 <- c(0.261, 0.082, 0.295, 0.094, 0.310, 0.103) LTS_Lasso8_4 <- c(0.768, 0.303, 1.075, 0.407, 1.256, 0.478) LTS_Lasso6_4 <- c(6.13, 2.57, 10.77, 3.79, 12.24, 4.56) LWS1_4 <- c(0.114, 0.016, 0.115, 0.017, 0.115, 0.019) LWS2_4 <- c(0.129, 0.014, 0.126, 0.015, 0.126, 0.017) LWS_Lasso1_4 <- c(0.123, 0.017, 0.124, 0.018, 0.122, 0.019) LWS_Lasso2_4 <- c(0.132, 0.015, 0.131, 0.017, 0.129, 0.019) par(mfrow=c(1,2)) x_os <- c("d=0", "d=1/12", "d=1/6") plot(OLS_1[c(1,3,5)], ylim=c(0.12, 12), xaxt = "n", ylab='MSE', xlab='Úroveň kontaminácie extrémnymi pozorovaniami', col = 'white', cex.lab=1.5, cex.axis=1.5) title("Scenár II., p = 20", cex.main=1.7) axis(1, at=1:3, labels=x_os, cex.axis=1.5) cl <- rainbow(11) lines(MM_4[c(1,3,5)], col = cl[1], lwd=2) lines(Lasso_4[c(1,3,5)], col = cl[1], lwd=2) lines(LTS_Lasso8_4[c(1,3,5)], col = "#33CC33", lwd=2) lines(LTS_Lasso6_4[c(1,3,5)], col = cl[7], lwd=2) lines(LWS1_4[c(1,3,5)], col = cl[8], lwd=2) lines(LWS2_4[c(1,3,5)], col = cl[8], lwd=2) lines(LWS_Lasso1_4[c(1,3,5)], col = cl[8], lwd=2) lines(LWS_Lasso2_4[c(1,3,5)], col = cl[8], lwd=2) legend("topleft", c("MM", "Lasso", "LTS-lasso(h=48)", "LTS-lasso(h=36)", TeX("LWS ($\\psi_1$)"), TeX("LWS ($\\psi_2$)"), TeX("LWS-lasso ($\\psi_1$)"), TeX("LWS-lasso ($\\psi_2$)")), col = c(cl[1], cl[1], "#33CC33", cl[7], cl[8], cl[8], cl[8], cl[8]), lty=1, lwd=3, cex=1.3) x_os <- c("d=0", "d=1/12", "d=1/6") plot(OLS_1[c(1,3,5)], ylim=c(0.015, 4.5), xaxt = "n", ylab='TMSE', xlab='Úroveň kontaminácie extrémnymi pozorovaniami', col = 'white', cex.lab=1.5, cex.axis=1.5) title("Scenár II., p = 20", cex.main=1.7) axis(1, at=1:3, labels=x_os, cex.axis=1.5) cl <- rainbow(11) lines(MM_4[c(2,4,6)], col = cl[1], lwd=2) lines(Lasso_4[c(2,4,6)], col = cl[1], lwd=2) lines(LTS_Lasso8_4[c(2,4,6)], col = "#33CC33", lwd=2) lines(LTS_Lasso6_4[c(2,4,6)], col = cl[7], lwd=2) lines(LWS1_4[c(2,4,6)], col = cl[8], lwd=2) lines(LWS2_4[c(2,4,6)], col = cl[8], lwd=2) lines(LWS_Lasso1_4[c(2,4,6)], col = cl[8], lwd=2) lines(LWS_Lasso2_4[c(2,4,6)], col = cl[8], lwd=2) legend("topleft", c("MM", "Lasso", "LTS-lasso(h=48)", "LTS-lasso(h=36)", TeX("LWS ($\\psi_1$)"), TeX("LWS ($\\psi_2$)"), TeX("LWS-lasso ($\\psi_1$)"), TeX("LWS-lasso ($\\psi_2$)")), col = c(cl[1], cl[1], "#33CC33", cl[7], cl[8], cl[8], cl[8], cl[8]), lty=1, lwd=3, cex=1.3) data = heart dim(data) data$weight ## Default method works with 'x'-matrix and y-var: heart.x <- data.matrix(heart[, 1:2]) # the X-variables heart.y <- heart[,"clength"] heart.x[,1] = heart.x[,1] - mean(heart.x[,1]) heart.x[,2] = heart.x[,2] - mean(heart.x[,2]) ltsReg(heart.x, heart.y, alpha = 0.5) dim(x) x = matrix(rnorm(100 * 20), 100, 20) y = rnorm(100) fit1 = glmnet(x, y, nlambda = 10, alpha = 1) print(fit1) coef(fit1, s = 0.01) # extract coefficients at a single value of lambda predict(fit1, newx = x[1:10, ], s = c(0.01, 0.005)) # make predictions # Load the dataset data("mtcars") # Calculate residuals residuals <- mtcars %>% mutate(residual = residuals(lm(mpg ~ wt + hp + qsec, data = .))) # Identify outliers based on residuals outliers <- residuals %>% mutate(outlier = ifelse(abs(residual) > 3.5, "outlier", "not outlier")) # Plot the residuals with outliers in red plot(mtcars$wt, residuals$residual, col = ifelse(outliers$outlier == "outlier", "red", "black")) # Fit a linear regression model fit <- lm(mpg ~ wt + hp + qsec, data = mtcars) leverage <- hatvalues(fit) # Identify leverage points based on leverage leverage_points <- data.frame(index = 1:length(leverage), leverage = leverage) %>% mutate(leverage_point = ifelse(leverage > 2 * (1 / nrow(mtcars)), "leverage point", "not leverage point")) # Plot the leverage with leverage points in blue plot(mtcars$wt, leverage, col = ifelse(leverage_points$leverage_point == "leverage point", "blue", "black")) fit <- lm(mpg ~ wt, data = mtcars) # Calculate leverage leverage <- hatvalues(fit) # Identify leverage points based on leverage leverage_points <- data.frame(index = 1:length(leverage), leverage = leverage) %>% mutate(leverage_point = ifelse(leverage > 2 * (1 / nrow(mtcars)), "leverage point", "not leverage point")) # Plot the data with leverage points in blue plot(mtcars$wt, mtcars$mpg, col = ifelse(leverage_points$leverage_point == "leverage point", "blue", "black")) # Load the dataset data("mtcars") # Fit a linear regression model fit <- lm(mpg ~ wt, data = mtcars) # Calculate residuals residuals <- residuals(fit) # Identify outliers based on residuals outliers <- data.frame(index = 1:length(residuals), residuals = residuals) %>% mutate(outlier = ifelse(abs(residuals) > 2 * sd(residuals), "outlier", "not outlier")) # Plot the data with outliers in red plot(mtcars$wt, mtcars$mpg, col = ifelse(outliers$outlier == "outlier", "red", "black")) set.seed(123) n <- 100 x <- rnorm(n) y <- 2 * x + rnorm(n, sd = 0.5) df <- data.frame(x = x, y = y) # Fit a linear regression model fit <- lm(y ~ x, data = df) # Calculate residuals and hat values residuals <- residuals(fit) hat_values <- hatvalues(fit) # Identify outliers based on residuals outliers <- data.frame(index = 1:length(residuals), residual = residuals) %>% filter(abs(residual) > 2 * sd(residuals)) # Identify leverage points based on hat values leverage_points <- data.frame(index = 1:length(hat_values), hat_value = hat_values) %>% filter(hat_value > 2 * mean(hat_values)) # Plot the scatter plot plot(df$x, df$y, col = ifelse(df$x %in% outliers$x, "red", "black"), xlab = "x", ylab = "y", pch=20, cex = 2) # Overlay the outliers and leverage points points(df[outliers$index, ], pch = 20, col = "red", cex=2) points(df[leverage_points$index, ], pch = 20, col = "blue", cex=2) set.seed(1544335) n <- 100 x <- rnorm(n) y <- 1 * x + rnorm(n, sd = 0.5) df <- data.frame(x = x, y = y) # Fit a linear regression model fit <- lm(y ~ x, data = df) # Calculate residuals and hat values residuals <- residuals(fit) hat_values <- hatvalues(fit) # Identify outliers based on residuals outliers <- data.frame(index = 1:length(residuals), residual = residuals) %>% filter(abs(residual) > 2 * sd(residuals)) # Identify leverage points based on hat values leverage_points <- data.frame(index = 1:length(hat_values), hat_value = hat_values) %>% filter(hat_value > 3*sum(hat_values)/n) # Plot the scatter plot par(mfrow=c(1,1)) plot(df$x, df$y, col = ifelse(df$x %in% outliers$x, "red", "black"), xlab = "x", ylab = "y", pch=20, cex=2, cex.lab=1.5, cex.axis=1.5) title("Graf odľahlých a vzdialených pozorovaní", cex.main=1.5) # Overlay the outliers and leverage points points(df[outliers$index, ], pch = 20, col = "red", cex=2) points(df[leverage_points$index, ], pch = 20, col = "blue", cex=2) # Add the legend to the plot legend("topleft", c("Dáta", "Odľahlé (Outliers)", "Vzdialené (Leverage Points)"), col = c("black", "red", "blue"), pch = c(20, 20, 20), cex=1.3, pt.cex=2) set.seed(1230543) x <- 1:30 y <- 2 + 0.5 * x + rnorm(30, 0, 20) df <- data.frame(x, y) # Create a data frame with and without the outlier df_without_outlier <- df df[29, "y"] = 100 df_with_outlier <- df # Fit regression lines fit_without_outlier <- lm(y ~ x, data = df_without_outlier) fit_with_outlier <- lm(y ~ x, data = df_with_outlier) par(mfrow=c(1,2)) # Plot regression lines plot(df$x, df$y, col = ifelse(df$x == 29, "red", "black"), pch=19, xlab = "x", ylab = "y", cex=2, cex.lab=1.6, cex.axis=1.5, ylim=c(-50,100), xlim=c(0,100)) abline(fit_without_outlier, col = "black", lwd=3) abline(fit_with_outlier, col = "red", lwd =3) title("Vplyv odľahlého pozorovania na OLS odhad", cex.main=1.9) legend("bottomright", c("Dáta", "Odľahlé pozorovanie", "OLS bez odľahlého poz.", "OLS s odľahlým poz."), lty=c(0,0,1,1), pch = c(20,20,NA,NA), col = c("black", "red", "black", "red"), cex=1.4, pt.cex=2) df <- data.frame(x, y) df_without_leverage <- df df[10, "x"] = 100 df_with_leverage <- df # Fit regression lines fit_without_leverage <- lm(y ~ x, data = df_without_leverage) fit_with_leverage <- lm(y ~ x, data = df_with_leverage) # Plot regression lines plot(df$x, df$y, col = ifelse(df$x == 100, "blue", "black"), pch=19, xlab = "x", ylab = "y", cex=2, cex.lab=1.6, cex.axis=1.5,ylim=c(-50,100)) abline(fit_without_leverage, col = "black", lwd=3) abline(fit_with_leverage, col = "blue", lwd =3) title("Vplyv vzdialeného pozorovania na OLS odhad", cex.main=1.9) legend("bottomright", c("Dáta", "Vzdialené pozorovanie", "OLS bez vzdialeného poz.", "OLS so vzdialeným poz."), lty=c(0,0,1,1), pch = c(20,20,NA,NA), col = c("black", "blue", "black", "blue"), cex=1.4, pt.cex=2) par(mfrow=c(2,2)) x <- seq(0, 1, length.out = 100) f_1 <- 1-x plot(x, f_1, type = "l", xlab = "x", col = "#CC0000", ylab = TeX("$\\psi_1$"), main = "Lineárne váženie", cex.lab=1.5, lwd = 3, cex.main=1.7) points(1,0, pch=20,cex=2) points(0,1, pch=20,cex=2) tau <- 0.75 f_2 <- (1 - x / tau) * (x < tau) plot(x, f_2, type = "l", xlab = "x", col = "#FFCC00", ylab = TeX("$\\psi_2$"), main = "Orezané lineárne váženie", cex.lab=1.5, lwd = 3, cex.main=1.7) points(1,0, pch=20,cex=2) points(0,1, pch=20,cex=2) s <- 10 f_4 <- (1 + exp(-s/2)) / (1 + exp(s*(x - 1/2))) plot(x, f_4, type = "l", xlab = "x", col = "#0000FF", ylab = TeX("$\\psi_3$"), main = "Logistická funkcia", cex.lab=1.5, lwd = 3, cex.main=1.7) points(1,0, pch=20,cex=2) points(0,1, pch=20,cex=2) plot(x, 1-erf(x), type = "l", xlab = "x", col = "#006600", ylab = TeX("$\\psi_4$"), main = "Chybová funkcia", ylim=c(0,1), cex.lab=1.5, lwd = 3, cex.main=1.7) points(1,0, pch=20,cex=2) points(0,1, pch=20,cex=2)