#----------------------------------------------------------------------------------------------------
#Import packages e librerie utilizzate:
library(R.matlab)
library(dbscan)
library(e1071)
library(caret)
library(NLP)
library(tm)
library(DMwR)
library(Rlof)
#Per l'installazione del pacchetto isofor:
#install.packages("devtools")
#library(devtools)
#devtools::install_github("Zelazny7/isofor")
library(isofor)
#Per leggere i dati .mat
#install.packages("rmatio")
library(rmatio)
#Per il calcolo dell'AUC:
#install.packages("pROC")
library(pROC)
#Per il KNN-kdtree:
#install.packages("RANN")
#install.packages("adamethods")
#install.packages("DDoutlier")
library(RANN)
library(adamethods)
library(DDoutlier)
library(kknn)
library(partykit)
library(rpart)
library(rpart.plot)
#--------------------------------------------------------------------------------------------------------
#-----------------------------------------------------------------------------------------------------
# Import data ----
# Reads a MATLAB dataset with feature matrix `X` and 0/1 outlier labels `y`.
rm(list = ls())
setwd("C:/...")                 # set to the directory containing the .mat file
path <- "dati.mat"              # path to the dataset
data <- readMat(path)
X <- data$X
is_out <- as.vector(data$y)     # ground-truth outlier labels (0 = inlier, 1 = outlier)
dati <- data.frame(cbind(X, is_out))
p <- dim(X)[2]; n <- dim(X)[1]  # p features, n observations
# Declare which columns are categorical (fill in the column indices):
fac <- c()                      # e.g. fac <- c(1, 3, 5); original had invalid placeholder text
for (i in seq_along(fac)) {
  # bug fix: index by fac[i]; the original converted columns 1..length(fac) instead
  dati[, fac[i]] <- as.factor(dati[, fac[i]])
}
# One-hot encoding, dropping the label column and the intercept column:
# bug fix: use -(p + 1) to drop `is_out` (the original -p dropped the last feature),
# and [, -1] to drop the intercept column ([-1] removed a single cell).
dati <- model.matrix(~ ., dati[, -(p + 1)])[, -1]
# Scaled data, for datasets that need normalisation:
X_scale <- scale(X)
#-------------------------------------------------------------------------------------------------------
# Automated split search ----
# Two functions are provided to automate the choice of a score threshold.
#
# rpart_split_finder: regress the index 1:n on the sorted-score vector with
# shallow rpart trees of depth 1..3 and collect the split points each tree
# uses; those split points are natural candidate thresholds.
#
# Args:
#   outlier.scores: numeric vector of outlier scores.
#   n: length of outlier.scores.
# Returns: unique numeric vector of candidate cut values.
rpart_split_finder <- function(outlier.scores, n) {
  x <- outlier.scores
  y <- 1:n
  splits <- c()
  for (i in 1:3) {
    set.seed(123)
    # bug fix: the original had free-text notes inside the call without `#`,
    # which made the function syntactically invalid.
    tree <- rpart(y ~ x,
                  control = rpart.control(maxdepth = i,   # trees of depth 1..3
                                          cp = 0,         # any positive improvement will do
                                          minsplit = 1,
                                          minbucket = 1,  # even leaves with 1 point are accepted
                                          xval = 0))
    tagli <- rpart.rules(tree)
    if (i != 1) {
      # deeper trees report cut values in columns 5 and 7 of the rules table
      splits <- c(splits, tagli[, 5], tagli[, 7])
    } else {
      splits <- c(splits, tagli[, 5])
    }
  }
  # Drop empty strings produced by rpart.rules before coercing to numeric:
  togliere <- c("")
  if (togliere %in% splits) { splits <- splits[which(splits != togliere)] }
  splits <- as.numeric(splits)
  return(unique(splits))
}
# ctree_split_finder: same idea as rpart_split_finder, but with conditional
# inference trees. Fits ctree of the index 1:n on the scores at depths 1..3
# and harvests the numeric cut points appearing in the fitted rules.
#
# Args:
#   outlier.scores: numeric vector of outlier scores.
#   n: length of outlier.scores.
# Returns: unique numeric vector of candidate cut values (rounded to 5 d.p.).
ctree_split_finder <- function(outlier.scores, n) {
  library(partykit)
  # NOTE: the model variables must be named `x`/`y` because the rule strings
  # are filtered on the literal token "x" below.
  x <- outlier.scores
  y <- 1:n
  noise_tokens <- c("x", "<", ">", "<=", ">=", "&", "")
  taglio <- c()
  for (depth in 1:3) {
    fitted_tree <- ctree(y ~ x, maxdepth = depth)
    rule_strings <- partykit:::.list.rules.party(fitted_tree)
    tokens <- unlist(strsplit(rule_strings, " "))
    # Keep only the numeric cut points, discarding variable names/operators:
    tokens <- tokens[!(tokens %in% noise_tokens)]
    taglio <- c(taglio, round(as.numeric(tokens), 5))
  }
  unique(taglio)
}
#------------------------------------------------------------------------------------------------------
##DBSCAN-------------------------------------------------------------------------------------------
# Three heuristics for choosing k (number of neighbours).
# Operative tip: use k2 when your dataset has p slightly smaller than n.
k1 <- round(mean(c(p + 1, log(n), 2 * p)))  # rounded: kNNdist needs an integer k
k2 <- round(max(p + 1, log(n)))
k3 <- 2 * p                                 # max(2*p) of a scalar is just 2*p
DISTANCES <- kNNdist(X, k = k1)             # k1 used as an example; pick the desired k
head(DISTANCES)
distances <- as.vector(DISTANCES)
head(distances)
# Candidate cut values for eps:
(valori <- rpart_split_finder(distances, length(distances)))
(valori2 <- ctree_split_finder(distances, length(distances)))
v1 <- 1:length(valori)
val1 <- as.data.frame(cbind(valori, v1))
#v2 <- 1:length(valori2)
#val2 <- as.data.frame(cbind(valori2, v2))
# Look at the sorted-distance plot to find the knee:
plot(sort(distances), type = "l")
abline(h = val1$valori, col = val1$v1 + 1)
#abline(h = val2$valori2, col = val2$v2 + 1)
# If no automatic cut is useful, draw a manual threshold, e.g.:
#abline(h = 0.7)   # bug fix: the original `abline(h= )` was invalid syntax
eps <- sort(valori)[4]  # pick the index of the candidate split to use as eps
# DBSCAN: points left in cluster 0 are noise, i.e. predicted outliers.
res <- dbscan(X, eps = eps, minPts = k1)    # bug fix: was undefined `k`; also try k2, k3
indicator <- res$cluster == 0
out_prev <- rep(0, n); out_prev[indicator] <- 1
sum(out_prev)
# Number of real outliers:
sum(is_out)
(tabfin <- table(out_prev, is_out))
# Attention: can be time consuming.
hullplot(X, res)
# Confusion matrix & AUC:
table(DBSCAN = out_prev, Actual = is_out)
# Performance evaluation:
roc_dbscan <- roc(response = is_out, predictor = out_prev)
(AUCDB <- auc(roc_dbscan))
(precision <- tabfin[2, 2] / sum(tabfin[2, ]))
(recall <- tabfin[2, 2] / sum(tabfin[, 2]))
##Optics-------------------------------------------------------------------------------------------
# Three heuristics for choosing k; use k2 when p is slightly smaller than n.
k1 <- round(mean(c(p + 1, log(n), 2 * p)))  # rounded: minPts must be an integer
k2 <- round(max(p + 1, log(n)))
k3 <- 2 * p
# Fix a very large eps so no point is left unreachable:
res <- optics(X, eps = 500, minPts = k1)    # k1 used as an example; pick the desired k
resultKNN <- kNN(X, k1)
head(resultKNN$id, 3)
numberid <- resultKNN$id                    # n x k matrix of neighbour indices
head(res$order, n = 15)
# Reachability distances (first one is undefined; copy the second):
distanzerech <- res$reachdist
head(distanzerech)
distanzerech[1] <- distanzerech[2]
dist <- vector()
lrd <- vector()
# Average reachability distance of each point's k nearest neighbours:
for (i in 1:n) {
  dist[i] <- mean(distanzerech[numberid[i, ]])
}
# Fix problems with replicated points (zero distances would give infinite lrd):
distmagg <- dist[dist > 0]
valore <- min(distmagg)
dist[dist == 0] <- valore / 2
lrd <- 1 / dist                             # local reachability density
hist(dist)
numeratore <- vector()
OF <- vector()
# OF: LOF-style outlier factor = mean ratio of neighbour densities to own density.
for (i in 1:n) {
  lrd_numero <- lrd[i]
  lrd_minpts <- lrd[numberid[i, ]]
  numeratore[i] <- sum(lrd_minpts / lrd_numero)
  OF[i] <- numeratore[i] / k1               # bug fix: was undefined `k`
}
summary(lrd); sum(is.na(OF)); str(OF)
(sum <- summary(OF))
# Candidate thresholds on OF:
(cfind <- rpart_split_finder(OF, length(OF)))
(rfind <- ctree_split_finder(OF, length(OF)))
plot(sort(OF), type = "l", ylim = c(sum[1], sum[6]))
abline(h = c(cfind, rfind))  # bug fix: original referenced undefined `val_opt`
rbest_taglio <- sort(rfind)[1]  # choose the index of the candidate split to use as threshold
out_previsti <- ifelse(OF > rbest_taglio, 1, 0)
# Confusion matrix & AUC (ctree_split_finder threshold):
table(Optics = out_previsti, Actual = is_out)
roc_Optics <- roc(response = is_out, predictor = out_previsti)
auc(roc_Optics)
confusion_matrix <- table(Optics = out_previsti, Actual = is_out)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]); precision
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]); recall
# Same evaluation with the rpart-based threshold:
cbest_taglio <- sort(cfind)[1]  # choose the index that seems most appropriate
out_previsti <- ifelse(OF > cbest_taglio, 1, 0)
table(Optics = out_previsti, Actual = is_out)
roc_Optics <- roc(response = is_out, predictor = out_previsti)
auc(roc_Optics)
confusion_matrix <- table(Optics = out_previsti, Actual = is_out)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]); precision
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]); recall
##LOF--------------------------------------------------------------------------------------------------
# Local Outlier Factor with k set by the usual heuristic (rounded to integer):
outlier.scores <- lof(X, k = round(mean(c(log(n), p + 1, 2 * p))))
rfind <- rpart_split_finder(outlier.scores, n)
plot(sort(outlier.scores), col = is_out[order(outlier.scores)] + 1, pch = 18)
abline(h = rfind, col = 4)
rbest_taglio <- sort(rfind)[1]  # choose the index of the candidate split to use as threshold
out_previsti <- ifelse(outlier.scores > rbest_taglio, 1, 0)
# Confusion matrix & AUC:
table(LOF = out_previsti, Actual = is_out)
# Performance evaluation:
roc_lof <- roc(response = is_out, predictor = out_previsti)
auc(roc_lof)
confusion_matrix <- table(LOF = out_previsti, Actual = is_out)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]); precision
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]); recall
# ctree-based threshold:
cfind <- ctree_split_finder(outlier.scores, n)
plot(sort(outlier.scores), col = is_out[order(outlier.scores)] + 1, pch = 19)
abline(h = cfind, col = cfind + 1)
cbest_taglio <- sort(cfind)[1]  # choose the index that seems most appropriate
# bug fix: original thresholded the 0/1 predictions instead of the scores
out_previsti <- ifelse(outlier.scores > cbest_taglio, 1, 0)
table(LOF = out_previsti, Actual = is_out)
roc_lof <- roc(response = is_out, predictor = out_previsti)
auc(roc_lof)
confusion_matrix <- table(LOF = out_previsti, Actual = is_out)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]); precision
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]); recall
##kNN----------------------------------------------------------------------------------------------
# (1) Aggregated kNN outlier score over a range of k values
#     (k bounds rounded to integers):
knn.agg <- KNN_AGG(X,
                   k_min = round(min(log(n), p + 1)),
                   k_max = round(max(log(n), p + 1)))
plot(knn.agg)
rfind <- rpart_split_finder(knn.agg, n)
plot(sort(knn.agg), col = is_out[order(knn.agg)] + 1)
abline(h = rfind)
# Choose the index of the candidate split to use as threshold; any other
# threshold value can of course be set manually.
rbest_taglio <- sort(rfind)[1]
outliers_r <- ifelse(knn.agg > rbest_taglio, 1, 0)
cfind <- ctree_split_finder(knn.agg, n)
plot(sort(knn.agg), col = is_out[order(knn.agg)] + 1, pch = 19)
abline(h = cfind, col = cfind + 1)
cbest_taglio <- sort(cfind)[1]  # bug fix: pick one threshold, not the whole vector
outliers_c <- ifelse(knn.agg > cbest_taglio, 1, 0)
# Confusion matrix & AUC (rpart threshold):
confusion_matrix <- table(KNN = outliers_r, Actual = is_out); confusion_matrix
roc_KNN <- roc(response = is_out, predictor = outliers_r)
auc(roc_KNN)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]); precision
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]); recall
# Confusion matrix & AUC (ctree threshold):
confusion_matrix <- table(KNN = outliers_c, Actual = is_out); confusion_matrix
roc_KNN <- roc(response = is_out, predictor = outliers_c)
auc(roc_KNN)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]); precision
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]); recall
# (2) kd-tree kNN: a different algorithm able to handle large datasets.
# Use as K the mean, the max, or the min of the usual heuristics:
k_mean <- round(mean(c(log(n), p + 1)), 0)
k_max <- round(max(c(log(n), p + 1)), 0)
k_min <- round(min(c(log(n), p + 1)), 0)
# In this specific case the max works much better.
# Mean:
nearest <- nn2(X, query = X, k = k_mean, treetype = c("kd"),
               searchtype = c("standard"), radius = 0, eps = 0)
nd <- nearest$nn.dists
score <- rowMeans(nd)  # mean distance to the k nearest neighbours
rfind <- rpart_split_finder(score, n)
plot(sort(score), col = is_out[order(score)] + 1)
abline(h = rfind)
rbest_taglio <- sort(rfind)[1]  # choose the index of the candidate split to use as threshold
outliers_r <- ifelse(score > rbest_taglio, 1, 0)
cfind <- ctree_split_finder(score, n)
plot(sort(score), col = is_out[order(score)] + 1, pch = 19)
abline(h = cfind, col = cfind + 1)
cbest_taglio <- sort(cfind)[1]  # bug fix: pick one threshold, not the whole vector
outliers_c <- ifelse(score > cbest_taglio, 1, 0)
# Confusion matrix & AUC (rpart threshold):
confusion_matrix <- table(KNN = outliers_r, Actual = is_out); confusion_matrix
roc_KNN <- roc(response = is_out, predictor = outliers_r)
auc(roc_KNN)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]); precision
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]); recall
# Confusion matrix & AUC (ctree threshold):
confusion_matrix <- table(KNN = outliers_c, Actual = is_out); confusion_matrix
roc_KNN <- roc(response = is_out, predictor = outliers_c)
auc(roc_KNN)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]); precision
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]); recall
# Max:
nearest <- nn2(X, query = X, k = k_max, treetype = c("kd"),
               searchtype = c("standard"), radius = 0, eps = 0)
nd <- nearest$nn.dists
score <- rowMeans(nd)  # mean distance to the k nearest neighbours
rfind <- rpart_split_finder(score, n)
plot(sort(score), col = is_out[order(score)] + 1)
abline(h = rfind)
rbest_taglio <- sort(rfind)[1]  # choose the index of the candidate split to use as threshold
outliers_r <- ifelse(score > rbest_taglio, 1, 0)
cfind <- ctree_split_finder(score, n)
plot(sort(score), col = is_out[order(score)] + 1, pch = 19)
abline(h = cfind, col = cfind + 1)
cbest_taglio <- sort(cfind)[1]  # bug fix: pick one threshold, not the whole vector
outliers_c <- ifelse(score > cbest_taglio, 1, 0)
# Confusion matrix & AUC (rpart threshold):
confusion_matrix <- table(KNN = outliers_r, Actual = is_out); confusion_matrix
roc_KNN <- roc(response = is_out, predictor = outliers_r)
auc(roc_KNN)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]); precision
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]); recall
# Confusion matrix & AUC (ctree threshold):
confusion_matrix <- table(KNN = outliers_c, Actual = is_out); confusion_matrix
roc_KNN <- roc(response = is_out, predictor = outliers_c)
auc(roc_KNN)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]); precision
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]); recall
# Min:
nearest <- nn2(X, query = X, k = k_min, treetype = c("kd"),
               searchtype = c("standard"), radius = 0, eps = 0)
nd <- nearest$nn.dists
score <- rowMeans(nd)  # mean distance to the k nearest neighbours
rfind <- rpart_split_finder(score, n)
plot(sort(score), col = is_out[order(score)] + 1)
abline(h = rfind)
rbest_taglio <- sort(rfind)[6]  # index of the candidate split chosen as threshold
outliers_r <- ifelse(score > rbest_taglio, 1, 0)
cfind <- ctree_split_finder(score, n)
plot(sort(score), col = is_out[order(score)] + 1, pch = 19)
abline(h = cfind, col = cfind + 1)
cbest_taglio <- sort(cfind)[1]  # bug fix: pick one threshold, not the whole vector
outliers_c <- ifelse(score > cbest_taglio, 1, 0)
# Confusion matrix & AUC (rpart threshold):
confusion_matrix <- table(KNN = outliers_r, Actual = is_out); confusion_matrix
roc_KNN <- roc(response = is_out, predictor = outliers_r)
auc(roc_KNN)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]); precision
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]); recall
# Confusion matrix & AUC (ctree threshold):
confusion_matrix <- table(KNN = outliers_c, Actual = is_out); confusion_matrix
roc_KNN <- roc(response = is_out, predictor = outliers_c)
auc(roc_KNN)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]); precision
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]); recall
##SVM One-Class-----------------------------------------------------------------------------------------
# One-class SVM over a grid of 4 values of nu
# (bug fix: the grid description was bare prose, not a comment):
nu_val <- c(0.01, 0.1, 0.9, 0.5)
auc_svm <- c()
precision <- c()
recall <- c()
for (el in nu_val) {
  svm.model <- svm(X, y = NULL,
                   type = 'one-classification',
                   nu = el,
                   kernel = "radial")
  svm.predtrain <- predict(svm.model, X)
  out <- rep(0, n)
  out[svm.predtrain == FALSE] <- 1  # FALSE = predicted outside the class, i.e. outlier
  roc_lof <- roc(response = is_out, predictor = out)
  auc_svm <- c(auc_svm, auc(roc_lof))
  confusion_matrix <- table(SVM = out, Actual = is_out)
  precision <- c(precision, confusion_matrix[2, 2] / sum(confusion_matrix[2, ]))
  recall <- c(recall, confusion_matrix[2, 2] / sum(confusion_matrix[, 2]))
}
auc_svm; precision; recall
# OCSVM (paper method) with k = 7 and with k = best kNN value
# (bug fix: this header was bare prose, not a comment).
# K = 7:
distanze <- kNNdist(X, k = 7)
si_K <- rowMeans(distanze)       # mean distance to the 7 nearest neighbours
plot(sort(si_K), type = "l", lwd = 2)
x <- 1:n
si_K <- sort(si_K)
# Smooth the sorted-distance curve and locate the point of maximum curvature:
fit <- smooth.spline(x, si_K, cv = TRUE)
fit$lambda
fit_opt <- smooth.spline(x, si_K, lambda = fit$lambda)
plot(x, si_K, col = "lightgray")
lines(x, predict(fit_opt)$y)
d2 <- predict(fit_opt, x, deriv = 2)$y
d1 <- predict(fit_opt, x, deriv = 1)$y
# NOTE(review): exact curvature is d2/(1+d1^2)^(3/2); this variant is kept as
# in the original implementation — confirm against the reference paper.
CF <- d2 / sqrt(1 + d1^2)
x.max_cf <- which.max(CF)
gamma <- 1 / si_K[x.max_cf]      # kernel width from the distance at the knee
SK <- si_K[1:x.max_cf]
nu <- abs((sum(SK) - x.max_cf) / sum(SK))  # data-driven estimate of nu
svm.model <- svm(X, y = NULL,
                 type = 'one-classification',
                 nu = nu, gamma = gamma,
                 scale = FALSE,
                 kernel = "radial")
svm.predtrain <- predict(svm.model, X)
out <- rep(0, n)
out[svm.predtrain == FALSE] <- 1
# Confusion matrix & AUC:
table(SVMOneClass = out, Actual = is_out)
# Performance evaluation:
roc_svmoc <- roc(response = is_out, predictor = out)
auc(roc_svmoc)
confusion_matrix <- table(SVM = out, Actual = is_out)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]); precision
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]); recall
# K = BEST kNN:
# bug fix: the original k was an invalid-placeholder; set k_best to the number
# of neighbours found with the earlier kNN analysis (k_max used as default).
k_best <- k_max  # TODO: replace with the best k found previously
distanze <- kNNdist(X, k = k_best)
si_K <- rowMeans(distanze)       # mean distance to the k_best nearest neighbours
plot(sort(si_K), type = "l", lwd = 2)
x <- 1:n
si_K <- sort(si_K)
# Smooth the sorted-distance curve and locate the point of maximum curvature:
fit <- smooth.spline(x, si_K, cv = TRUE)
fit$lambda
fit_opt <- smooth.spline(x, si_K, lambda = fit$lambda)
plot(x, si_K, col = "lightgray")
lines(x, predict(fit_opt)$y)
d2 <- predict(fit_opt, x, deriv = 2)$y
d1 <- predict(fit_opt, x, deriv = 1)$y
CF <- d2 / sqrt(1 + d1^2)        # curvature proxy, as in the k = 7 section
x.max_cf <- which.max(CF)
gamma <- 1 / si_K[x.max_cf]      # kernel width from the distance at the knee
SK <- si_K[1:x.max_cf]
nu <- abs((sum(SK) - x.max_cf) / sum(SK))  # data-driven estimate of nu
svm.model <- svm(X, y = NULL,
                 type = 'one-classification',
                 nu = nu, gamma = gamma,
                 scale = FALSE,
                 kernel = "radial")
svm.predtrain <- predict(svm.model, X)
out <- rep(0, n)
out[svm.predtrain == FALSE] <- 1
# Confusion matrix & AUC:
table(SVMOneClass = out, Actual = is_out)
# Performance evaluation:
roc_svmoc <- roc(response = is_out, predictor = out)
auc(roc_svmoc)
confusion_matrix <- table(SVM = out, Actual = is_out)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]); precision
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]); recall
##IF--------------------------------------------------------------------------------------
# Isolation Forest: 100 trees, subsample size 256.
mod <- iForest(X, nt = 100, phi = 256, seed = 123)  # bug fix: was undefined `datix`
If_score <- predict(mod, X)
plot(If_score)
rfind <- rpart_split_finder(If_score, n)
plot(sort(If_score), col = is_out[order(If_score)] + 1)
abline(h = rfind)
rbest_taglio <- sort(rfind)[1]  # choose the index of the candidate split to use as threshold
outliers_r <- ifelse(If_score > rbest_taglio, 1, 0)
cfind <- ctree_split_finder(If_score, n)
plot(sort(If_score), col = is_out[order(If_score)] + 1, pch = 19)
abline(h = cfind, col = cfind + 1)
cbest_taglio <- sort(cfind)[1]  # bug fix: pick one threshold, not the whole vector
outliers_c <- ifelse(If_score > cbest_taglio, 1, 0)
# Confusion matrix & AUC (rpart threshold):
confusion_matrix <- table(IF = outliers_r, Actual = is_out); confusion_matrix
roc_IF <- roc(response = is_out, predictor = outliers_r)
auc(roc_IF)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]); precision
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]); recall
# Confusion matrix & AUC (ctree threshold):
confusion_matrix <- table(IF = outliers_c, Actual = is_out); confusion_matrix
roc_IF <- roc(response = is_out, predictor = outliers_c)
auc(roc_IF)
precision <- confusion_matrix[2, 2] / sum(confusion_matrix[2, ]); precision
recall <- confusion_matrix[2, 2] / sum(confusion_matrix[, 2]); recall