Monotono grupisanje numeričkih risk faktora
Last updated
Was this helpful?
Last updated
Was this helpful?
Zadatak: Fajl dat u nastavku, prvo importovati, a zatim, u odnosu na binarnu zavisnu varijablu target
izvršiti monotono grupisanje varijable maturity
i to putem sljedećih metoda:
iterativnog grupisanja percentila;
izotonične regresije;
iterativne selekcije maksimalne kumulativne stope zavisne varijable (metod u praksi poznat kao Monotone Adjacent Pooling Algorithm - MAPA).
> #naredne komande izvrsiti ukoliko paketi vec nisu instalirani
> #install.packages("Hmisc")
> #install.packages("dplyr")
> library(Hmisc)
> library(dplyr)
> #pomocna funckija za kreiranje grupa varijable maturity
> slice.variable <- function(x.orig, x.lb, x.ub) {
+ lx <- length(x.orig)
+ lg <- length(x.lb)
+ x.trans <- rep(NA, lx)
+ x.lb.lag <- c(x.lb[-1], x.ub[lg])
+ for(i in 1:lg) {
+ x.lb.l <- x.lb[i]
+ x.lb.lag.l <- x.lb.lag[i]
+ x.ub.l <- x.ub[i]
+ bin.n <- sprintf("%02d", i)
+ bin.f <- ifelse(x.lb.l == x.ub.l,
+ paste0(bin.n, " [", x.lb.l, "]"),
+ paste0(bin.n, " [", x.lb.l, ",", x.lb.lag.l, ")"))
+ rep.indx <- which(x.orig >= x.lb.l & x.orig <= x.ub.l)
+ x.trans[rep.indx] <- bin.f
+ }
+ return(x.trans)
+ }
> #metod percentila
> #kreirati funckiju za grupisanje
> monobin.pct <- function(y, x, num.g) {
+ d <- data.frame(y, x)
+ d <- d[complete.cases(d), ]
+ repeat {
+ d$bin = cut2(d$x, g = num.g)
+ ds <- d %>%
+ group_by(bin) %>%
+ summarise(no = n(),
+ y.avg = mean(y),
+ x.avg = mean(x),
+ x.min = min(x),
+ x.max = max(x))
+ cor.coef <- cor(ds$y.avg, ds$x.avg, method = "spearman")
+ if(abs(cor.coef) == 1 | num.g == 2) {break}
+ num.g <- num.g - 1
+ }
+ return(ds)
+ }
> #primijeniti funkciju na zavisnu varijablu target i varijablu maturity
> cut.pts <- monobin.pct(y = db$target, x = db$maturity, num.g = 10)
> cut.pts
# A tibble: 3 x 6
bin no y.avg x.avg x.min x.max
<fct> <int> <dbl> <dbl> <int> <int>
1 [ 4,13) 3973 0.0327 9.65 4 12
2 [13,26) 4090 0.0489 20.3 13 24
3 [26,72] 1937 0.0878 38.2 26 72
> #kreiranje grupa
> db$maturity.pct <- slice.variable(x.orig = db$maturity,
+ x.lb = cut.pts$x.min,
+ x.ub = cut.pts$x.max)
> #provjera
> db %>%
+ group_by(mat.bin = maturity.pct) %>%
+ summarise(no = n(),
+ target.avg = mean(target, na.rm = TRUE),
+ mat.avg = mean(maturity, na.rm = TRUE),
+ mat.min = min(maturity, na.rm = TRUE),
+ mat.max = max(maturity, na.rm = TRUE))
# A tibble: 3 x 6
mat.bin no target.avg mat.avg mat.min mat.max
<chr> <int> <dbl> <dbl> <int> <int>
1 01 [4,13) 3973 0.0327 9.65 4 12
2 02 [13,26) 4090 0.0489 20.3 13 24
3 03 [26,72) 1937 0.0878 38.2 26 72