Monotono grupisanje numeričkih risk faktora
Last updated
Was this helpful?
Last updated
Was this helpful?
Zadatak: Fajl dat u nastavku, prvo importovati, a zatim, u odnosu na binarnu zavisnu varijablu target
izvršiti monotono grupisanje varijable maturity
i to putem sljedećih metoda:
iterativnog grupisanja percentila;
izotonične regresije;
iterativne selekcije maksimalne kumulativne stope zavisne varijable (metod u praksi poznat kao Monotone Adjacent Pooling Algorithm - MAPA).
> #naredne komande izvrsiti ukoliko paketi vec nisu instalirani
> #install.packages("Hmisc")
> #install.packages("dplyr")
> library(Hmisc)
> library(dplyr)
> #pomocna funckija za kreiranje grupa varijable maturity
> slice.variable <- function(x.orig, x.lb, x.ub) {
+ lx <- length(x.orig)
+ lg <- length(x.lb)
+ x.trans <- rep(NA, lx)
+ x.lb.lag <- c(x.lb[-1], x.ub[lg])
+ for(i in 1:lg) {
+ x.lb.l <- x.lb[i]
+ x.lb.lag.l <- x.lb.lag[i]
+ x.ub.l <- x.ub[i]
+ bin.n <- sprintf("%02d", i)
+ bin.f <- ifelse(x.lb.l == x.ub.l,
+ paste0(bin.n, " [", x.lb.l, "]"),
+ paste0(bin.n, " [", x.lb.l, ",", x.lb.lag.l, ")"))
+ rep.indx <- which(x.orig >= x.lb.l & x.orig <= x.ub.l)
+ x.trans[rep.indx] <- bin.f
+ }
+ return(x.trans)
+ }
> #metod percentila
> #kreirati funckiju za grupisanje
> monobin.pct <- function(y, x, num.g) {
+ d <- data.frame(y, x)
+ d <- d[complete.cases(d), ]
+ repeat {
+ d$bin = cut2(d$x, g = num.g)
+ ds <- d %>%
+ group_by(bin) %>%
+ summarise(no = n(),
+ y.avg = mean(y),
+ x.avg = mean(x),
+ x.min = min(x),
+ x.max = max(x))
+ cor.coef <- cor(ds$y.avg, ds$x.avg, method = "spearman")
+ if(abs(cor.coef) == 1 | num.g == 2) {break}
+ num.g <- num.g - 1
+ }
+ return(ds)
+ }
> #primijeniti funkciju na zavisnu varijablu target i varijablu maturity
> cut.pts <- monobin.pct(y = db$target, x = db$maturity, num.g = 10)
> cut.pts
# A tibble: 3 x 6
bin no y.avg x.avg x.min x.max
<fct> <int> <dbl> <dbl> <int> <int>
1 [ 4,13) 3973 0.0327 9.65 4 12
2 [13,26) 4090 0.0489 20.3 13 24
3 [26,72] 1937 0.0878 38.2 26 72
> #kreiranje grupa
> db$maturity.pct <- slice.variable(x.orig = db$maturity,
+ x.lb = cut.pts$x.min,
+ x.ub = cut.pts$x.max)
> #provjera
> db %>%
+ group_by(mat.bin = maturity.pct) %>%
+ summarise(no = n(),
+ target.avg = mean(target, na.rm = TRUE),
+ mat.avg = mean(maturity, na.rm = TRUE),
+ mat.min = min(maturity, na.rm = TRUE),
+ mat.max = max(maturity, na.rm = TRUE))
# A tibble: 3 x 6
mat.bin no target.avg mat.avg mat.min mat.max
<chr> <int> <dbl> <dbl> <int> <int>
1 01 [4,13) 3973 0.0327 9.65 4 12
2 02 [13,26) 4090 0.0489 20.3 13 24
3 03 [26,72) 1937 0.0878 38.2 26 72
> #izotonicna regresija
> #kreirati funckiju za grupisanje
> monobin.iso <- function(y, x) {
+ d <- data.frame(y, x)
+ d <- d[complete.cases(d), ]
+ d <- d[order(d$x), ]
+ cor.coef <- cor(d$y, d$x, method = "spearman")
+ iso.r <- isoreg(x = d$x, y = (cor.coef / abs(cor.coef)) * d$y)
+ d$yhat <- iso.r$yf
+ ds <- d %>%
+ group_by(yhat) %>%
+ summarise(no = n(),
+ y.avg = mean(y),
+ x.avg = mean(x),
+ x.min = min(x),
+ x.max = max(x))
+ return(ds)
+ }
> #primijeniti funkciju na zavisnu varijablu target i variajblu maturity
> cut.pts <- monobin.iso(y = db$target, x = db$maturity)
> cut.pts
# A tibble: 10 x 6
yhat no y.avg x.avg x.min x.max
<dbl> <int> <dbl> <dbl> <int> <int>
1 0 86 0 4.10 4 5
2 0.0117 1024 0.0117 6.06 6 7
3 0.0350 1029 0.0350 9.50 8 11
4 0.0392 2655 0.0392 12.9 12 15
5 0.0541 3291 0.0541 21.7 16 26
6 0.0596 503 0.0596 29.3 27 30
7 0.0737 936 0.0737 37.0 33 42
8 0.128 39 0.128 45.8 45 47
9 0.147 435 0.147 51.0 48 60
10 1 2 1 72 72 72
> #kreiranje grupa
> db$maturity.iso <- slice.variable(x.orig = db$maturity,
+ x.lb = cut.pts$x.min,
+ x.ub = cut.pts$x.max)
> #provjera
> db %>%
+ group_by(mat.bin = maturity.iso) %>%
+ summarise(no = n(),
+ target.avg = mean(target, na.rm = TRUE),
+ mat.avg = mean(maturity, na.rm = TRUE),
+ mat.min = min(maturity, na.rm = TRUE),
+ mat.max = max(maturity, na.rm = TRUE))
# A tibble: 10 x 6
mat.bin no target.avg mat.avg mat.min mat.max
<chr> <int> <dbl> <dbl> <int> <int>
1 01 [4,6) 86 0 4.10 4 5
2 02 [6,8) 1024 0.0117 6.06 6 7
3 03 [8,12) 1029 0.0350 9.50 8 11
4 04 [12,16) 2655 0.0392 12.9 12 15
5 05 [16,27) 3291 0.0541 21.7 16 26
6 06 [27,33) 503 0.0596 29.3 27 30
7 07 [33,45) 936 0.0737 37.0 33 42
8 08 [45,48) 39 0.128 45.8 45 47
9 09 [48,72) 435 0.147 51.0 48 60
10 10 [72] 2 1 72 72 72
> #iterativna selekcija maksimalne kumulativne vjerovantoce defaulta
> #u praksi ovaj metod je poznat kao MAPA (Monotone Adjacent Pooling Algorithm)
> #kreirati funckiju za grupisanje
> monobin.cum <- function(y, x, num.g) {
+ d <- data.frame(y, x)
+ d <- d[complete.cases(d), ]
+ cor.coef <- cor(d$y, d$x, method = "spearman")
+ d$bin = cut2(d$x, g = num.g)
+ ds <- d %>%
+ group_by(bin) %>%
+ summarise(no = n(),
+ nb = sum(y),
+ ng = sum(y%in%0),
+ y.avg = mean(y),
+ x.avg = mean(x),
+ x.min = min(x),
+ x.max = max(x))
+ ds <- ds[order(ds$bin, decreasing = ifelse(sign(cor.coef) == 1, TRUE, FALSE)), ]
+ if(all(diff(ds$y.avg) > 0)) {
+ res.s <- cbind.data.frame(ds, mapa.c = 1:nrow(ds))
+ return(res.s)
+ }
+ tbl.i <- ds
+ cp <- c()
+ repeat {
+ cs <- cumsum(tbl.i[, "nb"]) / cumsum(tbl.i[, "no"])
+ indx <- which(cs == max(cs))[1]
+ if(indx == nrow(tbl.i)) {cp <- c(cp, indx); break}
+ cp <- c(cp, indx)
+ tbl.i <- tbl.i[-(1:indx), ]
+ }
+ if(length(cp) == 1) {
+ res <- cbind.data.frame(tbl, mapa.c = "UNSUCCESSFUL", stringsAsFactors = FALSE)
+ return(res)
+ } else {
+ mapa.c <- rep(cumsum(cp), times = cp)
+ res <- cbind.data.frame(ds, mapa.c)
+ }
+ res.s <- res %>%
+ group_by(mapa.c) %>%
+ summarise(no = sum(no),
+ nb = sum(nb),
+ ng = sum(ng),
+ x.min = min(x.min),
+ x.max = max(x.max)) %>%
+ ungroup() %>%
+ mutate(y.avg = nb / no) %>%
+ select(-mapa.c)
+ bin <- paste0("[", res.s$x.min, ", ", res.s$x.max, "]")
+ res.s <- cbind.data.frame(bin = bin, res.s)
+ res.s <- res.s[order(res.s$x.min), ]
+ return(res.s)
+ }
> #primijeniti funkciju na zavisnu varijablu target i varijablu maturity
> cut.pts <- monobin.cum(y = db$target, x = db$maturity, num.g = 15)
> cut.pts
bin no nb ng x.min x.max y.avg
5 [4, 6] 1045 12 1033 4 6 0.01148325
4 [7, 15] 3749 140 3609 7 15 0.03734329
3 [16, 24] 3269 178 3091 16 24 0.05445090
2 [26, 36] 1273 93 1180 26 36 0.07305577
1 [39, 72] 664 77 587 39 72 0.11596386
> #kreiranje grupa
> db$maturity.cum <- slice.variable(x.orig = db$maturity,
+ x.lb = cut.pts$x.min,
+ x.ub = cut.pts$x.max)
> #provjera
> db %>%
+ group_by(mat.bin = maturity.cum) %>%
+ summarise(no = n(),
+ target.avg = mean(target, na.rm = TRUE),
+ mat.avg = mean(maturity, na.rm = TRUE),
+ mat.min = min(maturity, na.rm = TRUE),
+ mat.max = max(maturity, na.rm = TRUE))
# A tibble: 5 x 6
mat.bin no target.avg mat.avg mat.min mat.max
<chr> <int> <dbl> <dbl> <int> <int>
1 01 [4,7) 1045 0.0115 5.84 4 6
2 02 [7,16) 3749 0.0373 11.8 7 15
3 03 [16,26) 3269 0.0545 21.7 16 24
4 04 [26,39) 1273 0.0731 33.1 26 36
5 05 [39,72) 664 0.116 48.0 39 72
U praksi, analitičari prikazane metode najčešće kombinuju sa korekcijama za minimalni broj podataka i minimalnu stopu zavisne varijable po pojedinačnim grupama. Takođe, čest je slučaj da se koriste i različiti statistički testovi ili ekspertski odrede pragovi značajnosti za dalje spajanje inicijalnih grupa, u cilju dobijanja monotonih grupa sa značajnim razlikama prosječnih vrijednosti nezavisne varijable. Prikazane metode, kao i njihova kombinacija sa dodatnih grupisanjem, vrlo jednostavno mogu biti prilagođenje i za neprekidne zavisne varijable (što je obično slučaj kod LGD i EAD modela). Detalji i dodatne metode prikazani su u R paketima i .