Agregacije podataka
Zadatak: Generisati vještački data frame db
, a zatim izračunati aritmetičku sredinu i standardnu devijaciju varijabli x
i y
, dok za varijablu z
izračunati maksimalni datum u odnosu na modalitete varijable w
.
> #definisati data frame
> set.seed(2021)
> db <- data.frame(x = rnorm(1000, 10, 2),
+ y = runif(1000, 200, 800),
+ z = as.Date("2021-03-31") - 1:1000,
+ w = sample(letters[1:5], 1000, rep = TRUE))
> str(db)
'data.frame': 1000 obs. of 4 variables:
$ x: num 9.76 11.1 10.7 10.72 11.8 ...
$ y: num 675 574 524 282 206 ...
$ z: Date, format: "2021-03-30" "2021-03-29" ...
$ w: chr "d" "d" "e" "d" ...
> #izracunati aritmeticku sredinu kolona x i y
> aggregate(x = db[, c("x", "y")],
+ by = list("Group" = db$w),
+ FUN = "mean")
Group x y
1 a 10.039274 491.7409
2 b 10.035221 479.6193
3 c 10.182541 495.6854
4 d 9.823664 513.8679
5 e 10.042712 489.2177
> #izracunati aritmeticku sredinu i standardnu devijaciju kolona x i y
> aggregate(x = db[, c("x", "y")],
+ by = list("Group" = db$w),
+ FUN = function(x) {c("avg" = mean(x), "stdev" = sd(x))})
Group x.avg x.stdev y.avg y.stdev
1 a 10.039274 1.864388 491.7409 174.8500
2 b 10.035221 2.070886 479.6193 179.2915
3 c 10.182541 2.235355 495.6854 174.2087
4 d 9.823664 2.027051 513.8679 172.6692
5 e 10.042712 1.946465 489.2177 176.1243
> #izracunati maksimalni datum kolone z
> aggregate(x = db[, "z"],
+ by = list("Group" = db$w),
+ FUN = "max")
Group x
1 a 2021-03-23
2 b 2021-03-16
3 c 2021-03-25
4 d 2021-03-30
5 e 2021-03-28
> #narednu komandu izvrsiti ukoliko data.table paket vec nije instaliran
> #install.packages("data.table")
> library(data.table)
> #definisati data.table objekat
> dt <- data.table(db)
> dt
x y z w
1: 9.755080 674.5482 2021-03-30 d
2: 11.104913 573.7752 2021-03-29 d
3: 10.697299 523.7577 2021-03-28 e
4: 10.719264 281.7932 2021-03-27 d
5: 11.796107 205.7385 2021-03-26 d
---
996: 8.352168 328.0673 2018-07-09 e
997: 12.100994 446.9176 2018-07-08 c
998: 12.314042 248.5114 2018-07-07 c
999: 9.525164 503.5328 2018-07-06 b
1000: 11.286432 371.5646 2018-07-05 e
> #izracunati aritmeticku sredinu i standardnu devijaciju kolona x i y
> #izracunti maksimalni datum kolone z
> dt[,list(avg.x = mean(x),
+ avg.y = mean(y),
+ stdev.x = sd(x),
+ stdev.y = sd(y),
+ max.z = max(z)),
+ by = w]
w avg.x avg.y stdev.x stdev.y max.z
1: d 9.823664 513.8679 2.027051 172.6692 2021-03-30
2: e 10.042712 489.2177 1.946465 176.1243 2021-03-28
3: c 10.182541 495.6854 2.235355 174.2087 2021-03-25
4: a 10.039274 491.7409 1.864388 174.8500 2021-03-23
5: b 10.035221 479.6193 2.070886 179.2915 2021-03-16
> #narednu komandu izvrsiti ukoliko dplyr paket vec nije instaliran
> #install.packages("dplyr")
> library(dplyr)
> #izracunati aritmeticku sredinu i standardnu devijaciju kolona x i y
> #izracunti maksimalni datum kolone z
> db %>%
+ group_by(w) %>%
+ summarise(avg.x = mean(x),
+ avg.y = mean(y),
+ stdev.x = sd(x),
+ stdev.x = sd(x),
+ max.z = max(z))
`summarise()` ungrouping output (override with `.groups` argument)
# A tibble: 5 x 5
w avg.x avg.y stdev.x max.z
<chr> <dbl> <dbl> <dbl> <date>
1 a 10.0 492. 1.86 2021-03-23
2 b 10.0 480. 2.07 2021-03-16
3 c 10.2 496. 2.24 2021-03-25
4 d 9.82 514. 2.03 2021-03-30
5 e 10.0 489. 1.95 2021-03-28
> #naredne komande izvrsiti ukoliko dtplyr i dplyr paketi vec nisu instalirani
> #install.packages("dtplyr")
> #install.packages("dplyr")
> library(dtplyr)
> library(dplyr)
>
> dt <- lazy_dt(db)
> #izracunati aritmeticku sredinu i standardnu devijaciju kolona x i y
> #izracunti maksimalni datum kolone z
> dt %>%
+ group_by(w) %>%
+ summarise(avg.x = mean(x),
+ avg.y = mean(y),
+ stdev.x = sd(x),
+ stdev.x = sd(x),
+ max.z = max(z))
Source: local data table [5 x 6]
Call: `_DT4`[, .(avg.x = mean(x), avg.y = mean(y), stdev.x = sd(x),
stdev.x = sd(x), max.z = max(z)), keyby = .(w)]
w avg.x avg.y stdev.x stdev.x max.z
<chr> <dbl> <dbl> <dbl> <dbl> <date>
1 a 10.0 492. 1.86 1.86 2021-03-23
2 b 10.0 480. 2.07 2.07 2021-03-16
3 c 10.2 496. 2.24 2.24 2021-03-25
4 d 9.82 514. 2.03 2.03 2021-03-30
5 e 10.0 489. 1.95 1.95 2021-03-28
# Use as.data.table()/as.data.frame()/as_tibble() to access results
> #narednu komandu izvrsiti ukoliko plyr paket vec nije instaliran
> #install.packages("plyr")
> library(plyr)
> #izracunati aritmeticku sredinu i standardnu devijaciju kolona x i y
> #izracunti maksimalni datum kolone z
> ddply(db, .(w), summarise,
+ avg.x = mean(x),
+ avg.y = mean(y),
+ stdev.x = sd(x),
+ stdev.x = sd(x),
+ max.z = max(z))
w avg.x avg.y stdev.x max.z
1 a 10.039274 491.7409 1.864388 2021-03-23
2 b 10.035221 479.6193 2.070886 2021-03-16
3 c 10.182541 495.6854 2.235355 2021-03-25
4 d 9.823664 513.8679 2.027051 2021-03-30
5 e 10.042712 489.2177 1.946465 2021-03-28
> #narednu komandu izvrsiti ukoliko doBy paket vec nije instaliran
> #install.packages("doBy")
> library(doBy)
> #izracunati aritmeticku sredinu i standardnu devijaciju kolona x i y
> summaryBy(x + y ~ w, data = db, FUN = c(mean, sd))
w x.mean y.mean x.sd y.sd
1 a 10.039274 491.7409 1.864388 174.8500
2 b 10.035221 479.6193 2.070886 179.2915
3 c 10.182541 495.6854 2.235355 174.2087
4 d 9.823664 513.8679 2.027051 172.6692
5 e 10.042712 489.2177 1.946465 176.1243
> #izracunti maksimalni datum kolone z
> db.s <- summaryBy(z ~ w, data = db, FUN = max)
> db.s
w z.max
1 a 18709
2 b 18702
3 c 18711
4 d 18716
5 e 18714
> #transformisati numericku vrijednost u datum
> db.s$z.max <- as.Date(db.s$z.max, origin = "1970-01-01")
> db.s
w z.max
1 a 2021-03-23
2 b 2021-03-16
3 c 2021-03-25
4 d 2021-03-30
5 e 2021-03-28
>
> #origin se u R-u moze naci kao:
> Sys.Date() - as.numeric(Sys.Date())
[1] "1970-01-01"
> #narednu komandu izvrsiti ukoliko sqldf paket vec nije instaliran
> #install.packages("sqldf")
> library(sqldf)
> #izracunati aritmeticku sredinu i standardnu devijaciju kolona x i y
> #izracunti maksimalni datum kolone z
> sql.s <- sqldf("SELECT w,
+ AVG(x) as avg_x,
+ AVG(y) as avg_y,
+ STDEV(x) as sd_x,
+ STDEV(y) as sd_y,
+ max(z) as max_z
+ FROM db
+ GROUP BY w")
> sql.s
w avg_x avg_y sd_x sd_y max_z
1 a 10.039274 491.7409 1.864388 174.8500 18709
2 b 10.035221 479.6193 2.070886 179.2915 18702
3 c 10.182541 495.6854 2.235355 174.2087 18711
4 d 9.823664 513.8679 2.027051 172.6692 18716
5 e 10.042712 489.2177 1.946465 176.1243 18714
> #transformisati numericku vrijednost u datum
> sql.s$max_z <- as.Date(sql.s$max_z, origin = "1970-01-01")
> sql.s
w avg_x avg_y sd_x sd_y max_z
1 a 10.039274 491.7409 1.864388 174.8500 2021-03-23
2 b 10.035221 479.6193 2.070886 179.2915 2021-03-16
3 c 10.182541 495.6854 2.235355 174.2087 2021-03-25
4 d 9.823664 513.8679 2.027051 172.6692 2021-03-30
5 e 10.042712 489.2177 1.946465 176.1243 2021-03-28
>
> #drugi nacin koji zadrzava format datuma za kolonu z:
> #neophodno je imati isti naziv rezultata agregatne funkcije
> #kako bi se zadrzao format datuma
> sqldf("SELECT w,
+ AVG(x) as avg_x,
+ AVG(y) as avg_y,
+ STDEV(x) as sd_x,
+ STDEV(y) as sd_y,
+ max(z) as z
+ FROM db
+ GROUP BY w")
w avg_x avg_y sd_x sd_y z
1 a 10.039274 491.7409 1.864388 174.8500 2021-03-23
2 b 10.035221 479.6193 2.070886 179.2915 2021-03-16
3 c 10.182541 495.6854 2.235355 174.2087 2021-03-25
4 d 9.823664 513.8679 2.027051 172.6692 2021-03-30
5 e 10.042712 489.2177 1.946465 176.1243 2021-03-28
Prilikom rada sa većim brojem podataka, performanse funkcija iz različitih paketa znatno se razlikuju. U pogledu vremena izvršenja agregacija, paket data.table
ima najbolje rezultate, dok se u pogledu jednostavnosti i razumljivosti koda izdvojio paket dplyr
.
Kombinujući najbolje karakteristike ova dva paketa, nastao je paket dtplyr
koji sve više dobija na popularnosti.
Last updated
Was this helpful?