R: dplyr
Package to interact easily with tables
install.packages('dplyr')
library(dplyr)
tab <- tbl_df(originaltable)
Five basic functions: select(), filter(), arrange(), mutate() e summarize()
Select
To select columns
select(tab, col_1, col_5, col_4, col_20)
#from col_1 to col_5, and col_20:
select(tab, col_1:col_5, col_20)
#all except
select(tab, -col_5)
select(tab, -(col_1:col_5))
Filter
To filter rows
filter(tab, col_1 == "value")
#comma equals to AND, | to OR:
filter(tab, col_1 == "value", col_5 < "3.0", col_3 != "y")
filter(tab, col_1 == "value" | col_5 < "3.0")
#exclude missing values
filter(tab, !is.na(col_1))
Arrange
To order
arrange(tab, col_5, col_1) #asc
arrange(tab, desc(col_5), col_1) #desc and asc
Mutate
New values based on current values
mutate(tab, col_21 = (col_5*col_20)+2)
Summarize
To summarize the dataset
summarize(tab, avg_1 = mean(col_1))
# very useful with the group_by() function
summarize(group_by(tab, col_1), mean(col_2))
summarize(group_by(tab, col_1), mean(col_2), n(), n_distinct(col_2), max(col_2), min(col_2), sd(col_2))
We can concatenate the different function
arrange(filter(summarize(group_by(tabella, col_1), numero = n(), media = mean(col_2)), col_5 < "3.0"), desc(media))
#same result, different code
tabella %>% group_by(col_1) %>% summarize(numero = n(), media = mean(col_2)) %>% filter(col_5 < "3.0") %>% arrange(desc(media))
Join
inner_join(), left_join() right_join(), full_join(), semi_join(), anti_join()
inner_join(tab1, tab2, by=c("FieldTab1"="FieldTab2"))
Top_n
first n values
#first 10 rows ordered by col
top_n(tab, 10, col)