Here are some R tools that I have used successfully for handling, tidying up, ordering, reshaping, subsetting, merging, and summarising datasets with upwards of a million cells.
rm(list=ls()); ls()
library(pacman) # loads packages, installing them if they aren't present
data(iris) # example dataset
str(iris)
# summarize:
pacman::p_load(doBy)
# summarizes data according to conditions expressed in a model form
iris_summary <- doBy::summaryBy(cbind(Sepal.Length, Petal.Length) ~ Species,
# alternatively: Sepal.Length + Petal.Length ~ Species,
FUN=c(mean, sd),
data=iris
)
# subset:
setosa.sepal <- droplevels(subset(iris, Species=="setosa",
select=-c(Petal.Length, Petal.Width)
)
)
# droplevels() makes subset() "forget" about unused levels, so that they don't show up in str()
# another example:
versicolor.petal <- droplevels(subset(iris,
Species=="versicolor",
select=c(Species, Petal.Length, Petal.Width)
)
)
# other ways of subsetting:
selected_spp <- c("setosa","versicolor")
set.vers <- iris[iris$Species %in% selected_spp,]
# order:
set.vers.ordered <- with(set.vers, set.vers[order(Sepal.Length), ])
# change a column's content, e.g. add Genus to Species:
iris$Species_binomial <- paste("Iris", iris$Species, sep="_")
# split column content into multiple columns:
iris$Genus <- do.call(rbind, strsplit(iris$Species_binomial, split="_"))[,1]
# or:
pacman::p_load(stringr)
iris$Species <- str_split_fixed(iris$Species_binomial, pattern="_", n=2)[,2]
# reshape:
# change the format of setosa.sepal from "wide" to "long":
# see https://stackoverflow.com/questions/2185252/reshaping-data-frame-from-wide-to-long-format
pacman::p_load(tidyr)
setosa.sepal$uniqueID <- paste("ID", 1:length(setosa.sepal[,1]), sep="_")
# gather() gets mad if rows don't have a unique identifier
setosa.sepal.long <- gather(setosa.sepal,
key = "Variable",
value = "Measurement",
-c(uniqueID, Species)
)
# turn back to wide format:
setosa.sepal.wide <- spread(setosa.sepal.long,
key = "Variable",
value = "Measurement"
)
# gather() and spread() have been superseded by pivot_wider() and pivot_longer()
# merge setosa.sepal and versicolor.petal:
merged_df <- plyr::join_all(list(setosa.sepal, versicolor.petal),
type='full')
Did you enjoy this? Consider joining my on-line course “First steps in data analysis with R” and learn data analysis from zero to hero!