This tutorial demonstrates how to perform data manipulation using the dsTidyverse
and dsTidyverseClient
packages. These are community packages which implement selected functions from the tidyverse
collection of R packages. To learn about all the functionality available you can view the vignette provided with these packages.
library(dsBaseClient)
library(dsTidyverseClient)
ds.rename
renames columns within a server-side data frame. Column names are passed unquoted to tidy_expr
and are evaluated as column names within df.name
.
ds.colnames("titanic")
## $server1
## [1] "entity_id" "Survived" "Pclass" "Name" "Sex" "Age"
## [7] "SibSp" "Parch" "Ticket" "Fare" "Cabin" "Embarked"
##
## $server2
## [1] "Survived" "Pclass" "Name" "Sex" "Age" "SibSp" "Parch"
## [8] "Ticket" "Fare" "Cabin" "Embarked"
ds.rename(
df.name = "titanic",
tidy_expr = list(person_name = Name, person_class = Pclass),
newobj = "new_df")
ds.colnames("new_df")
## $server1
## [1] "entity_id" "Survived" "person_class" "person_name" "Sex"
## [6] "Age" "SibSp" "Parch" "Ticket" "Fare"
## [11] "Cabin" "Embarked"
##
## $server2
## [1] "Survived" "person_class" "person_name" "Sex" "Age"
## [6] "SibSp" "Parch" "Ticket" "Fare" "Cabin"
## [11] "Embarked"
ds.select
creates a subset of columns in a server-side data frame. Column names are passed unquoted to tidy_expr
and are evaluated as column names within df.name
.
ds.select(
df.name = "titanic",
tidy_expr = list(Sex, Age, Survived),
newobj = "titanic_2")
ds.colnames("titanic_2")
## $server1
## [1] "Sex" "Age" "Survived"
##
## $server2
## [1] "Sex" "Age" "Survived"
ds.select(
df.name = "titanic",
tidy_expr = list(Sex:Cabin),
newobj = "titanic_3")
ds.colnames("titanic_3")
## $server1
## [1] "Sex" "Age" "SibSp" "Parch" "Ticket" "Fare" "Cabin"
##
## $server2
## [1] "Sex" "Age" "SibSp" "Parch" "Ticket" "Fare" "Cabin"
ds.select(
df.name = "titanic",
tidy_expr = list(!SibSp),
newobj = "titanic_4")
ds.colnames("titanic_4")
## $server1
## [1] "entity_id" "Survived" "Pclass" "Name" "Sex" "Age"
## [7] "Parch" "Ticket" "Fare" "Cabin" "Embarked"
##
## $server2
## [1] "Survived" "Pclass" "Name" "Sex" "Age" "Parch" "Ticket"
## [8] "Fare" "Cabin" "Embarked"
Subset set a data frame flexibly, retaining all rows that satisfy the condition supplied to tidy_expr
ds.dim("titanic")
## $`dimensions of titanic in server1`
## [1] 444 12
##
## $`dimensions of titanic in server2`
## [1] 447 11
##
## $`dimensions of titanic in combined studies`
## [1] 891 12
ds.filter(
df.name = "titanic",
tidy_expr = list(Survived == 1 & Sex == "female"),
newobj = "titanic_5",
datasources = conns)
ds.dim("titanic_5")
## $`dimensions of titanic_5 in server1`
## [1] 128 12
##
## $`dimensions of titanic_5 in server2`
## [1] 105 11
##
## $`dimensions of titanic_5 in combined studies`
## [1] 233 12
ds.mutate
creates new columns in a server-side dataframe. These new columns are normally transformations of existing columns. Again, column names are passed unquoted to tidy_expr
and are evaluated as column names within df.name
.
ds.mutate(
df.name = "titanic",
tidy_expr = list(age_months = Age*12),
newobj = "titanic_6")
ds.mean("titanic_6$age_months")$Mean.by.Study
## EstimatedMean Nmissing Nvalid Ntotal
## server1 343.9244 87 357 444
## server2 368.8545 90 357 447
ds.if_else
allows you to specify a condition, and return values based on whether the condition is ‘true’ or ‘false’
ds.if_else(
condition = list(titanic$Age > 18),
"Adult",
"Child",
newobj = "titanic_7")
ds.table("titanic_7")$output.list$TABLE_rvar.by.study_counts
##
## Data in all studies were valid
##
## Study 1 : No errors reported from this study
## Study 2 : No errors reported from this study
## study
## titanic_7 server1 server2
## Adult 287 288
## Child 70 69
## NA 87 90
An extension of ds.if_else
which allows the specification of multiple conditions. Extremely useful for recoding variables.
ds.case_when(
tidy_expr = list(
titanic$Fare < 20 ~ "low",
titanic$Fare >= 20 & titanic$Fare < 40 ~ "medium",
titanic$Fare >= 40 ~ "high"),
newobj = "titanic_8")
ds.table("titanic_8")$output.list$TABLE_rvar.by.study_counts
##
## Data in all studies were valid
##
## Study 1 : No errors reported from this study
## Study 2 : No errors reported from this study
## study
## titanic_8 server1 server2
## high 86 90
## low 262 253
## medium 96 104
## NA 0 0
Bind any number of data frames by row, making a longer result. This is similar to do.call(rbind, dfs), but the output will contain all columns that appear in any of the inputs.
ds.dim("titanic")
## $`dimensions of titanic in server1`
## [1] 444 12
##
## $`dimensions of titanic in server2`
## [1] 447 11
##
## $`dimensions of titanic in combined studies`
## [1] 891 12
ds.bind_rows(
to_combine = list(titanic, titanic),
newobj = "titanic_9"
)
ds.dim("titanic_9")
## $`dimensions of titanic_9 in server1`
## [1] 888 12
##
## $`dimensions of titanic_9 in server2`
## [1] 894 11
##
## $`dimensions of titanic_9 in combined studies`
## [1] 1782 12
Bind any number of data frames by column, making a wider result. This is similar to do.call(cbind, dfs).
First create some copies with selected columns:
ds.select(
df.name = "titanic",
tidy_expr = list(Survived, Pclass),
newobj = "titanic_sub_1")
ds.colnames("titanic_sub_1")
## $server1
## [1] "Survived" "Pclass"
##
## $server2
## [1] "Survived" "Pclass"
ds.select(
df.name = "titanic",
tidy_expr = list(Parch, Ticket),
newobj = "titanic_sub_2")
ds.colnames("titanic_sub_2")
## $server1
## [1] "Parch" "Ticket"
##
## $server2
## [1] "Parch" "Ticket"
Now join these together:
ds.bind_cols(
to_combine = list(titanic_sub_1, titanic_sub_2),
newobj = "titanic_sub_3"
)
ds.colnames("titanic_sub_3")
## $server1
## [1] "Survived" "Pclass" "Parch" "Ticket"
##
## $server2
## [1] "Survived" "Pclass" "Parch" "Ticket"
arrange
is a more flexible version of base::sort
, which orders the rows of a data frame by the values of selected columns.
ds.arrange(
df.name = "titanic",
tidy_expr = list(Sex),
newobj = "arranged_df"
)
ds.distinct(
df.name = "titanic",
newobj = "titanic_distinct"
)
ds.dim("titanic_distinct")
## $`dimensions of titanic_distinct in server1`
## [1] 444 12
##
## $`dimensions of titanic_distinct in server2`
## [1] 447 11
##
## $`dimensions of titanic_distinct in combined studies`
## [1] 891 12
Merging two datasets requires the use of ds.merge
- see separate tutorial.
datashield.logout(conns)