# Chapter 16 R Practice Problems

• Exercise (Counting Missing Values) Create a vector of length 100 with every third element missing (namely NA) using only vector operations. Then count the number of missing values using only the following two functions: is.na and sum.

• Leadership Example Suppose you have the following data

# Per-manager attributes: five managers, one element per manager.
manager <- c(1, 2, 3, 4, 5)
date    <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09")
country <- c("US", "US", "UK", "UK", "UK")
gender  <- c("M", "F", "F", "M", "F")
age     <- c(32, 45, 25, 39, 99)

# Ratings q1 to q5; manager 4 has missing values (NA) for q4 and q5.
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)

# Combine the vectors column-wise into the reference data frame that the
# later examples copy from; text columns stay character, not factor.
leadershipOriginal <- data.frame(
  manager = manager,
  date    = date,
  country = country,
  gender  = gender,
  age     = age,
  q1 = q1, q2 = q2, q3 = q3, q4 = q4, q5 = q5,
  stringsAsFactors = FALSE
)
• Create a new dataset newdata containing rows sorted from youngest manager to oldest manager.
# Work on a copy so the reference data frame stays untouched.
leadership <- leadershipOriginal

# order() returns the row permutation that sorts age ascending.
newdata <- leadership[order(leadership$age), ]
newdata

##   manager     date country gender age q1 q2 q3 q4 q5
## 3       3  10/1/08      UK      F  25  3  5  5  5  2
## 1       1 10/24/08      US      M  32  5  4  5  5  5
## 4       4 10/12/08      UK      M  39  3  3  4 NA NA
## 2       2 10/28/08      US      F  45  3  5  2  5  5
## 5       5   5/1/09      UK      F  99  2  2  1  2  1

• sorts the rows into female followed by male, and youngest to oldest within each gender.

# Several keys: gender is the primary sort key, age breaks ties.
newdata <- leadership[order(leadership$gender, leadership$age), ]
newdata

##   manager     date country gender age q1 q2 q3 q4 q5
## 3       3  10/1/08      UK      F  25  3  5  5  5  2
## 2       2 10/28/08      US      F  45  3  5  2  5  5
## 5       5   5/1/09      UK      F  99  2  2  1  2  1
## 1       1 10/24/08      US      M  32  5  4  5  5  5
## 4       4 10/12/08      UK      M  39  3  3  4 NA NA

• sorts the rows by gender, and then from oldest to youngest manager within each gender.

# Negating the numeric key reverses its sort direction.
newdata <- leadership[order(leadership$gender, -leadership$age), ]
newdata

##   manager     date country gender age q1 q2 q3 q4 q5
## 5       5   5/1/09      UK      F  99  2  2  1  2  1
## 2       2 10/28/08      US      F  45  3  5  2  5  5
## 3       3  10/1/08      UK      F  25  3  5  5  5  2
## 4       4 10/12/08      UK      M  39  3  3  4 NA NA
## 1       1 10/24/08      US      M  32  5  4  5  5  5

• __Merging Datasets__ Suppose we have these two data sets. Combine them in a meaningful way.

names_1 <- c("A", "B", "C")
names_2 <- c("A", "C", "B")
gender <- c("M", "F", "F")
age <- c(32, 45, 25)
dataFrame_1 <- data.frame(names_1, gender)
dataFrame_2 <- data.frame(names_2, age)
dataFrame_1

##   names_1 gender
## 1       A      M
## 2       B      F
## 3       C      F

dataFrame_2

##   names_2 age
## 1       A  32
## 2       C  45
## 3       B  25

# Sort the first data frame by its name column before merging.
dataFrame_1new <- dataFrame_1[order(dataFrame_1$names_1), ]
# Rename the key column so both data frames share the column name "names".
names(dataFrame_1new)[1] <- "names"

dataFrame_2new <- dataFrame_2[order(dataFrame_2$names_2), ]
names(dataFrame_2new)[1] <- "names"
# Join the two data frames on the shared key column.
dataFrameOriginal <- merge(dataFrame_1new, dataFrame_2new, by = "names")

• Suppose you want to add another variable to the data frame by using cbind:

names <- c("A", "B", "C")
income <- c(100, 85, 125)
dataFrame_3 <- data.frame(names, income)
# cbind() pairs rows purely by position, so this relies on both data frames
# listing the names in the same order (A, B, C).
dataFrame <- cbind(dataFrameOriginal, dataFrame_3["income"])
dataFrame

##   names gender age income
## 1     A      M  32    100
## 2     B      F  25     85
## 3     C      F  45    125

• Suppose you want to add another observation to the data frame by using rbind:

dataFrame_4 <- data.frame(names = c("D"), gender = c("F"), age = c(44))
dataFrame_4

##   names gender age
## 1     D      F  44

dataFrame <- rbind(dataFrameOriginal, dataFrame_4)
dataFrame

##   names gender age
## 1     A      M  32
## 2     B      F  25
## 3     C      F  45
## 4     D      F  44

• Suppose you want to add another observation to the data frame by using rbind but this time two data frames have different columns:

dataFrame_5 <- data.frame(names = c("D"), gender = c("F"), age = c(44),
                          occupation = c("teacher"))
dataFrame_5

##   names gender age occupation
## 1     D      F  44    teacher

# method 1: append an occupation column (all NA) to dataFrameOriginal so the
# columns of both data frames match
dataFrame <- cbind(dataFrameOriginal, occupation = c(NA))
dataFrame <- rbind(dataFrame, dataFrame_5)
dataFrame

##   names gender age occupation
## 1     A      M  32       <NA>
## 2     B      F  25       <NA>
## 3     C      F  45       <NA>
## 4     D      F  44    teacher

dataFrame_5 <- data.frame(names = c("D"), gender = c("F"), age = c(44),
                          occupation = c("teacher"))
dataFrame_5

##   names gender age occupation
## 1     D      F  44    teacher

# method 2: drop the extra occupation column from the new row and keep only
# the first three columns, which dataFrameOriginal also has
dataFrame <- rbind(dataFrameOriginal, dataFrame_5[1:3])
dataFrame

##   names gender age
## 1     A      M  32
## 2     B      F  25
## 3     C      F  45
## 4     D      F  44

• Subsetting selects variables q1, q2, q3, q4, and q5 from the leadership data frame and saves them to the data frame newdata.
leadership <- leadershipOriginal

# method 1: select by column position (q1..q5 are columns 6 to 10)
newdata <- leadership[, c(6:10)]
newdata

##   q1 q2 q3 q4 q5
## 1  5  4  5  5  5
## 2  3  5  2  5  5
## 3  3  5  5  5  2
## 4  3  3  4 NA NA
## 5  2  2  1  2  1

# method 2: select by column name
myvars <- c("q1", "q2", "q3", "q4", "q5")
newdata <- leadership[myvars]
newdata

##   q1 q2 q3 q4 q5
## 1  5  4  5  5  5
## 2  3  5  2  5  5
## 3  3  5  5  5  2
## 4  3  3  4 NA NA
## 5  2  2  1  2  1

# method 3: using paste() function to generate the names "q1" ... "q5"
myvars <- paste("q", 1:5, sep = "")
newdata <- leadership[myvars]
newdata

##   q1 q2 q3 q4 q5
## 1  5  4  5  5  5
## 2  3  5  2  5  5
## 3  3  5  5  5  2
## 4  3  3  4 NA NA
## 5  2  2  1  2  1

• Dropping Variables Drop q3 and q4 from the leadership data frame and save the result to the data frame newdata.

leadership <- leadershipOriginal

# method 1: negative positions exclude columns (q3 and q4 are columns 8, 9)
newdata <- leadership[, c(-8, -9)]
newdata

##   manager     date country gender age q1 q2 q5
## 1       1 10/24/08      US      M  32  5  4  5
## 2       2 10/28/08      US      F  45  3  5  5
## 3       3  10/1/08      UK      F  25  3  5  2
## 4       4 10/12/08      UK      M  39  3  3 NA
## 5       5   5/1/09      UK      F  99  2  2  1

# method 2: logical mask that is TRUE for the columns to drop, then negated
myvars <- names(leadership) %in% c("q3", "q4")
newdata <- leadership[!myvars]

# method 3: assigning NULL removes the columns from leadership itself
leadership$q3 <- leadership$q4 <- NULL
leadership

##   manager     date country gender age q1 q2 q5
## 1       1 10/24/08      US      M  32  5  4  5
## 2       2 10/28/08      US      F  45  3  5  5
## 3       3  10/1/08      UK      F  25  3  5  2
## 4       4 10/12/08      UK      M  39  3  3 NA
## 5       5   5/1/09      UK      F  99  2  2  1

• Selecting Observations Again use leadership data.

• select rows 1 through 3

leadership <- leadershipOriginal
newdata <- leadership[1:3, ]
newdata

##   manager     date country gender age q1 q2 q3 q4 q5
## 1       1 10/24/08      US      M  32  5  4  5  5  5
## 2       2 10/28/08      US      F  45  3  5  2  5  5
## 3       3  10/1/08      UK      F  25  3  5  5  5  2

• select observations where men over 30

leadership <- leadershipOriginal
# Elementwise & combines the two row conditions; the trailing comma means
# "these rows, all columns".
newdata <- leadership[leadership$gender == "M" & leadership$age > 30, ]
newdata

##   manager     date country gender age q1 q2 q3 q4 q5
## 1       1 10/24/08      US      M  32  5  4  5  5  5
## 4       4 10/12/08      UK      M  39  3  3  4 NA NA

• select observations where men over 30 and be careful with comma.
Figure out what is going on below

leadership <- leadershipOriginal
# No comma inside [ ]: the logical vector indexes columns instead of rows and
# is recycled across the 10 columns, so columns 1, 4, 6 and 9 are kept.
newdata <- leadership[leadership$gender == "M" & leadership$age > 30]
newdata

##   manager gender q1 q4
## 1       1      M  5  5
## 2       2      F  3  5
## 3       3      F  3  5
## 4       4      M  3 NA
## 5       5      F  2  2

• limit your analyses to observations collected between January 1, 2009 and December 31, 2009.

leadership <- leadershipOriginal
# Converts the date values read in originally as character values to date
# values using the format mm/dd/yy
leadership$date <- as.Date(leadership$date, "%m/%d/%y")
# Create starting and ending dates (the end date is December 31 so that the
# window covers the whole of 2009, as stated above)
startdate <- as.Date("2009-01-01")
enddate <- as.Date("2009-12-31")
# Selects cases meeting your desired criteria
newdata <- leadership[which(leadership$date >= startdate &
                              leadership$date <= enddate), ]
newdata

##   manager       date country gender age q1 q2 q3 q4 q5
## 5       5 2009-05-01      UK      F  99  2  2  1  2  1

• subset() function The subset() function is probably the easiest way to select variables and observations.

• Selects all rows that have a value of age greater than or equal to 35 or less than 24. Keeps variables q1 through q4.

leadership <- leadershipOriginal
newdata <- subset(leadership, age >= 35 | age < 24,
                  select = c(q1, q2, q3, q4))
newdata

##   q1 q2 q3 q4
## 2  3  5  2  5
## 4  3  3  4 NA
## 5  2  2  1  2

• Selects all men over the age of 25, and keeps variables gender through q4 (gender, q4, and all columns between them)

leadership <- leadershipOriginal
newdata <- subset(leadership, age > 25 & gender == "M",
                  select = c(gender:q4))
newdata

##   gender age q1 q2 q3 q4
## 1      M  32  5  4  5  5
## 4      M  39  3  3  4 NA

• sample() function The sample() function enables you to take a random sample (with or without replacement) of size n from a dataset.

• take a random sample of size 3 from the leadership dataset

leadership <- leadershipOriginal
# Draw 3 distinct row indices; seq_len() is also safe for a 0-row data frame.
# The rows printed below are one possible draw and change from run to run.
mysample <- leadership[sample(seq_len(nrow(leadership)), 3, replace = FALSE), ]
mysample

##   manager     date country gender age q1 q2 q3 q4 q5
## 4       4 10/12/08      UK      M  39  3  3  4 NA NA
## 2       2 10/28/08      US      F  45  3  5  2  5  5
## 1       1 10/24/08      US      M  32  5  4  5  5  5

# Summarise the centre and spread of a numeric vector.
#
# x:          numeric vector to summarise.
# parametric: if TRUE (default) use mean() and sd(); otherwise use median()
#             and mad().
# print:      if TRUE, also write the two statistics to the console with cat().
#
# Returns a list with elements `center` and `spread`.
mystats <- function(x, parametric = TRUE, print = FALSE) {
  if (parametric) {
    center <- mean(x)
    spread <- sd(x)
  } else {
    center <- median(x)
    spread <- mad(x)
  }
  # `print` and `parametric` are single flags, so use the scalar operator &&
  # (the elementwise & is meant for vector masks, not if() conditions).
  if (print && parametric) {
    cat("Mean=", center, "\n", "SD=", spread, "\n")
  } else if (print && !parametric) {
    cat("Median=", center, "\n", "MAD=", spread, "\n")
  }
  result <- list(center = center, spread = spread)
  return(result)
}

# The input is random, so the values printed below differ on every run.
mystats(rnorm(500), parametric = FALSE, print = FALSE)

## $center
## [1] -0.02230415
##
## $spread
## [1] 1.034385