Chapter 16 R Practice Problems
Exercise (Counting Missing Values) Create a vector of length 100 with every third element missing (namely
`NA`) using only vector operations. Then count the number of missing values using only the following two functions: `is.na` and `sum`.
Leadership Example Suppose you have the following data:
# Raw columns of the leadership example; every vector has five elements,
# one per manager.
manager <- c(1, 2, 3, 4, 5)
# Dates are stored as character strings in month/day/two-digit-year form.
date <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09")
country <- c("US", "US", "UK", "UK", "UK")
gender <- c("M", "F", "F", "M", "F")
# NOTE(review): 99 is far above the other ages; presumably a placeholder
# value -- confirm before using age in summaries.
age <- c(32, 45, 25, 39, 99)
# q1..q5 are numeric vectors; q4 and q5 are missing (NA) for the fourth manager.
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)
leadershipOriginal <- data.frame(manager, date, country, gender, age,
q1, q2, q3, q4, q5, stringsAsFactors=FALSE)
- Create a new dataset `newdata` containing rows sorted from youngest manager to oldest manager.
# Start from a fresh copy of the original data.
leadership <- leadershipOriginal
# order() gives the row permutation for ascending age; use it as the row index.
newdata <- leadership[order(leadership[["age"]]), ]
newdata
## manager date country gender age q1 q2 q3 q4 q5
## 3 3 10/1/08 UK F 25 3 5 5 5 2
## 1 1 10/24/08 US M 32 5 4 5 5 5
## 4 4 10/12/08 UK M 39 3 3 4 NA NA
## 2 2 10/28/08 US F 45 3 5 2 5 5
## 5 5 5/1/09 UK F 99 2 2 1 2 1
- sorts the rows into female followed by male, and youngest to oldest within each gender.
# Primary key gender ("F" sorts before "M"), ties broken by ascending age.
newdata <- with(leadership, leadership[order(gender, age), ])
newdata
## manager date country gender age q1 q2 q3 q4 q5
## 3 3 10/1/08 UK F 25 3 5 5 5 2
## 2 2 10/28/08 US F 45 3 5 2 5 5
## 5 5 5/1/09 UK F 99 2 2 1 2 1
## 1 1 10/24/08 US M 32 5 4 5 5 5
## 4 4 10/12/08 UK M 39 3 3 4 NA NA
- sorts the rows by gender, and then from oldest to youngest manager within each gender.
# Negating the numeric age column reverses its order within each gender.
newdata <- with(leadership, leadership[order(gender, -age), ])
newdata
## manager date country gender age q1 q2 q3 q4 q5
## 5 5 5/1/09 UK F 99 2 2 1 2 1
## 2 2 10/28/08 US F 45 3 5 2 5 5
## 3 3 10/1/08 UK F 25 3 5 5 5 2
## 4 4 10/12/08 UK M 39 3 3 4 NA NA
## 1 1 10/24/08 US M 32 5 4 5 5 5
- **Merging Datasets** Suppose we have these two data sets. Combine them in a meaningful way.
# Two small tables keyed by the same names, listed in different orders.
names_1 <- c("A", "B", "C")
names_2 <- c("A", "C", "B")
gender <- c("M", "F", "F")
age <- c(32, 45, 25)
# Column names are spelled out rather than derived from the variable names.
dataFrame_1 <- data.frame(names_1 = names_1, gender = gender)
dataFrame_2 <- data.frame(names_2 = names_2, age = age)
dataFrame_1
## names_1 gender
## 1 A M
## 2 B F
## 3 C F
dataFrame_2
## names_2 age
## 1 A 32
## 2 C 45
## 3 B 25
# Sort each table by its key column and give both keys the shared name "names"
# so that they can be merged on it.
dataFrame_1new <- dataFrame_1[order(dataFrame_1[["names_1"]]), ]
colnames(dataFrame_1new)[1] <- "names"
dataFrame_2new <- dataFrame_2[order(dataFrame_2[["names_2"]]), ]
colnames(dataFrame_2new)[1] <- "names"
dataFrameOriginal <- merge(dataFrame_1new, dataFrame_2new, by = "names")
- Suppose you want to add another variable to the data frame by using `cbind`:
# A third table carrying one extra variable for the names A, B, C.
names <- c("A", "B", "C")
income <- c(100, 85, 125)
dataFrame_3 <- data.frame(names = names, income = income)
# cbind() pairs rows purely by position; income stays a one-column data frame
# so that its column name is carried over.
dataFrame <- cbind(dataFrameOriginal, dataFrame_3[, "income", drop = FALSE])
dataFrame
## names gender age income
## 1 A M 32 100
## 2 B F 25 85
## 3 C F 45 125
- Suppose you want to add another observation to the data frame by using `rbind`:
# A single new observation with the same three columns as dataFrameOriginal.
dataFrame_4 <- data.frame(names = "D", gender = "F", age = 44)
dataFrame_4
## names gender age
## 1 D F 44
# rbind() stacks the new row underneath the existing rows.
dataFrame <- rbind(dataFrameOriginal, dataFrame_4)
dataFrame
## names gender age
## 1 A M 32
## 2 B F 25
## 3 C F 45
## 4 D F 44
- Suppose you want to add another observation to the data frame by using `rbind`, but this time the two data frames have different columns:
# A new observation that has one column (occupation) more than dataFrameOriginal.
dataFrame_5 <- data.frame(names = "D", gender = "F", age = 44,
                          occupation = "teacher")
dataFrame_5
## names gender age occupation
## 1 D F 44 teacher
# method 1: add the missing column to dataFrameOriginal (filled with NA),
# then append the new row
dataFrame <- cbind(dataFrameOriginal, occupation = NA)
dataFrame <- rbind(dataFrame, dataFrame_5)
dataFrame
## names gender age occupation
## 1 A M 32 <NA>
## 2 B F 25 <NA>
## 3 C F 45 <NA>
## 4 D F 44 teacher
dataFrame_5 <- data.frame(names = "D", gender = "F", age = 44,
                          occupation = "teacher")
dataFrame_5
## names gender age occupation
## 1 D F 44 teacher
# method 2: drop the extra column from the new row so that both data frames
# have the same columns. Selecting by the target's column names keeps this
# correct even if the column order of dataFrame_5 changes.
dataFrame <- rbind(dataFrameOriginal, dataFrame_5[names(dataFrameOriginal)])
dataFrame
## names gender age
## 1 A M 32
## 2 B F 25
## 3 C F 45
## 4 D F 44
- Subsetting: select variables q1, q2, q3, q4, and q5 from the leadership data frame and save them to the data frame `newdata`.
leadership <- leadershipOriginal
# method 1: select the columns by position (q1..q5 are columns 6 to 10)
newdata <- leadership[, 6:10]
newdata
## q1 q2 q3 q4 q5
## 1 5 4 5 5 5
## 2 3 5 2 5 5
## 3 3 5 5 5 2
## 4 3 3 4 NA NA
## 5 2 2 1 2 1
# method 2: select the columns by name
myvars <- c("q1", "q2", "q3", "q4", "q5")
newdata <- leadership[myvars]
newdata
## q1 q2 q3 q4 q5
## 1 5 4 5 5 5
## 2 3 5 2 5 5
## 3 3 5 5 5 2
## 4 3 3 4 NA NA
## 5 2 2 1 2 1
# method 3: build the column names with paste0(), which concatenates
# without a separator
myvars <- paste0("q", 1:5)
newdata <- leadership[myvars]
newdata
## q1 q2 q3 q4 q5
## 1 5 4 5 5 5
## 2 3 5 2 5 5
## 3 3 5 5 5 2
## 4 3 3 4 NA NA
## 5 2 2 1 2 1
- Dropping Variables: drop q3 and q4 from the leadership data frame and save the result to the data frame `newdata`.
leadership <- leadershipOriginal
# method 1: negative positions exclude columns 8 (q3) and 9 (q4)
newdata <- leadership[, -c(8, 9)]
newdata
## manager date country gender age q1 q2 q5
## 1 1 10/24/08 US M 32 5 4 5
## 2 2 10/28/08 US F 45 3 5 5
## 3 3 10/1/08 UK F 25 3 5 2
## 4 4 10/12/08 UK M 39 3 3 NA
## 5 5 5/1/09 UK F 99 2 2 1
# method 2: flag the unwanted columns by name, then keep everything else
myvars <- names(leadership) %in% c("q3", "q4")
newdata <- leadership[, !myvars]
# method 3: assigning NULL to a column removes it from the data frame itself
leadership$q4 <- NULL
leadership$q3 <- NULL
leadership
## manager date country gender age q1 q2 q5
## 1 1 10/24/08 US M 32 5 4 5
## 2 2 10/28/08 US F 45 3 5 5
## 3 3 10/1/08 UK F 25 3 5 2
## 4 4 10/12/08 UK M 39 3 3 NA
## 5 5 5/1/09 UK F 99 2 2 1
Selecting Observations Again use the `leadership` data.
- select rows 1 through 3
# Start from a fresh copy of the original data.
leadership <- leadershipOriginal
# Row index before the comma, nothing after it: keep rows 1 to 3, all columns.
newdata <- leadership[seq_len(3), ]
newdata
## manager date country gender age q1 q2 q3 q4 q5
## 1 1 10/24/08 US M 32 5 4 5 5 5
## 2 2 10/28/08 US F 45 3 5 2 5 5
## 3 3 10/1/08 UK F 25 3 5 5 5 2
- select observations for men over 30
# Start from a fresh copy of the original data.
leadership <- leadershipOriginal
# The logical condition goes before the comma, so it filters rows.
newdata <- leadership[with(leadership, gender == "M" & age > 30), ]
newdata
## manager date country gender age q1 q2 q3 q4 q5
## 1 1 10/24/08 US M 32 5 4 5 5 5
## 4 4 10/12/08 UK M 39 3 3 4 NA NA
- select observations for men over 30 again, but be careful with the comma. Figure out what is going on below.
leadership <- leadershipOriginal
# Without a comma inside [ ], the logical vector selects columns, not rows.
# The five-element condition (TRUE, FALSE, FALSE, TRUE, FALSE) is recycled
# across the ten columns, so columns 1, 4, 6 and 9 (manager, gender, q1, q4)
# are kept and every row is returned.
newdata <- leadership[leadership$gender == "M" & leadership$age > 30]
newdata## manager gender q1 q4
## 1 1 M 5 5
## 2 2 F 3 5
## 3 3 F 3 5
## 4 4 M 3 NA
## 5 5 F 2 2
- limit your analyses to observations collected between January 1, 2009 and December 31, 2009.
leadership <- leadershipOriginal
# Convert the dates, stored as character strings, to Date values.
# "%m/%d/%y" matches month/day/two-digit-year input such as "10/24/08".
leadership$date <- as.Date(leadership$date, format = "%m/%d/%y")
# Inclusive bounds of the requested period: January 1 to December 31, 2009.
startdate <- as.Date("2009-01-01")
enddate <- as.Date("2009-12-31")
# which() turns the logical test into row numbers, so a row whose date is NA
# is left out instead of showing up as an all-NA row.
newdata <- leadership[which(leadership$date >= startdate &
                              leadership$date <= enddate), ]
newdata## manager date country gender age q1 q2 q3 q4 q5
## 5 5 2009-05-01 UK F 99 2 2 1 2 1
subset() function The
`subset()` function is probably the easiest way to select variables and observations.
- Select all rows that have a value of age greater than or equal to 35 or less than 24. Keep variables q1 through q4.
# Start from a fresh copy of the original data.
leadership <- leadershipOriginal
# Rows: age below 24 or at least 35. Columns: the contiguous range q1 to q4.
newdata <- subset(leadership, age < 24 | age >= 35, select = q1:q4)
newdata
## q1 q2 q3 q4
## 2 3 5 2 5
## 4 3 3 4 NA
## 5 2 2 1 2
- Selects all men over the age of 25, and keeps variables gender through q4 (gender, q4, and all columns between them)
# Start from a fresh copy of the original data.
leadership <- leadershipOriginal
# Rows: men older than 25. Columns: everything from gender through q4.
newdata <- subset(leadership, gender == "M" & age > 25, select = gender:q4)
newdata
## gender age q1 q2 q3 q4
## 1 M 32 5 4 5 5
## 4 M 39 3 3 4 NA
sample() function The
`sample()` function enables you to take a random sample (with or without replacement) of size n from a dataset.
- Take a random sample of size 3 from the leadership dataset.
leadership <- leadershipOriginal
# Draw 3 distinct row numbers and use them as the row index.
# seq_len() yields an empty index set for a data frame with zero rows,
# whereas 1:nrow() would yield c(1, 0).
mysample <- leadership[sample(seq_len(nrow(leadership)), size = 3,
                              replace = FALSE), ]
mysample## manager date country gender age q1 q2 q3 q4 q5
## 4 4 10/12/08 UK M 39 3 3 4 NA NA
## 2 2 10/28/08 US F 45 3 5 2 5 5
## 1 1 10/24/08 US M 32 5 4 5 5 5
#' Summarise the centre and spread of a numeric vector
#'
#' @param x A numeric vector.
#' @param parametric If `TRUE` (default), use the mean and standard deviation;
#'   otherwise use the median and the median absolute deviation (`mad()`).
#' @param print If `TRUE`, also write the two statistics to the console with
#'   `cat()`. Defaults to `FALSE`.
#' @return A list with elements `center` and `spread`.
mystats <- function(x, parametric=TRUE, print=FALSE) {
  # Choose the statistics and their printed labels in a single place, so the
  # printing step does not have to test `parametric` a second time.
  if (parametric) {
    center <- mean(x)
    spread <- sd(x)
    labels <- c("Mean=", "SD=")
  } else {
    center <- median(x)
    spread <- mad(x)
    labels <- c("Median=", "MAD=")
  }
  # `print` is a scalar flag, so it is tested directly rather than combined
  # with the elementwise `&` operator.
  if (print) {
    cat(labels[1], center, "\n", labels[2], spread, "\n")
  }
  list(center = center, spread = spread)
}
mystats(rnorm(500), parametric = FALSE, print = FALSE)## $center
## [1] -0.02230415
##
## $spread
## [1] 1.034385
Williams, Joseph M, and Joseph Bizup. 2010. Style: Lessons in Clarity and Grace. Vol. 565214475. Longman Boston.
Xie, Yihui. 2015. Dynamic Documents with R and Knitr. 2nd ed. Boca Raton, Florida: Chapman; Hall/CRC. http://yihui.name/knitr/.
———. 2016. Dynamic Documents with R and Knitr. Chapman; Hall/CRC.
Xie, Yihui, JJ Allaire, and Garrett Grolemund. 2018. R Markdown: The Definitive Guide. CRC Press.