I started by using the library
function to read in
tidyverse
and readr
. Then I read in the csv
file using the reader function read_csv
and stored the
original data in sheet1.
Next, I used chaining to clean the data. The select
function selects the columns I wanted, pivot_longer
transforms the data to long format. Then I used mutate
to
add two new columns. The first new column is for the year and the second
new column was to keep track of which measurement was grabbed.
library(tidyverse)
library(readr)
sheet1 <- read_csv("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")
sheet2 <- sheet1 %>%
select(area_name = Area_name, STCOU, ends_with("D")) %>%
pivot_longer(cols = 3:12, names_to = "census_id", values_to = "enrollment")
sheet3a <- sheet2 %>%
mutate(year = if_else(as.numeric(substr(census_id, 8, 9)) > 69, as.numeric(substr(census_id, 8, 9)) + 1900, as.numeric(substr(census_id, 8, 9)) + 2000), measurementID = substr(census_id, 1, 7))
Now we will split the data into two sets, one that contains only
state data and one that contains only county level data. First, I added
a variable called “county_state” to distinguish between the different
area names. Then I used filter
to filter by that new
variable and create two separate data sets. Finally, I added a class to
each of the data sets.
sheet3 <- sheet3a %>%
mutate(county_state = ifelse(grepl(pattern = ", \\w\\w", area_name), "county", "state"))
state <- filter(sheet3, county_state == "state")
county <- filter(sheet3, county_state == "county")
class(county) <- c("county", class(county))
class(state) <- c("state", class(state))
Using the mutate
function, add the two character state
abbreviation to the county data set. I used mutate
to add a
new variable called state and the substr
function to grab
the last two characters from the area_name.
county1 <- county %>%
mutate(state = substr(area_name, nchar(area_name)-1, nchar(area_name)))
Create a separate list that includes the states for each division. Then, add column to connect the region to the row based on the state the data is from. If the data isn’t from a specific state, list an error.
Div1 = list("CONNECTICUT", "MAINE", "MASSACHUSETTS", "NEW HAMPSHIRE", "RHODE ISLAND", "VERMONT")
Div2 = list("NEW JERSEY", "NEW YORK", "PENNSYLVANIA")
Div3 = list("ILLINOIS", "INDIANA", "MICHIGAN", "OHIO", "WISCONSIN")
Div4 = list("IOWA", "KANSAS", "MINNESOTA", "MISSOURI", "NEBRASKA", "NORTH DAKOTA", "SOUTH DAKOTA")
Div5 = list("DELAWARE", "FLORIDA", "GEORGIA", "MARYLAND", "NORTH CAROLINA", "SOUTH CAROLINA", "VIRGINIA", "DISTRICT OF COLUMBIA", "District of Columbia", "WEST VIRGINIA")
Div6 = list("ALABAMA", "KENTUCKY", "MISSISSIPPI", "TENNESSEE")
Div7 = list("ARKANSAS", "LOUISIANA", "OKLAHOMA", "TEXAS")
Div8 = list("ARIZONA", "COLORADO", "IDAHO", "MONTANA", "NEVADA", "NEW MEXICO", "UTAH", "WYOMING")
Div9 = list("ALASKA", "CALIFORNIA", "HAWAII", "OREGON", "WASHINGTON")
state1 <- mutate(state, division = if_else(area_name %in% Div1, "1",
if_else(area_name %in% Div2, "2",
if_else(area_name %in% Div3, "3",
if_else(area_name %in% Div4, "4",
if_else(area_name %in% Div5, "5",
if_else(area_name %in% Div6, "6",
if_else(area_name %in% Div7, "7",
if_else(area_name %in% Div8, "8",
if_else(area_name %in% Div9, "9",
"ERROR"))))))))))
Now we will use the code above to create functions which will work with multiple csv files.
Create function convertLong
to read in raw csv file,
select appropriate columns, and then switch to long format.
convertLong <- function(dataRaw, new_var = "enrollment"){
dataRawLong <- dataRaw %>%
select(area_name = Area_name, STCOU, ends_with("D")) %>%
pivot_longer(cols = 3:12, names_to = "census_id", values_to = new_var)
return(dataRawLong)
}
Create function extractYear
to find the year that the
data was collected. Since the year was 2 digit, add 1900 to any years
between 70-99 (1970-1999) and add 2000 to any years from 00-69
(2000-2069).
extractYear <- function(data_long){
dataRawLongYr <- data_long %>%
mutate(year = if_else(as.numeric(substr(census_id, 8, 9)) > 69, as.numeric(substr(census_id, 8, 9)) + 1900, as.numeric(substr(census_id, 8, 9)) + 2000), measurementID = substr(census_id, 1, 7))
return(dataRawLongYr)
}
Create function extractState
to find the state for all
of the county level data using the last two characters of the
area_name
.
extractState <- function(data_county){
dataWSTate <- mutate(data_county, state = substr(area_name, nchar(area_name)-1, nchar(area_name)))
return(dataWSTate)
}
Using lists, create function createDivision
to map each
state to its division
. The function first creates lists for
each division and then uses an if_else
statement to
identify the appropriate division for each state.
createDivision <- function(data_state){
Div1 = list("CONNECTICUT", "MAINE", "MASSACHUSETTS", "NEW HAMPSHIRE", "RHODE ISLAND", "VERMONT")
Div2 = list("NEW JERSEY", "NEW YORK", "PENNSYLVANIA")
Div3 = list("ILLINOIS", "INDIANA", "MICHIGAN", "OHIO", "WISCONSIN")
Div4 = list("IOWA", "KANSAS", "MINNESOTA", "MISSOURI", "NEBRASKA", "NORTH DAKOTA", "SOUTH DAKOTA")
Div5 = list("DELAWARE", "FLORIDA", "GEORGIA", "MARYLAND", "NORTH CAROLINA", "SOUTH CAROLINA", "VIRGINIA", "DISTRICT OF COLUMBIA", "District of Columbia", "WEST VIRGINIA")
Div6 = list("ALABAMA", "KENTUCKY", "MISSISSIPPI", "TENNESSEE")
Div7 = list("ARKANSAS", "LOUISIANA", "OKLAHOMA", "TEXAS")
Div8 = list("ARIZONA", "COLORADO", "IDAHO", "MONTANA", "NEVADA", "NEW MEXICO", "UTAH", "WYOMING")
Div9 = list("ALASKA", "CALIFORNIA", "HAWAII", "OREGON", "WASHINGTON")
dataDiv <- mutate(data_state, division = if_else(area_name %in% Div1, "1",
if_else(area_name %in% Div2, "2",
if_else(area_name %in% Div3, "3",
if_else(area_name %in% Div4, "4",
if_else(area_name %in% Div5, "5",
if_else(area_name %in% Div6, "6",
if_else(area_name %in% Div7, "7",
if_else(area_name %in% Div8, "8",
if_else(area_name %in% Div9, "9",
"ERROR"))))))))))
return(dataDiv)
}
Create a function to split the data into two different data sets, one
for the county-level data and one for the state data. First, create a
new variable called county_state
to determine which list
the data should be split into. Then, filter the data into two separate
data sets using filter
. Once the data is split, run the
function createDivision
on the data set that has the
state-level data and run the function extractState
on the
data that contains the county-level data.
#create function to split data
splitDataC <- function(data_longyr){
#first add column to identify data into county & state
dataClassification <- data_longyr %>%
mutate(county_state = ifelse(grepl(pattern = ", \\w\\w", area_name), "county", "state"))
#filter state data
data_state <- dataClassification %>%
filter(county_state == "state")
class(data_state) <- c("state", class(data_state))
#run createDivision function to map divisions.
stateF <- createDivision(data_state)
#now filter county data.
data_county <- dataClassification %>%
filter(county_state == "county")
class(data_county) <- c("county", class(data_county))
#extract state from county data.
countyF <- extractState(data_county)
return(list(state = stateF, realcounty = countyF))
}
Now, create a function called my_wrapper
to combine all
of the other functions created above.
my_wrapper <- function(rawData, new_var = "enrollment"){
data_imported <- read_csv(rawData)
data_long <- convertLong(data_imported)
data_longyr <- extractYear(data_long)
results <- splitDataC(data_longyr)
return(results)
}
Now, run function my_wrapper
twice on two different data
sets. Then, create a function called combineData
to combine
the results from the two data sets. The result will be two different
data sets - one for the state-level data that has data from EDU01a &
EDU01b and another for the county-level data, again from EDU01a &
EDU01b.
EDU01a <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")
EDU01b <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv")
combineData <- function(data1, data2){
data_state <- dplyr::bind_rows(data1$state, data2$state)
data_county <- dplyr::bind_rows(data1$realcounty, data2$realcounty)
return(list(data_state = data_state, data_county = data_county))
}
data_all <- combineData(EDU01a, EDU01b)
Next, we will create custom graphing functions.
Create the custom plot.state function for data where class is state.
The inputs are df
for the data set and
var_name
for the column name. The default
var_name
is enrollment. First, the function will group the
data set by division and year, then it will summarize the mean of
var_name
. Then, it will filter out any rows where division
is an ERROR
. Finally it will plot the data using
ggplot
.
plot.state <- function(df, var_name = "enrollment"){
new_df <- df %>%
group_by(division, year) %>%
summarize(mean = mean(get(var_name))) %>%
filter(division != "ERROR")
ggplot(new_df, aes(x = year, y = mean, color = division)) + geom_line()
}
Now we will create the custom plot.state function for data where
class is county. The inputs are df
for the data set,
censusState
for the state (default is NC),
top_bottom
for the top or bottom n rows (default is top),
number
for the number of rows to return (default is 5), and
var_name
for the column name (default is enrollment).
To begin, we will first filter all of the data by state. Then, we
will use an if/else if
statement to determine what to do
based on whether the user selects the default of “top” or the
alternative “bottom”. If the user selects “top”, we will create a new
data set based on the filtered data set (filtered by state) where we
group by area_name, take the mean of the var_name, and then arrange the
data in descending order by mean of var_name. If the user opts for
“bottom”, we do exactly the same except we use the default ascending
sort order.
Once we have the new data frame, we will find the requested number of
area_name and save that into a new data set called
df_lookup
. Now, we will create a final data set. Here we
will add a variable called include
which will take the
value of 1 if the area_name is found in the df_lookup
data
set and 0 if it is not. We will then filter only the rows that match.
Finally, we will map this new data set that will include our requested
number of rows, top or bottom, by our selected state.
plot.county <- function(df, var_name = "enrollment", censusState = "NC", top_bottom = "top", number = 5){
df_filtered <- df %>%
filter(state == censusState)
if (top_bottom == "top"){
new_df <- df_filtered %>%
group_by(area_name) %>%
summarize(mean = mean(get(var_name))) %>%
arrange(desc(mean))
}
else if (top_bottom == "bottom"){
new_df <- df_filtered %>%
group_by(area_name) %>%
summarize(mean = mean(get(var_name))) %>%
arrange(mean)
}
df_lookup <- new_df[1:number, 1]
df_subset <- df_filtered %>%
mutate(include = if_else(area_name %in% df_lookup$area_name, "1", "0")) %>%
filter(include == "1")
ggplot(df_subset, aes(x = year, y = get(var_name), color = area_name)) + geom_line()
}
Now we will use everything we have created so far to output different data plots.
First, we will use the two csv files that we have been working with.
Run these using the my_wrapper
function first.
EDU01a_4 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv", "enrollment")
EDU01b_4 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv", "enrollment")
Then, use the combineData
function to put these into one
object.
allData <- combineData(EDU01a_4, EDU01b_4)
state
data frame.Note: NJ data missing for 1997 which explains the dip in region 2.
plot(allData$data_state, var_name = "enrollment")
county
data frame.Let’s find the plot using state “PA”, group is “top”, number is 7.
plot(data_all$data_county, censusState = "PA", top_bottom = "top", number = 7)
Now let’s specifying state is “PA”, group is “bottom”, number is 4.
plot(data_all$data_county, censusState = "PA", top_bottom = "bottom", number = 4)
Let’s use all of the defaults.
plot(data_all$data_county)
And now let’s run it specifying the state to be “MN”, group is top, number is 10.
plot(data_all$data_county, censusState = "MN", top_bottom = "top", number = 10)
Now we will get four entirely new data sets. First let’s use the
function my_wrapper
.
PST01a <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01a.csv", "enrollment")
PST01b <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01b.csv", "enrollment")
PST01c <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01c.csv", "enrollment")
PST01d <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01d.csv", "enrollment")
Now, run the combining function to combine data into one object. We have to run it three times since there are a total of four objects this time.
comboData1 <- combineData(PST01a, PST01b)
comboData2 <- combineData(comboData1, PST01c)
comboDataAll <- combineData(comboData2, PST01d)
Now for the plots. First, we will run the data by division.
plot(comboDataAll$data_state, var_name = "enrollment")
county
data frame.Now let’s plot the county-level data, first using state “CT”, group is “top”, number is 6.
plot(comboDataAll$data_county, censusState = "CT", top_bottom = "top", number = 6)
Now, let’s specify state is “NC”, group is “bottom”, number is 10.
plot(comboDataAll$data_county, censusState = "NC", top_bottom = "bottom", number = 10)
And now with the defaults.
plot(comboDataAll$data_county)
And finally, specifying the state to be “MN”, group is top, number is 4.
plot(comboDataAll$data_county, censusState = "MN", top_bottom = "top", number = 4)