Project Phase 1: Data Processing

Read in first data set (steps 1-3).

I started by using the library function to read in tidyverse and readr. Then I read in the csv file using the reader function read_csv and stored the original data in sheet1.

Next, I used chaining to clean the data. The select function selects the columns I wanted, pivot_longer transforms the data to long format. Then I used mutate to add two new columns. The first new column is for the year and the second new column was to keep track of which measurement was grabbed.

library(tidyverse)
library(readr)

sheet1 <- read_csv("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv") 

sheet2 <- sheet1 %>%
  select(area_name = Area_name, STCOU, ends_with("D")) %>%
  pivot_longer(cols = 3:12, names_to = "census_id", values_to = "enrollment")

sheet3a <- sheet2 %>%
  mutate(year = if_else(as.numeric(substr(census_id, 8, 9)) > 69, as.numeric(substr(census_id, 8, 9)) + 1900,     as.numeric(substr(census_id, 8, 9)) + 2000), measurementID = substr(census_id, 1, 7))

Split data into two datasets (step 4).

Now we will split the data into two sets, one that contains only state data and one that contains only county level data. First, I added a variable called “county_state” to distinguish between the different area names. Then I used filter to filter by that new variable and create two separate data sets. Finally, I added a class to each of the data sets.

sheet3 <- sheet3a %>%
  mutate(county_state = ifelse(grepl(pattern = ", \\w\\w", area_name), "county", "state"))

state <- filter(sheet3, county_state == "state")
county <- filter(sheet3, county_state == "county")

class(county) <- c("county", class(county))
class(state) <- c("state", class(state))

Create a new column identifying the state (step 5).

Using the mutate function, add the two character state abbreviation to the county data set. I used mutate to add a new variable called state and the substr function to grab the last two characters from the area_name.

county1 <- county %>%
  mutate(state = substr(area_name, nchar(area_name)-1, nchar(area_name)))

Categorize state data by region (step 6).

Create a separate list that includes the states for each division. Then, add column to connect the region to the row based on the state the data is from. If the data isn’t from a specific state, list an error.

Div1 = list("CONNECTICUT", "MAINE", "MASSACHUSETTS", "NEW HAMPSHIRE", "RHODE ISLAND", "VERMONT")
Div2 = list("NEW JERSEY", "NEW YORK", "PENNSYLVANIA")
Div3 = list("ILLINOIS", "INDIANA", "MICHIGAN", "OHIO", "WISCONSIN")
Div4 = list("IOWA", "KANSAS", "MINNESOTA", "MISSOURI", "NEBRASKA", "NORTH DAKOTA", "SOUTH DAKOTA")
Div5 = list("DELAWARE", "FLORIDA", "GEORGIA", "MARYLAND", "NORTH CAROLINA", "SOUTH CAROLINA", "VIRGINIA",       "DISTRICT OF COLUMBIA", "District of Columbia", "WEST VIRGINIA")
Div6 = list("ALABAMA", "KENTUCKY", "MISSISSIPPI", "TENNESSEE")
Div7 = list("ARKANSAS", "LOUISIANA", "OKLAHOMA", "TEXAS")
Div8 = list("ARIZONA", "COLORADO", "IDAHO", "MONTANA", "NEVADA", "NEW MEXICO", "UTAH", "WYOMING")
Div9 = list("ALASKA", "CALIFORNIA", "HAWAII", "OREGON", "WASHINGTON")

state1 <- mutate(state, division = if_else(area_name %in% Div1, "1", 
                 if_else(area_name %in% Div2, "2", 
                   if_else(area_name %in% Div3, "3",
                     if_else(area_name %in% Div4, "4", 
                       if_else(area_name %in% Div5, "5",
                         if_else(area_name %in% Div6, "6",
                           if_else(area_name %in% Div7, "7", 
                             if_else(area_name %in% Div8, "8",
                               if_else(area_name %in% Div9, "9", 
                                 "ERROR"))))))))))

Project Phase 2: Create Functions

Now we will use the code above to create functions which will work with multiple csv files.

Create function to import and manipulate data (steps 1 & 2).

Create function convertLong to read in raw csv file, select appropriate columns, and then switch to long format.

convertLong <- function(dataRaw, new_var = "enrollment"){
  
  dataRawLong <- dataRaw %>%
    select(area_name = Area_name, STCOU, ends_with("D")) %>%
    pivot_longer(cols = 3:12, names_to = "census_id", values_to = new_var)

  return(dataRawLong)

}

Create function that takes the results from the previous function and extract the year (step 3).

Create function extractYear to find the year that the data was collected. Since the year was 2 digit, add 1900 to any years between 70-99 (1970-1999) and add 2000 to any years from 00-69 (2000-2069).

extractYear <- function(data_long){

  dataRawLongYr <- data_long %>% 
    mutate(year = if_else(as.numeric(substr(census_id, 8, 9)) > 69, as.numeric(substr(census_id, 8, 9)) +            1900, as.numeric(substr(census_id, 8, 9)) + 2000), measurementID = substr(census_id, 1, 7))

  return(dataRawLongYr)

}

Create function to extract state (step 5).

Create function extractState to find the state for all of the county level data using the last two characters of the area_name.

extractState <- function(data_county){
  
  dataWSTate <- mutate(data_county, state = substr(area_name, nchar(area_name)-1, nchar(area_name)))

  return(dataWSTate)

}

Create function to create new variable called division (step 6).

Using lists, create function createDivision to map each state to its division. The function first creates lists for each division and then uses an if_else statement to identify the appropriate division for each state.

createDivision <- function(data_state){

  Div1 = list("CONNECTICUT", "MAINE", "MASSACHUSETTS", "NEW HAMPSHIRE", "RHODE ISLAND", "VERMONT")
  Div2 = list("NEW JERSEY", "NEW YORK", "PENNSYLVANIA")
  Div3 = list("ILLINOIS", "INDIANA", "MICHIGAN", "OHIO", "WISCONSIN")
  Div4 = list("IOWA", "KANSAS", "MINNESOTA", "MISSOURI", "NEBRASKA", "NORTH DAKOTA", "SOUTH DAKOTA")
  Div5 = list("DELAWARE", "FLORIDA", "GEORGIA", "MARYLAND", "NORTH CAROLINA", "SOUTH CAROLINA", "VIRGINIA",       "DISTRICT OF COLUMBIA", "District of Columbia", "WEST VIRGINIA")
  Div6 = list("ALABAMA", "KENTUCKY", "MISSISSIPPI", "TENNESSEE")
  Div7 = list("ARKANSAS", "LOUISIANA", "OKLAHOMA", "TEXAS")
  Div8 = list("ARIZONA", "COLORADO", "IDAHO", "MONTANA", "NEVADA", "NEW MEXICO", "UTAH", "WYOMING")
  Div9 = list("ALASKA", "CALIFORNIA", "HAWAII", "OREGON", "WASHINGTON")

  dataDiv <- mutate(data_state, division = if_else(area_name %in% Div1, "1", 
    if_else(area_name %in% Div2, "2", 
      if_else(area_name %in% Div3, "3",
        if_else(area_name %in% Div4, "4", 
          if_else(area_name %in% Div5, "5",
            if_else(area_name %in% Div6, "6",
              if_else(area_name %in% Div7, "7", 
                if_else(area_name %in% Div8, "8",
                  if_else(area_name %in% Div9, "9", 
                    "ERROR"))))))))))

  return(dataDiv)

}

Split data into county & state (step 4) then wrap functions for steps 5 & 6.

Create a function to split the data into two different data sets, one for the county-level data and one for the state data. First, create a new variable called county_state to determine which list the data should be split into. Then, filter the data into two separate data sets using filter. Once the data is split, run the function createDivision on the data set that has the state-level data and run the function extractState on the data that contains the county-level data.

#create function to split data
splitDataC <- function(data_longyr){

  #first add column to identify data into county & state
  dataClassification <- data_longyr %>%
    mutate(county_state = ifelse(grepl(pattern = ", \\w\\w", area_name), "county", "state"))

  #filter state data
  data_state <- dataClassification %>% 
    filter(county_state == "state")

  class(data_state) <- c("state", class(data_state))

  #run createDivision function to map divisions.
  stateF <- createDivision(data_state)

  #now filter county data.
  data_county <- dataClassification %>%
    filter(county_state == "county")
  
    class(data_county) <- c("county", class(data_county))

  #extract state from county data.  
  countyF <- extractState(data_county) 

  return(list(state = stateF, realcounty = countyF))

}

Wrap all of the functions together into one.

Now, create a function called my_wrapper to combine all of the other functions created above.

my_wrapper <- function(rawData, new_var = "enrollment"){
  data_imported <- read_csv(rawData)
  data_long <- convertLong(data_imported)
  data_longyr <- extractYear(data_long)
  results <- splitDataC(data_longyr)
  return(results)
  
}

Write a function to combine results.

Now, run function my_wrapper twice on two different data sets. Then, create a function called combineData to combine the results from the two data sets. The result will be two different data sets - one for the state-level data that has data from EDU01a & EDU01b and another for the county-level data, again from EDU01a & EDU01b.

EDU01a <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")

EDU01b <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv")

combineData <- function(data1, data2){
  data_state <- dplyr::bind_rows(data1$state, data2$state)
  data_county <- dplyr::bind_rows(data1$realcounty, data2$realcounty)
  
  return(list(data_state = data_state, data_county = data_county))
  
}

data_all <- combineData(EDU01a, EDU01b)  

Project Phase 3: Write a generic function for summarizing.

Next, we will create custom graphing functions.

Create plot.state function.

Create the custom plot.state function for data where class is state. The inputs are df for the data set and var_name for the column name. The default var_name is enrollment. First, the function will group the data set by division and year, then it will summarize the mean of var_name. Then, it will filter out any rows where division is an ERROR. Finally it will plot the data using ggplot.

plot.state <- function(df, var_name = "enrollment"){

  new_df <- df %>%
    group_by(division, year) %>%
    summarize(mean = mean(get(var_name))) %>%
    filter(division != "ERROR")

  ggplot(new_df, aes(x = year, y = mean, color = division)) + geom_line()  

}

Create plot.county function.

Now we will create the custom plot.state function for data where class is county. The inputs are df for the data set, censusState for the state (default is NC), top_bottom for the top or bottom n rows (default is top), number for the number of rows to return (default is 5), and var_name for the column name (default is enrollment).

To begin, we will first filter all of the data by state. Then, we will use an if/else if statement to determine what to do based on whether the user selects the default of “top” or the alternative “bottom”. If the user selects “top”, we will create a new data set based on the filtered data set (filtered by state) where we group by area_name, take the mean of the var_name, and then arrange the data in descending order by mean of var_name. If the user opts for “bottom”, we do exactly the same except we use the default ascending sort order.

Once we have the new data frame, we will find the requested number of area_name and save that into a new data set called df_lookup. Now, we will create a final data set. Here we will add a variable called include which will take the value of 1 if the area_name is found in the df_lookup data set and 0 if it is not. We will then filter only the rows that match. Finally, we will map this new data set that will include our requested number of rows, top or bottom, by our selected state.

plot.county <- function(df, var_name = "enrollment", censusState = "NC", top_bottom = "top", number = 5){
  
  df_filtered <- df %>%
    filter(state == censusState) 

  if (top_bottom == "top"){  
    new_df <- df_filtered %>%
      group_by(area_name) %>%
      summarize(mean = mean(get(var_name))) %>%
      arrange(desc(mean))
  }
   
  else if (top_bottom == "bottom"){
    new_df <- df_filtered %>%
      group_by(area_name) %>%
      summarize(mean = mean(get(var_name))) %>%
      arrange(mean)
  }

  df_lookup <- new_df[1:number, 1]

  df_subset <- df_filtered %>%
    mutate(include = if_else(area_name %in% df_lookup$area_name, "1", "0")) %>%
    filter(include == "1")

  ggplot(df_subset, aes(x = year, y = get(var_name), color = area_name)) + geom_line()  

}

Project Phase 4: Put it Together

Now we will use everything we have created so far to output different data plots.

Two enrollment URLs

First, we will use the two csv files that we have been working with. Run these using the my_wrapper function first.

EDU01a_4 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv", "enrollment")

EDU01b_4 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv", "enrollment")

Then, use the combineData function to put these into one object.

allData <- combineData(EDU01a_4, EDU01b_4)  

Use plot function on state data frame.

Note: NJ data missing for 1997 which explains the dip in region 2.

plot(allData$data_state, var_name = "enrollment")

Use plot function on county data frame.

Let’s find the plot using state “PA”, group is “top”, number is 7.

plot(data_all$data_county, censusState = "PA", top_bottom = "top", number = 7)

Now let’s specifying state is “PA”, group is “bottom”, number is 4.

plot(data_all$data_county, censusState = "PA", top_bottom = "bottom", number = 4)

Let’s use all of the defaults.

plot(data_all$data_county)

And now let’s run it specifying the state to be “MN”, group is top, number is 10.

plot(data_all$data_county, censusState = "MN", top_bottom = "top", number = 10)

Four New Data Sets

Now we will get four entirely new data sets. First let’s use the function my_wrapper.

PST01a <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01a.csv", "enrollment")
PST01b <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01b.csv", "enrollment")
PST01c <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01c.csv", "enrollment")
PST01d <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01d.csv", "enrollment")

Now, run the combining function to combine data into one object. We have to run it three times since there are a total of four objects this time.

comboData1 <- combineData(PST01a, PST01b)
comboData2 <- combineData(comboData1, PST01c)
comboDataAll <- combineData(comboData2, PST01d)

Plot new data for state data frames.

Now for the plots. First, we will run the data by division.

plot(comboDataAll$data_state, var_name = "enrollment")

Use plot function on new county data frame.

Now let’s plot the county-level data, first using state “CT”, group is “top”, number is 6.

plot(comboDataAll$data_county, censusState = "CT", top_bottom = "top", number = 6)

Now, let’s specify state is “NC”, group is “bottom”, number is 10.

plot(comboDataAll$data_county, censusState = "NC", top_bottom = "bottom", number = 10)

And now with the defaults.

plot(comboDataAll$data_county)

And finally, specifying the state to be “MN”, group is top, number is 4.

plot(comboDataAll$data_county, censusState = "MN", top_bottom = "top", number = 4)