VT Alliance Selection Protocol - Sample

Author

Elinor Benami

Published

May 17, 2024

Code
# Clear the workspace
rm(list = ls())

# load the libraries
library(tidyverse)
library(data.table) # to read in files faster
library(tidylog)
library(janitor)
library(scales) # for comma and formatting
library(knitr) #for kable
library(kableExtra)
library(dtplyr)

# set params ---- 
options(digits = 2) # show two significant digits
download_date <- as.Date("2024-05-01")
today_date <- Sys.Date()
'%nin%' <- function(x, table) !(x %in% table)

## Custom Functions ---
# Define the function to remove columns that are all NA
remove_na_columns <- function(df) {
  # Use select and where to filter out columns that are all NA
  df_selected <- df %>%
    select(where(~ !all(is.na(.))))

  # Return the modified dataframe
  return(df_selected)
}

flag_for_review <- function(df){
  df %>%
    group_by(lastname_entryid, farm_number) %>%
    mutate(
      sum_practice_units = sum(farm_practice_units, farm_practice_animals, na.rm = TRUE),
      over_totalfarm_units = sum_practice_units > farm_totalunits,
      na_count = rowSums(is.na(across(everything()))),
      flag_for_review = FALSE
    ) %>%
    group_by(lastname_entryid, farm_number) %>%
    mutate(
      flag_for_review = ifelse(over_totalfarm_units == TRUE & n() > 1 & na_count == max(na_count), 1, 0)
    ) %>%
    ungroup() %>% 
    select(flag_for_review, na_count, over_totalfarm_units, everything()) 
}

1 Load the Data

The application data is from 2024-05-01.

Code
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Load the data ---
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
data_path <- function(x) paste0(my_file_path, x)

# Read the data, tidy column names, and sort by last name
applications <- 
  fread(data_path(paste0("vt_alliance/application-", download_date,".csv"))) %>% 
  as.data.frame() %>% 
  clean_names() %>% 
  # convert all blank columns to NA's
  na_if("") %>% 
  na_if(" ") %>% 
  # these have an incorrect number of total acres/animal units (using the sum of the two farms instead)
  mutate(total_number_of_acres_or_animal_units = 
           # this entry had plans to enroll 160 but for farm 1 and farm 2 put 0
           ifelse(entry_id == 2324, 160, total_number_of_acres_or_animal_units)) %>%
  group_by(entry_id) %>% 
  # these entries had a total that didn't equal the sum of the two farms; defaulted to sum of two farms
  mutate(total_number_of_acres_or_animal_units = 
           case_when(
             entry_id %in% c(3851, 1853) ~
               sum(acres_or_animal_units_for_farm_number_1, 
                   acres_or_animal_units_for_farm_number_2, na.rm = TRUE),
             TRUE ~ total_number_of_acres_or_animal_units
  )) %>% 
  # this is an ironistic test account -- it also has also has some subfiles
  filter(entry_id %nin% c(2450, 2453, 2457))  %>% 
    mutate(
    primary_applicant_name = paste(primary_applicant_name_first, primary_applicant_name_last, sep = "_"),
    coapplicant1_name = ifelse(!is.na(co_applicant_number_1_name_first), 
                               paste(co_applicant_number_1_name_first, co_applicant_number_1_name_last, sep = "_"), NA),
    coapplicant2_name = ifelse(!is.na(co_applicant_number_2_name_first), 
                               paste(co_applicant_number_2_name_first, co_applicant_number_2_name_last, sep = "_"), NA), 
    state = trimws(gsub(" ", "_", tolower(state)))
  ) %>% 
  ungroup()  #  filter(primary_applicant_name_last != "Lee123") 

2 Generate a file that identifies and exports the farm, commodity, practice, and enhancement data

Code
process_applications <- function(applications, farm_number, included_state, excluded_states) {
  included_state <- gsub(" ", "_", tolower(included_state))
  state_title <- included_state

  if(farm_number == 2){ 
    farm_tractnum = "if_known_please_enter_the_tract_numbers_where_the_practices_will_be_installed"
    farm_practice = sym(paste0(included_state, "_planned_conservation_practices_practice_3"))
    farm_practice_units = paste0(included_state, "_planned_conservation_practices_number_of_acres_you_plan_to_enroll_2")
    farm_practice_animals = paste0(included_state, "_planned_conservation_practices_number_of_animals_you_plan_to_enroll_2")
    farm_enhancement_yesno = sym(paste0(included_state, "_planned_conservation_practices_add_practice_enhancement_optional_2"))
  } else if (farm_number == 1){
    farm_tractnum = sym(paste0("farm_number_", farm_number, "_tract_numbers"))
    farm_practice = sym(paste0(included_state, "_planned_conservation_practices_practice"))
    farm_practice_units = paste0(included_state, "_planned_conservation_practices_number_of_acres_you_plan_to_enroll")
    farm_practice_animals = paste0(included_state, "_planned_conservation_practices_number_of_animals_you_plan_to_enroll")
    farm_enhancement_yesno = sym(paste0(included_state, "_planned_conservation_practices_add_practice_enhancement_optional"))
  } else{
    print("Please enter a valid farm number (1 or 2 only)")
  }
  
  # Construct column names dynamically and force evaluation
  rename_list <- list(
    farm_commodity = sym(paste0(included_state, 
                                "_planned_conservation_practices_residue_and_tillage_management_no_till_enhancements_optional_", farm_number * 2)),
    farm_fsa = sym(paste0("please_enter_the_fsa_number_for_farm_number_", farm_number)),
    farm_tractnum = farm_tractnum,
    farm_practice = farm_practice,
    farm_practice_units = farm_practice_units,
    farm_practice_animals = farm_practice_animals,
    farm_totalunits = sym(paste0("acres_or_animal_units_for_farm_number_", farm_number)),
    farm_enhancement_yesno = farm_enhancement_yesno,
    farm_enhancement_yesno2 = sym(paste0(included_state, "_planned_conservation_practices_add_practice_enhancement_optional_yes"))
  )
  
  # Use rename with !!! to splice the list
  renamed_columns <- applications %>% rename(!!!rename_list)
  
  # Adjusting selection and filtering according to the input parameters
  processed_data <- renamed_columns %>%
    select(entry_id, state, contains("name"), contains(c("farm", "enhancements", "pdf")),
           -contains(c("name_middle", "prefix", "suffix"))) %>%
    mutate(
      lastname_entryid = ifelse(!is.na(primary_applicant_name_last), 
                                paste0(primary_applicant_name_last, "_", entry_id), NA_character_)
    ) %>%
    fill(lastname_entryid, contains(c("primary_applicant")), state, .direction = "down") %>%
    select(lastname_entryid, everything(), contains("enhancements"), pdf_alliance_application) %>%
    filter(state == state_title) %>%
    select(-contains(excluded_states)) %>%
    group_by(lastname_entryid) %>%
    fill(contains(c("applicant")), .direction = "down") %>% 
    ungroup() %>% 
    remove_na_columns()
  
  # Additional processing
  long2 <- 
    processed_data %>%
    mutate(
      primary_applicant_name = paste(primary_applicant_name_first, primary_applicant_name_last, sep = "_"),
      coapplicant1_name = ifelse(!is.na(co_applicant_number_1_name_first),
                                 paste(co_applicant_number_1_name_first, co_applicant_number_1_name_last, sep = "_"), NA),
      coapplicant2_name = ifelse(!is.na(co_applicant_number_2_name_first),
                                 paste(co_applicant_number_2_name_first, co_applicant_number_2_name_last, sep = "_"), NA)
    ) %>%
    pivot_longer(
      cols = contains("enhancements"),
      names_to = c("prefix", "enhancement_category"),
      names_sep = paste0(included_state, "_planned_conservation_practices_"),
      values_to = "farm_enhancement"
    ) %>%
    select(-contains(c("prefix", "category", "name_first", "name_last", "entry_id"))) %>%
    select(lastname_entryid, state, primary_applicant_name, coapplicant1_name, coapplicant2_name, 
           farm_commodity, contains("farm_practice"), everything()) %>%
    filter(rowSums(!is.na(select(., -c(1:3)))) > 0) %>%
    group_by(lastname_entryid) %>%
    fill(contains(c("fsa", "totalunits", 
                    "tractnum",
                    "pdf")), .direction = "down") %>%
    ungroup() %>%
    distinct()
  
  # Filter and reorganize
  long2 %>%
    mutate(
      completeness = rowSums(!is.na(select(., -lastname_entryid))),
      # filter_flag = round(farm_practice_units) == round(farm_totalunits),
      has_farm_enhancement = trimws(farm_enhancement_yesno),
      drop_flag = ifelse(has_farm_enhancement == "Yes" & is.na(farm_enhancement), 1, 0)
    ) %>%
    group_by(lastname_entryid) %>%
    filter(drop_flag == 0 | is.na(drop_flag),
           !is.na(farm_commodity)) %>%
    # mutate_add a flag for the farm number as input
    mutate(farm_number = farm_number) %>% 
    select(lastname_entryid, state,
           primary_applicant_name, 
           contains("coapplicant"),
           farm_number,
           farm_fsa, 
           farm_tractnum,
           farm_commodity,
           contains("farm_practice"), farm_totalunits, has_farm_enhancement, farm_enhancement,
           contains("pdf"), completeness)
}
# For Farm 1 in Virginia, exclude North Dakota, Arkansas, and Minnesota
va_farm1 <- process_applications(applications, farm_number = 1, "Virginia", c("north_dakota", "arkansas", "minnesota"))
va_farm2 <- process_applications(applications, farm_number = 2, "Virginia", c("north_dakota", "arkansas", "minnesota"))
va_farms_total <- bind_rows(va_farm1, va_farm2) %>% arrange(lastname_entryid)

nd_farm1 <- process_applications(applications, farm_number = 1, "North Dakota", c("virginia", "arkansas", "minnesota"))
nd_farm2 <- process_applications(applications, farm_number = 2, "North Dakota", c("virginia", "arkansas", "minnesota"))
nd_farms_total <- bind_rows(nd_farm1, nd_farm2) %>% arrange(lastname_entryid)

combine_nd_va <- bind_rows(va_farms_total, nd_farms_total) %>% arrange(state, lastname_entryid)
write.csv(combine_nd_va, data_path(paste0("vt_alliance/combinedfarms_", today_date, ".csv")))

3 View Basic Summary Stats in Advance of Randomization

state n_applications acres_or_animal_units
north_dakota 445 81,445
virginia 510 69,588
Total 955 151,033

4 Set the Random Seed and Conduct Selection

Code
# | output: asis

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Here a colleague provides a random seed number to ensure reproducibility ----
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
seed_number <- 20
set.seed(seed_number)

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# 1. Conduct Selection (Simple Version) ---
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Generate Unique ID's for recommended applicants in each state and append a randomized priority
applications <-
  applications %>%
  filter(district_verification_status == "Recommended") %>%
  arrange(primary_applicant_name_last) %>% 
  group_by(state) %>%
  mutate(
    randomized_priority = sample(n(), replace = FALSE)
  ) %>%
  ungroup() %>%
  select(
    randomized_priority, state, entry_id, farm_number_1_address_city, primary_applicant_name_last,
    everything()
  ) %>% 
  arrange(randomized_priority, state, entry_id)

# View the output
applications %>% head(10)
# A tibble: 10 × 612
   randomized_priority state        entry_id farm_number_1_address_city
                 <int> <chr>           <int> <chr>                     
 1                   1 north_dakota     3216 Donnybrook                
 2                   1 virginia         1920 St Stephens Church        
 3                   2 north_dakota     1291 Hazen                     
 4                   2 virginia         2030 Lovingston                
 5                   3 north_dakota     3249 Minot                     
 6                   3 virginia         1705 Kents Store               
 7                   4 north_dakota     2354 Carrington                
 8                   4 virginia          816 New Kent                  
 9                   5 north_dakota     3151 Grassy Butte              
10                   5 virginia         3865 Suffolk City              
# ℹ 608 more variables: primary_applicant_name_last <chr>,
#   primary_applicant_name_prefix <lgl>, primary_applicant_name_first <chr>,
#   primary_applicant_name_middle <lgl>, primary_applicant_name_suffix <lgl>,
#   primary_applicant_email <chr>, primary_applicant_phone <chr>, mobile <chr>,
#   landline <chr>, primary_applicant_address_street_address <chr>,
#   primary_applicant_address_address_line_2 <chr>,
#   primary_applicant_address_city <chr>, …

4.1 Export the Selection

This exports our new application file with the prioritized list for funding based on our randomization.

Code
# Export the list with the ids and priorities to a CSV file ----
fwrite(applications,
       file = data_path(paste0("vt_alliance/selection_files/selection-", today_date,"-final.csv")),
       row.names = FALSE)

5 Evaluate budget, acreage, and producer implications of selection

5.1 How many applications do we have that feature coapplicants?

First we’ll identify all the columns with any text in the last name column as a heuristic for the number of producers. We’ll need this to figure out what our estimated budget is, as applicants differ in the share of the payment requested per farm and whether or not they are eligible for the 25% incremental payment for HUP status.

Code
# | output: asis
# Step 1: Identify columns with that feature last names as our heuristic for distinct producers
identify_applicant_cols <- applications %>% select(ends_with("name_last")) %>% colnames(); identify_applicant_cols
[1] "primary_applicant_name_last"     "co_applicant_number_1_name_last"
[3] "co_applicant_number_2_name_last"
Code
# Step 2: Strip out leading and trailing white space from selected columns and 
# Create new columns with "_flag" suffix indicating whether each cell in the 
# identified cols contains text (1) or not (0), without replacing the original values.
applications <- 
  applications %>%
  mutate(across(all_of(identify_applicant_cols), ~str_squish(.))) %>%
  mutate(across(all_of(identify_applicant_cols), ~if_else(is.na(.x) | .x == "", 0, 1), .names = "{.col}_flag")) %>% 
  # generate a column with the total number of applicants per application
  mutate(total_applicants = rowSums(select(., ends_with("_flag"))))

applications %>%
  tabyl(total_applicants) %>% 
  kable(
    digits = c(1,1,2),
    caption = "Distribution of Applications with Multiple Applicants"
    )
Distribution of Applications with Multiple Applicants
total_applicants n percent
1 896 0.94
2 33 0.03
3 26 0.03

The majority of applications have only one applicant, but there are a few with multiple applicants. The total number of applicants is the “total_applicants” times the number of applications with that number of applicants.

5.2 How many total applicants do we have?

Code
# | output: asis

# Step 3: Generate a total count of the number of times we have text in each of those columns by state --- 
total_producers_by_state <- applications %>%
  select(ends_with("_flag"), state) %>%
  group_by(state) %>%
  summarise(across(everything(), ~sum(. == 1, na.rm = TRUE))) %>%
  # Convert to long format for ease of viewing
  pivot_longer(cols = -state, names_to = "category", values_to = "value") %>%
  arrange(desc(state), desc(value)) %>%
  adorn_totals("row")

total_producers_by_state %>% 
  kable(
    caption = "Number of Applicants by State"
    )
Number of Applicants by State
state category value
virginia primary_applicant_name_last_flag 510
virginia co_applicant_number_1_name_last_flag 24
virginia co_applicant_number_2_name_last_flag 7
north_dakota primary_applicant_name_last_flag 445
north_dakota co_applicant_number_1_name_last_flag 32
north_dakota co_applicant_number_2_name_last_flag 22
Total - 1040
Code
total_producers <- total_producers_by_state %>% filter(state == "Total") %>% pull(value)
total_producers_va <- total_producers_by_state %>% filter(state == "virginia") %>% summarise(n = sum(value)) %>% pull(n)
total_producers_nd <- total_producers_by_state %>% filter(state == "north_dakota") %>%  summarise(n= sum(value)) %>% pull(n)

5.3 Generate counts of and flags for historically underserved producers

First we will identify all the columns that relate to our HUP flags and determine who they apply to (primary applicant, the first co-applicant, indicated with “_2”, or the second co-applicant, indicated with “_3”).

Code
# Step 1: Identify columns that classify as HUP
define_hup <- 
  applications %>%
  select(matches("limited_resource_producer|small_producer|beginning_producer|women|
                 female_owned_operation|socially_disadvantaged_producer|veteran_producer")) %>%
  colnames(); define_hup
 [1] "beginning_producer"                "small_producer"                   
 [3] "x100_percent_women_run_operation"  "limited_resource_producer"        
 [5] "socially_disadvantaged_producer"   "veteran_producer"                 
 [7] "beginning_producer_2"              "small_producer_2"                 
 [9] "limited_resource_producer_2"       "socially_disadvantaged_producer_2"
[11] "veteran_producer_2"                "beginning_producer_3"             
[13] "small_producer_3"                  "limited_resource_producer_3"      
[15] "socially_disadvantaged_producer_3" "veteran_producer_3"               
Code
define_hup_primary <- define_hup %>% str_subset(pattern = "^(?!.*(_2|_3)).*$"); define_hup_primary
[1] "beginning_producer"               "small_producer"                  
[3] "x100_percent_women_run_operation" "limited_resource_producer"       
[5] "socially_disadvantaged_producer"  "veteran_producer"                
Code
define_hup_secondary <- define_hup %>%  str_subset(pattern = "_2"); define_hup_secondary
[1] "beginning_producer_2"              "small_producer_2"                 
[3] "limited_resource_producer_2"       "socially_disadvantaged_producer_2"
[5] "veteran_producer_2"               
Code
define_hup_tertiary <- define_hup %>%  str_subset(pattern = "_3"); define_hup_tertiary
[1] "beginning_producer_3"              "small_producer_3"                 
[3] "limited_resource_producer_3"       "socially_disadvantaged_producer_3"
[5] "veteran_producer_3"               
Code
# Step 2: Strip out leading and trailing white space from selected columns (str_squish)
# Step 3: Convert all cells without text (including NA's) to 0
# Step 4: Add flags for whether any of the columns in the define_hup lists contain text (1) or not (0)
applications <- 
  applications %>%
  mutate(across(all_of(define_hup), 
                ~if_else(is.na(str_squish(.)) | str_squish(.) == "", 0, 1))) %>% 
  mutate(
    hup_sum = rowSums(select(., define_hup)),     # by application
    # by producer
    hup_primary = rowSums(select(., define_hup_primary)),
    hup_secondary = rowSums(select(., define_hup_secondary)),
    hup_tertiary = rowSums(select(., define_hup_tertiary))
  ) %>%
  mutate(
    hup_any = as.numeric(hup_sum >= 1),     # by application
    # by producer
    hup_primary_any = as.numeric(hup_primary >= 1),
    hup_secondary_any = as.numeric(hup_secondary >= 1),
    hup_tertiary_any = as.numeric(hup_tertiary >= 1)
  ) %>%
  select(
    hup_any, hup_primary_any, hup_secondary_any, hup_tertiary_any,
    hup_sum, hup_primary, hup_secondary, hup_tertiary, define_hup, everything()
  ) %>%
  # order from largest to smallest (desc)
  arrange(across(c(hup_sum, hup_any, hup_primary, hup_primary_any, hup_secondary, hup_secondary_any, 
          hup_tertiary, hup_tertiary_any, define_hup), desc))

applications %>% head(10) 
# A tibble: 10 × 624
   hup_any hup_primary_any hup_secondary_any hup_tertiary_any hup_sum
     <dbl>           <dbl>             <dbl>            <dbl>   <dbl>
 1       1               1                 0                0       6
 2       1               1                 1                0       6
 3       1               1                 1                1       6
 4       1               1                 1                1       6
 5       1               1                 0                0       5
 6       1               1                 0                0       5
 7       1               1                 0                0       5
 8       1               1                 0                0       4
 9       1               1                 0                0       4
10       1               1                 0                0       4
# ℹ 619 more variables: hup_primary <dbl>, hup_secondary <dbl>,
#   hup_tertiary <dbl>, beginning_producer <dbl>, small_producer <dbl>,
#   x100_percent_women_run_operation <dbl>, limited_resource_producer <dbl>,
#   socially_disadvantaged_producer <dbl>, veteran_producer <dbl>,
#   beginning_producer_2 <dbl>, small_producer_2 <dbl>,
#   limited_resource_producer_2 <dbl>, socially_disadvantaged_producer_2 <dbl>,
#   veteran_producer_2 <dbl>, beginning_producer_3 <dbl>, …

5.3.1 Summarize HUP status by category and state

Code
# Step 5a: Summarize hup status by category and state (a producer may fall into multiple categories) ----
hup_summary <-
  applications %>%
  select(define_hup, state) %>%
  group_by(state) %>%
  summarise(across(everything(), ~sum(. == 1, na.rm = TRUE))) %>%
  ungroup() %>%
  # convert to long format for ease of viewing
  pivot_longer(cols = -state, names_to = "category", values_to = "count") %>%
  arrange(desc(state), desc(count)) %>%
  adorn_totals("row") %>%
  mutate(pct_producers_per_state = case_when(
    state == "virginia" ~ round(count / total_producers_va, 2),
    state == "north_dakota" ~ round(count / total_producers_nd, 2),
    TRUE ~ NA_real_  # Handle other states if needed
  )) %>% 
  # drop if state is blank or na
  filter(!is.na(state) & state != "")


# Step 5b: Generate a summary of hup ANY status by state (by producer) -----
hup_any_per_state <-
  applications %>% 
  select(hup_primary_any, hup_secondary_any, hup_tertiary_any, state) %>% 
  group_by(state) %>%
  summarise(across(everything(), ~sum(. == 1, na.rm = TRUE))) %>%
  ungroup() %>% 
  # convert to long format
  pivot_longer(cols = -state, 
               names_to = "category", 
               values_to = "count") %>% 
  arrange(desc(state), desc(count)) %>% 
  adorn_totals("row")   %>% 
  mutate(pct_producers_per_state = case_when(
    state == "virginia" ~ round(count / total_producers_va,2),
    state == "north_dakota" ~ round(count / total_producers_nd,2),
    TRUE ~ NA_real_  # Handle other states if needed
  )) %>%  # drop if state is blank or na
  filter(!is.na(state) & state != "")

# scales::percent(hup_any_per_state$pct_producers_per_state, accuracy = 0.01)

# View the outputs
hup_summary %>% kable(title = "Counts of HUP status by producer (note a producer may have multiple flags)") 
state category count pct_producers_per_state
virginia small_producer 227 0.42
virginia beginning_producer 126 0.23
virginia x100_percent_women_run_operation 58 0.11
virginia socially_disadvantaged_producer 35 0.06
virginia veteran_producer 22 0.04
virginia limited_resource_producer 17 0.03
virginia small_producer_2 12 0.02
virginia beginning_producer_2 6 0.01
virginia socially_disadvantaged_producer_2 5 0.01
virginia small_producer_3 3 0.01
virginia beginning_producer_3 2 0.00
virginia limited_resource_producer_2 0 0.00
virginia veteran_producer_2 0 0.00
virginia limited_resource_producer_3 0 0.00
virginia socially_disadvantaged_producer_3 0 0.00
virginia veteran_producer_3 0 0.00
north_dakota small_producer 118 0.24
north_dakota beginning_producer 95 0.19
north_dakota x100_percent_women_run_operation 25 0.05
north_dakota socially_disadvantaged_producer 14 0.03
north_dakota beginning_producer_2 7 0.01
north_dakota veteran_producer 5 0.01
north_dakota small_producer_2 3 0.01
north_dakota beginning_producer_3 3 0.01
north_dakota socially_disadvantaged_producer_3 2 0.00
north_dakota limited_resource_producer 1 0.00
north_dakota socially_disadvantaged_producer_2 1 0.00
north_dakota limited_resource_producer_2 0 0.00
north_dakota veteran_producer_2 0 0.00
north_dakota small_producer_3 0 0.00
north_dakota limited_resource_producer_3 0 0.00
north_dakota veteran_producer_3 0 0.00
Total - 787 NA
Code
hup_any_per_state %>% 
  kable(title = "Counts when any producer has any HUP status", booktabs = TRUE)
state category count pct_producers_per_state
virginia hup_primary_any 307 0.57
virginia hup_secondary_any 18 0.03
virginia hup_tertiary_any 3 0.01
north_dakota hup_primary_any 210 0.42
north_dakota hup_secondary_any 9 0.02
north_dakota hup_tertiary_any 5 0.01
Total - 552 NA
Code
n_hup <- hup_any_per_state %>% filter(state == "Total") %>% pull(count)
pct_hup <- n_hup / total_producers; pct_hup
[1] 0.53
Code
scales::percent(pct_hup)
[1] "53%"

The number of applicants that fall into any HUP category is 552 (rscales::percent(pct_hup))`).

5.4 Generate counts of and flags for limited resource producers

First we will identify all the columns that relate to our LRP flags

Code
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# C. Generate a total count of limited resource and socially disadvantaged producers
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
gen_summary <- function(str_pattern) {
  summary <- 
    applications %>%
    select(matches(str_pattern), state) %>%
    group_by(state) %>%
    summarise(across(matches(str_pattern), ~sum(. == 1, na.rm = TRUE))) %>%
    pivot_longer(cols = -state, names_to = "category", values_to = "count") %>%
    arrange(desc(state), desc(count)) %>%
    adorn_totals("row") %>%
    mutate(pct_producers_state = case_when(
      state == "Virginia" ~ round(count / total_producers_va, 2),
      state == "North Dakota" ~ round(count / total_producers_nd, 2),
      TRUE ~ NA_real_  # Handle other states if needed
    )) %>%
    filter(!is.na(state) & state != "")
  
  return(summary)
}

lrp_summary <- gen_summary("limited_resource_producer"); lrp_summary %>% kable()
state category count pct_producers_state
virginia limited_resource_producer 17 NA
virginia limited_resource_producer_2 0 NA
virginia limited_resource_producer_3 0 NA
north_dakota limited_resource_producer 1 NA
north_dakota limited_resource_producer_2 0 NA
north_dakota limited_resource_producer_3 0 NA
Total - 18 NA
Code
sdp_summary <- gen_summary("socially_disadvantaged_producer"); sdp_summary %>% kable()
state category count pct_producers_state
virginia socially_disadvantaged_producer 35 NA
virginia socially_disadvantaged_producer_2 5 NA
virginia socially_disadvantaged_producer_3 0 NA
north_dakota socially_disadvantaged_producer 14 NA
north_dakota socially_disadvantaged_producer_3 2 NA
north_dakota socially_disadvantaged_producer_2 1 NA
Total - 57 NA

The number of producers that are in the LRP and SDP categories are:

Code
# Calculate total LRP producers ----
total_lrp_producers <- lrp_summary %>% filter(state == "Total") %>% pull(count)
total_sdp_producers <- sdp_summary %>% filter(state == "Total") %>% pull(count)

# Calculate percentage of LRP producers ----
pct_lrp <- total_lrp_producers / total_producers; pct_lrp
[1] 0.017
Code
pct_sdp <- total_sdp_producers / total_producers; pct_sdp
[1] 0.055

6 Estimate implications on program targets (producers, acreage, budget)

First we need to account for the fact that there are up to two farms per application but up to three applicants, and each person may have their own HUP status. We also have different payment shares per applicant/coapplicant to account for.

Code
applications_slim <- 
  applications %>% 
  # create payment rate columns for each primary, coapplicant, and coapplicant 1
  # should be 125 if hup is 1 for hup_primary, hup_secondary, or hup_tertiary, else 100
  # bonus payments for lrp should be 500 if limited_resource_producer is 1, else 0
  mutate(
      payment_rate_primary = if_else(hup_primary_any == 1, 125, 100),
      payment_rate_coapplicant1 = if_else(hup_secondary_any == 1, 125, 100),
      payment_rate_coapplicant2 = if_else(hup_tertiary_any == 1, 125, 100), 
      bonus_lrp_primary = if_else(limited_resource_producer == 1, 500, 0),
      bonus_lrp_coapplicant1 = if_else(limited_resource_producer_2 == 1, 500, 0),
      bonus_lrp_coapplicant2 = if_else(limited_resource_producer_3 == 1, 500, 0)
  ) %>% 
  # turn NA's to 0's
  mutate_at(vars(contains("share_of_payment_percent")), ~ifelse(is.na(.), 0, ./100)) %>% 
  group_by(entry_id, state) %>%
  mutate(primary_applicant_number_1_share_of_payment_percent = 
           ifelse(total_applicants == 1, 1, primary_applicant_number_1_share_of_payment_percent)) %>% 
  mutate(budget_farm_1 = 
           primary_applicant_number_1_share_of_payment_percent * acres_or_animal_units_for_farm_number_1 * payment_rate_primary + bonus_lrp_primary +
           co_applicant_number_1_share_of_payment_percent * acres_or_animal_units_for_farm_number_1 * payment_rate_coapplicant1 + bonus_lrp_coapplicant1, 
         budget_farm_2 = 
          # if there is a farm 2, calculate its budget
           ifelse(!is.na(acres_or_animal_units_for_farm_number_2),
           primary_applicant_share_of_payment_percent * acres_or_animal_units_for_farm_number_2 * payment_rate_primary +  bonus_lrp_primary +
           co_applicant_number_2_share_of_payment_percent * acres_or_animal_units_for_farm_number_2 * payment_rate_coapplicant2 + bonus_lrp_coapplicant2, 0)
         ) %>%
  mutate(budget_application = sum(budget_farm_1, budget_farm_2, na.rm = TRUE),
         total_acres = sum(acres_or_animal_units_for_farm_number_1, acres_or_animal_units_for_farm_number_2, na.rm = TRUE)) %>%
  ungroup() %>%
  select(randomized_priority, state, entry_id, total_acres, budget_application, total_applicants, budget_farm_1, budget_farm_2, 
         contains("rate"), contains("bonus"),
         everything()) %>% 
  arrange(randomized_priority, state, entry_id, total_acres, budget_application)

6.1 Summary of Sample Selection Implications Across All States

Code
applications_slim %>%
     arrange(randomized_priority, state)  %>% 
     group_by(state) %>% 
     summarize(
       total_acres_au = sum(total_acres, na.rm = TRUE),
       total_producers = sum(total_applicants, na.rm = TRUE),
       total_budget = sum(budget_application, na.rm = TRUE)
     ) %>% 
  adorn_totals("row") %>% 
  mutate(across(where(is.numeric), ~comma(.))) %>%
  kable(caption = "Summary of Selection Implications Across All States, if all were funded (and assuming all animal units are as stated in application)")
Summary of Selection Implications Across All States, if all were funded (and assuming all animal units are as stated in application)
state total_acres_au total_producers total_budget
north_dakota 100,081 499 7,364,194
virginia 81,864 541 5,728,970
Total 181,945 1,040 13,093,163

6.2 Apply selection thresholds to applications

We’ll first flag the applications that meet the priority criteria (LRP and SDP) and select the first 75% of those flagged. Of the remaining applications, we’ll then select 50% from VA and 75% from ND.

The priority flags are in any of these columns:

Code
# Parameters
priority_flags <- 
  applications %>%
  select(matches("limited_resource_producer|socially_disadvantaged_producer")) %>%
  colnames(); priority_flags
[1] "limited_resource_producer"         "socially_disadvantaged_producer"  
[3] "limited_resource_producer_2"       "socially_disadvantaged_producer_2"
[5] "limited_resource_producer_3"       "socially_disadvantaged_producer_3"

Then apply the given thresholds:

Code
# Define the flagged_fraction as 75% for flagged applications
flagged_fraction <- 0.76

# First, select 75% of all ND applicants and 50% of all VA applicants, including both flagged and unflagged applications
applications_selection_status <- 
  applications_slim %>%
  arrange(randomized_priority, state) %>%
  ungroup() %>% 
  group_by(state) %>%
  mutate(
    # Create a flag column that checks if any priority flags are TRUE for each row
    flag = rowSums(across(all_of(priority_flags)) == TRUE) > 0,
    n_state = n(),
    # Dynamically assign state_fraction based on the state
    state_fraction = case_when(
      state == "virginia" ~ 0.5,
      state == "north_dakota" ~ 0.75,
      TRUE ~ 0.75  # Default value for other states
    ),
    # Compute the row number for selection within each state
    row_num_for_selection = row_number(),
    # Select applicants based on the state_fraction
    selection_status = ifelse(!flag & (row_num_for_selection <= (n_state * state_fraction)), "selected", NA_character_)) %>% 
  mutate(
     # Compute the number of flagged applications to select based on flagged_fraction
    total_flagged = sum(flag),
    select_flagged_count = ceiling(total_flagged * flagged_fraction),
    flag_cumulative = cumsum(flag),
    # Mark selected flagged applications
    selection_status = ifelse(flag & flag_cumulative <= select_flagged_count, "selected", selection_status),
  ) %>%
  ungroup() %>% 
  mutate(selection_status = replace_na(selection_status, "not selected"))

# Display the updated dataset
applications_selection_status %>% head(10)
# A tibble: 10 × 642
   randomized_priority state        entry_id total_acres budget_application
                 <int> <chr>           <int>       <dbl>              <dbl>
 1                   1 north_dakota     3216       320               16000 
 2                   1 virginia         1920        14.2              1775 
 3                   2 north_dakota     1291       126               12600 
 4                   2 virginia         2030       131               16375 
 5                   3 north_dakota     3249       320               16000 
 6                   3 virginia         1705       119.               8162.
 7                   4 north_dakota     2354       320               36000 
 8                   4 virginia          816       320               16000 
 9                   5 north_dakota     3151       294.              13450 
10                   5 virginia         3865       194.              20000 
# ℹ 637 more variables: total_applicants <dbl>, budget_farm_1 <dbl>,
#   budget_farm_2 <dbl>,
#   i_have_not_operated_a_farm_or_ranch_or_i_have_operated_my_farm_or_ranch_for_less_than_10_years <chr>,
#   i_have_not_operated_a_farm_or_ranch_or_i_have_operated_my_farm_or_ranch_for_less_than_10_years_2 <lgl>,
#   i_have_not_operated_a_farm_or_ranch_or_i_have_operated_my_farm_or_ranch_for_less_than_10_years_3 <lgl>,
#   payment_rate_primary <dbl>, payment_rate_coapplicant1 <dbl>,
#   payment_rate_coapplicant2 <dbl>, bonus_lrp_primary <dbl>, …
Code
# Add a check to see how many flagged applications are selected by state
flagged_selection_summary <- applications_selection_status %>%
  group_by(state) %>%
  summarize(
    total_applications = n(),
    total_flagged = sum(flag),
    selected_flagged = sum(flag & selection_status == "selected")
  ) %>%
  mutate(pct_flagged = selected_flagged / total_flagged,
         pct_flagged_selected = selected_flagged / total_applications)

# # Display the summary
# flagged_selection_summary %>% kable()

6.3 Export application file with labels for “selected” or “not selected”

Once have finalized the input data and the relevant conversions, we will export the application file with the “selected” and “not selected” columns in the lefthand most column for ease of viewing. The below shows a sample

Code
applications_selection_status_slim <-
  applications_selection_status %>%
  mutate(primary_applicant_name = paste(primary_applicant_name_first, primary_applicant_name_last, sep = "_"),
         coapplicant1_name = ifelse(!is.na(co_applicant_number_1_name_first), 
                               paste(co_applicant_number_1_name_first, co_applicant_number_1_name_last, sep = "_"), NA),
         coapplicant2_name = ifelse(!is.na(co_applicant_number_2_name_first), 
                               paste(co_applicant_number_2_name_first, co_applicant_number_2_name_last, sep = "_"), NA),
         lastname_entryid = paste0(primary_applicant_name_last, "_", entry_id)) %>% 
  select(selection_status, flag, entry_id, primary_applicant_name_last, state, #randomized_priority, 
         matches(".*primary.*name.*"),  matches(".*primary.*address.*"), matches(".*primary.*email.*"), 
         matches(".*primary.*phone.*"), hup_primary_any, contains("counties"),
         matches(".*farm.*address.*"),   matches(".*coapplicant.*name.*"),  matches(".*co_applicant.*address.*"),
         matches(".*co_applicant.*email.*"), matches(".*co_applicant.*phone.*"), contains("share_of_payment"),
         total_acres, budget_application, contains("zip"), contains("virginia_counties"), contains("north_dakota_counties"), contains("name"), define_hup, contains("hup"), are_you_interested_in_using_comet_farm,
         # contains("comet"),
         pdf_alliance_application) %>% 
  select(-contains(c("name_middle", "prefix", "suffix", "name_first", "name_last", "minnesota", "arkansas"))) %>% 
  select(selection_status, flag, entry_id, state, primary_applicant_address_zip_postal_code, contains("counties"), everything())

head(applications_selection_status_slim)
# A tibble: 6 × 81
  selection_status flag  entry_id state        primary_applicant_address_zip_p…¹
  <chr>            <lgl>    <int> <chr>        <chr>                            
1 selected         FALSE     3216 north_dakota 58734                            
2 selected         FALSE     1920 virginia     23148                            
3 selected         FALSE     1291 north_dakota 58545                            
4 selected         FALSE     2030 virginia     27519                            
5 selected         FALSE     3249 north_dakota 58701                            
6 selected         FALSE     1705 virginia     23084                            
# ℹ abbreviated name: ¹​primary_applicant_address_zip_postal_code
# ℹ 76 more variables: north_dakota_counties <chr>, virginia_counties <chr>,
#   north_dakota_counties_2 <chr>, virginia_counties_2 <chr>,
#   primary_applicant_name <chr>,
#   primary_applicant_address_street_address <chr>,
#   primary_applicant_address_address_line_2 <chr>,
#   primary_applicant_address_city <chr>, …
Code
# fwrite(applications_selection_status_slim,
#        file = data_path(paste0("vt_alliance/selection_files/selection_run-",
#                                today_date,"-final-seed-", seed_number,".csv")), row.names = FALSE)

6.4 Merge in the selection data with farm/crop/practice/enhancement information

Code
# Merge in the selection data with the farm/crop/practice/enhancement information
check <- 
  applications_selection_status_slim %>%
  full_join(combine_nd_va, by = c("lastname_entryid", "state", "primary_applicant_name", "coapplicant1_name", "coapplicant2_name", "pdf_alliance_application")) %>% 
  # full_join(va_farms_total_flag, by = c("lastname_entryid", "pdf_alliance_application",) %>% 
  mutate(flag = as.integer(as.logical(flag)),
         counties = coalesce(north_dakota_counties, virginia_counties)) %>% 
  rename(lrp_sdp_priority_flag = flag,
         primary_applicant_number_2_share_of_payment_percent =  primary_applicant_share_of_payment_percent) %>% 
  select(-contains(c
                   ("counties_2", "virginia_counties", "north_dakota_counties", "small", "women", "veteran",
                     "beginning")), 
         -primary_applicant_address_country, 
         -farm_number_1_address_country, -farm_number_2_address_country, -contains("country"), -representative_name, everything()) %>% 
  mutate(farm_number_1_address_state_province = ifelse(state == "virginia", "VA", "ND"),
         farm_number_2_address_state_province = ifelse(state == "virginia", "VA", "ND"))


# Function to concatenate address parts
concatenate_address <- function(street, line2, city, state, zip) {
  address <- paste0(
    street, ", ",
    ifelse(is.na(line2) | line2 == "", "", paste0(line2, " ")),
    city, ", ",
    state, " ",
    zip
  )
  return(address)
}

# List of address prefixes to process
address_prefixes <- c("farm_number_1_address", "farm_number_2_address", 
                      "primary_applicant_address", "co_applicant_number_1_address",  "co_applicant_number_2_address")

# Function to create full address columns
create_full_address <- function(df, prefix) {
  df %>%
    mutate("{prefix}_full_address" := pmap_chr(list(
      df[[paste0(prefix, "_street_address")]],
      df[[paste0(prefix, "_address_line_2")]],
      df[[paste0(prefix, "_city")]],
      df[[paste0(prefix, "_state_province")]],
      df[[paste0(prefix, "_zip_postal_code")]]
    ), concatenate_address))
}

# Apply the function to each prefix and reduce the data frame
addy_check <- 
  reduce(address_prefixes, create_full_address, .init = check)  %>%
  select(-matches("address"), contains("full_address")) %>% 
  select(-contains(c("completeness", "small", "women", "veteran","beginning", "representative", 
                     "virginia_counties", "north_dakota_counties")))

farm1_details <- 
  addy_check %>% 
  filter(farm_number == 1) %>% 
  select(-contains(c("farm_number_2", "coapplicant2", "co_applicant_number_2", "producer_3", "tertiary",
                     "hup_sum", "primary_applicant_number_2_share_of_payment_percent")))

farm2_details <- 
  addy_check %>% 
  filter(farm_number == 2) %>% 
  select(-contains(c("farm_number_1", "coapplicant1", "co_applicant_number_1", "producer_2", "secondary",
                     "hup_sum", "primary_applicant_number_1_share_of_payment_percent")))
  
all_farm_details <- 
  bind_rows(farm1_details, farm2_details) %>% 
    select(-contains("pdf"), everything(), contains("pdf"))

# Combine co_applicant_number_1 and co_applicant_number_2 related columns using coalesce
all_farms_combine <- all_farm_details %>%
  mutate(
    coapplicant_name = coalesce(coapplicant1_name, coapplicant2_name),
    co_applicant_email = coalesce(co_applicant_number_1_email, co_applicant_number_2_email),
    co_applicant_phone = coalesce(co_applicant_number_1_phone, co_applicant_number_2_phone),
    co_applicant_share_of_payment_percent = coalesce(co_applicant_number_1_share_of_payment_percent, co_applicant_number_2_share_of_payment_percent),
    primary_applicant_share_of_payment_percent = coalesce(primary_applicant_number_1_share_of_payment_percent, primary_applicant_number_2_share_of_payment_percent), 
    farm_address = coalesce(farm_number_1_address_full_address, farm_number_2_address_full_address),
    coapplicant_address = coalesce(co_applicant_number_1_address_full_address, co_applicant_number_2_address_full_address),
    hup_coapplicant = coalesce(hup_secondary_any, hup_tertiary_any),
    socially_disadvantaged_coapplicant = coalesce(socially_disadvantaged_producer_2, socially_disadvantaged_producer_3),
    limited_resource_coapplicant = coalesce(limited_resource_producer_2, limited_resource_producer_3)
  ) %>%
  select(
    -coapplicant1_name, -coapplicant2_name,
    -co_applicant_number_1_email, -co_applicant_number_2_email,
    -co_applicant_number_1_phone, -co_applicant_number_2_phone,
    -co_applicant_number_1_share_of_payment_percent, -co_applicant_number_2_share_of_payment_percent,
    -primary_applicant_number_1_share_of_payment_percent, -primary_applicant_number_2_share_of_payment_percent, 
    -farm_number_1_address_full_address, -farm_number_2_address_full_address,
    -co_applicant_number_1_address_full_address, -co_applicant_number_2_address_full_address,
    -hup_primary, -hup_secondary, -hup_tertiary, 
    -socially_disadvantaged_producer_2, -socially_disadvantaged_producer_3,
    -hup_secondary_any, -hup_tertiary_any,
    -limited_resource_producer_2, -limited_resource_producer_3
  ) %>% 
  arrange(desc(selection_status), state, primary_applicant_name, entry_id, coapplicant_name) 
#%>% 
  # filter(!is.na(entry_id))

col_ordering <- 
  c(
  "selection_status", 
  "entry_id", 
  # "lastname_entryid", 
  "state", 
  "primary_applicant_name", 
  "primary_applicant_address_full_address", 
  "primary_applicant_email", 
  "primary_applicant_phone", 
  # underserved status for primary applicant
  "lrp_sdp_priority_flag", 
  "hup_primary_any", 
  "limited_resource_producer", 
  "socially_disadvantaged_producer", 
  # farm information
  "farm_number", 
  "counties", 
  "farm_address", 
  "farm_fsa", 
  "farm_tractnum", 
  "farm_commodity", 
  "farm_practice", 
  "has_farm_enhancement", 
  "farm_enhancement", 
  "farm_practice_units",
  "farm_practice_animals", 
  "farm_totalunits", 
  "total_acres", 
  
  "coapplicant_name", 
  "coapplicant_address", 
  "co_applicant_email", 
  "co_applicant_phone", 
  "limited_resource_coapplicant", 
  "socially_disadvantaged_coapplicant", 
  "hup_coapplicant",

  "budget_application", 
  "primary_applicant_share_of_payment_percent", 
  "co_applicant_share_of_payment_percent", 
  
  # "hup_any", # on the fence about whether to include this
  "are_you_interested_in_using_comet_farm",
  "pdf_alliance_application"
)

# Custom function to convert text to title case except for state abbreviations
to_title_case_except_states <- function(address) {
  # Define a vector of state abbreviations
  state_abbreviations <- c(
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", 
    "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", 
    "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", 
    "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", 
    "VT", "VA", "WA", "WV", "WI", "WY"
  )
  
  # Convert the address to title case
  address_title_case <- str_to_title(address)
  
  # Replace state abbreviations with uppercase versions
  for (state in state_abbreviations) {
    address_title_case <- str_replace_all(
      address_title_case, 
      regex(paste0("\\b", state, "\\b"), ignore_case = TRUE), 
      state
    )
  }
  
  return(address_title_case)
}


all_farms_ordered <- 
  all_farms_combine %>% 
  select(all_of(col_ordering)) %>% 
  mutate(state = ifelse(state == "virginia", "VA", "ND")) %>% 
  group_by(primary_applicant_name) %>% 
  arrange(desc(selection_status), state, primary_applicant_name, farm_number) %>% 
  ungroup() %>% 
  distinct()

# Dynamically identify columns that are likely address columns
address_columns <- names(all_farms_ordered)[grepl("address", names(all_farms_ordered), ignore.case = TRUE)]

# Apply the function to all identified address columns
all_farms_ordered[address_columns] <- lapply(all_farms_ordered[address_columns], function(column) {
  sapply(column, to_title_case_except_states)
})

# Custom function to format names
format_name <- function(name) {
  # Replace underscores with spaces
  name <- str_replace_all(name, "_", " ")
  
  # Convert the name to title case
  name <- str_to_title(name)
  
  # Handle the special case for "de" prefix
  name <- str_replace_all(name, "\\bDe\\b", "de")
  
  return(name)
}

# Custom function to replace full state names with abbreviations
convert_state_abbreviation <- function(address) {
  # Define a named vector of state abbreviations
  state_names <- c(
    "Alabama" = "AL", "Alaska" = "AK", "Arizona" = "AZ", "Arkansas" = "AR", 
    "California" = "CA", "Colorado" = "CO", "Connecticut" = "CT", "Delaware" = "DE", 
    "Florida" = "FL", "Georgia" = "GA", "Hawaii" = "HI", "Idaho" = "ID", 
    "Illinois" = "IL", "Indiana" = "IN", "Iowa" = "IA", "Kansas" = "KS", 
    "Kentucky" = "KY", "Louisiana" = "LA", "Maine" = "ME", "Maryland" = "MD", 
    "Massachusetts" = "MA", "Michigan" = "MI", "Minnesota" = "MN", "Mississippi" = "MS", 
    "Missouri" = "MO", "Montana" = "MT", "Nebraska" = "NE", "Nevada" = "NV", 
    "New Hampshire" = "NH", "New Jersey" = "NJ", "New Mexico" = "NM", "New York" = "NY", 
    "North Carolina" = "NC", "North Dakota" = "ND", "Ohio" = "OH", "Oklahoma" = "OK", 
    "Oregon" = "OR", "Pennsylvania" = "PA", "Rhode Island" = "RI", "South Carolina" = "SC", 
    "South Dakota" = "SD", "Tennessee" = "TN", "Texas" = "TX", "Utah" = "UT", 
    "Vermont" = "VT", "Virginia" = "VA", "Washington" = "WA", "West Virginia" = "WV", 
    "Wisconsin" = "WI", "Wyoming" = "WY"
  )
  
  # Replace full state names with abbreviations
  for (state in names(state_names)) {
    address <- str_replace_all(address, regex(paste0("\\b", state, "\\b"), ignore_case = TRUE), state_names[state])
  }
  
  return(address)
}

# Custom function to capitalize directional calls with specific pattern
capitalize_directionals <- function(address) {
  # Define a vector of directional calls
  directionals <- c("NE", "NW", "SE", "SW", "N", "E", "S", "W")
  
  # Capitalize directional calls only if they have a space before and a comma or space after
  for (dir in directionals) {
    address <- str_replace_all(address, regex(paste0("(?<=\\s)", dir, "(?=\\s|,)"), ignore_case = TRUE), toupper(dir))
  }
  
  return(address)
}

all_farms_ordered_interim <- 
  all_farms_ordered %>% 
  rename(county = counties) %>% 
  mutate(across(contains("name", ignore.case = TRUE), ~ sapply(., format_name))) %>% 
  mutate(across(contains("address", ignore.case = TRUE), ~ sapply(., convert_state_abbreviation))) %>% 
  mutate(across(contains("address", ignore.case = TRUE), ~ sapply(., capitalize_directionals)))

all_farms_ordered_final <-
  all_farms_ordered_interim %>% 
  group_by(primary_applicant_name) %>%
  mutate(selection_status = ifelse(is.na(selection_status), "Not Recommended", selection_status)) %>%
  group_by(primary_applicant_name, farm_number) %>% 
  # identify where the sum of practice acres and animal units does not total the farm total units, and flag with 1 if not equal
  mutate(flag_for_review = ifelse(round(sum(farm_practice_units, farm_practice_animals, na.rm = TRUE)) != round(farm_totalunits), 1, 0)) %>%
  arrange(desc(selection_status), state, primary_applicant_name, farm_number) %>% 
  ungroup()
  

all_farms_ordered_final %>%  
  select(flag_for_review, selection_status, entry_id, state, county,lrp_sdp_priority_flag,  everything()) %>% 
  head()
# A tibble: 6 × 37
  flag_for_review selection_status entry_id state county   lrp_sdp_priority_flag
            <dbl> <chr>               <int> <chr> <chr>                    <int>
1               0 selected             3883 ND    Rolette…                     0
2               0 selected             3883 ND    Rolette…                     0
3               0 selected             2547 ND    Eastern…                     0
4               0 selected             1208 ND    Stark C…                     0
5               0 selected             2878 ND    Stark C…                     0
6               0 selected             2878 ND    Stark C…                     0
# ℹ 31 more variables: primary_applicant_name <chr>,
#   primary_applicant_address_full_address <chr>,
#   primary_applicant_email <chr>, primary_applicant_phone <chr>,
#   hup_primary_any <dbl>, limited_resource_producer <dbl>,
#   socially_disadvantaged_producer <dbl>, farm_number <dbl>,
#   farm_address <chr>, farm_fsa <chr>, farm_tractnum <chr>,
#   farm_commodity <chr>, farm_practice <chr>, has_farm_enhancement <chr>, …
Code
write.csv(all_farms_ordered_final, 
          file = paste0(data_path("vt_alliance/selection_files/all_farms_ordered_final_updated_"),
                        today_date, ".csv"),
          row.names = FALSE)

7 Generate summary of output for selected thresholds

In order to have a credible research design to evaluate the impacts of this funding, we cannot fund all applications in this round. Nor can we choose to fund all applicants but reduce the amount. Either of these options will render future impact evaluations for this program invalid.

We will need a subset of applications that are not selected – at complete random – to have a credible comparison group (called a “counterfactual”). These people will still be eligible for selection in future periods if they choose to re-enter the pool.

Given our understanding of the various target we seek to achieve and the multiple application periods, we propose the following thresholds, which would result in roughly the following amounts:

(Note unless otherwise indicated by the column applications, all numbers refer to producer counts rather than applications.)

Code
generate_summary_table <- function(df, state_input){
  # Step 3: Generate a summary output table
  state = trimws(gsub("", "_", tolower(state_input)))

  summary_table <- 
    df %>%
    filter(state == state_input,
           selection_status == "selected") %>% 
    group_by(hup_any) %>% 
    summarise(
      state = state,  # Display the state as a constant column
      threshold_used = as.character(state_fraction),
      budget_est = sum(budget_application, na.rm = TRUE),
      total_acres_au = sum(total_acres, na.rm = TRUE),
      budget_est = sum(budget_application, na.rm = TRUE),
      total_applications = n(),
      total_producers = sum(total_applicants, na.rm = TRUE),
      total_hup = sum(hup_any, na.rm = TRUE),
      total_lrp = sum(limited_resource_producer, na.rm = TRUE),
      total_sdp = sum(socially_disadvantaged_producer, na.rm = TRUE),
    ) %>% 
    mutate(hup_any = as.character(hup_any)) %>% 
    distinct()
  
  return(summary_table)
}

# Generate summary tables for different thresholds
nd_75 <- generate_summary_table(df = applications_selection_status,
                         state_input = "north_dakota") %>% 
  adorn_totals("row") %>% 
  mutate(across(where(is.numeric), ~comma(.)))

nd_75 %>%  kable(title = "Summary of Budget Implications for ND By HUP Status")
hup_any state threshold_used budget_est total_acres_au total_applications total_producers total_hup total_lrp total_sdp
0 north_dakota 0.75 2,904,560 46,372 181 210 0 0 0
1 north_dakota 0.75 2,624,776 30,610 152 168 152 1 11
Total - - 5,529,336 76,981 333 378 152 1 11
Code
va_50 <- 
 generate_summary_table(df = applications_selection_status,
                         state_input = "virginia") %>% 
  adorn_totals("row") %>% 
  mutate(across(where(is.numeric), ~comma(.)))

va_50 %>% kable(title = "Summary of Budget Implications for Virginia By HUP Status")
hup_any state threshold_used budget_est total_acres_au total_applications total_producers total_hup total_lrp total_sdp
0 virginia 0.5 1,395,590 24,676 103 105 0 0 0
1 virginia 0.5 1,620,825 19,411 166 181 166 14 28
Total - - 3,016,415 44,087 269 286 166 14 28
Code
nd_75_noformat <- generate_summary_table(df = applications_selection_status,
                         state_input = "north_dakota") %>% adorn_totals("row") 

va_50_noformat <- generate_summary_table(df = applications_selection_status,
                         state_input = "virginia") %>%
  adorn_totals("row") 

# Sum all rows but the ones labeled Total
totals <- 
  rbind(va_50_noformat, nd_75_noformat) %>% 
    mutate(state = ifelse(hup_any == "Total", "state subtotal", state),
         hup_any = ifelse(hup_any == "Total", "-", hup_any))  
 
# Step 2: Calculate new totals for rows previously labeled "Total"
total_row <- 
  totals %>%
  filter(state == "state subtotal") %>%
  summarise(across(where(is.numeric), sum, na.rm = TRUE))

# Adding a label for the total row
total_row$state <- "Total"

# Combining the modified data with the new total row
final_data <- 
  bind_rows(totals, total_row) %>% 
  # filter(state == "Total") %>% 
  mutate(across(everything(), ~ifelse(is.na(.), "-", .))) %>% 
  mutate(across(where(is.numeric), ~comma(as.numeric(.))))

final_data %>% kable(caption =
                     "Summary of Proposed Selection Implications by State and HUP Status") %>%
kable_styling(full_width = F, position = "left", bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
Summary of Proposed Selection Implications by State and HUP Status
hup_any state threshold_used budget_est total_acres_au total_applications total_producers total_hup total_lrp total_sdp
0 virginia 0.5 1,395,590 24,676 103 105 0 0 0
1 virginia 0.5 1,620,825 19,411 166 181 166 14 28
- state subtotal - 3,016,415 44,087 269 286 166 14 28
0 north_dakota 0.75 2,904,560 46,372 181 210 0 0 0
1 north_dakota 0.75 2,624,776 30,610 152 168 152 1 11
- state subtotal - 5,529,336 76,981 333 378 152 1 11
- Total - 8,545,751 121,068 602 664 318 15 39

8 Check on share of HUP and SDP selected

The flag here indicates whether an application has any applicant with HUP or SDP status. The numbers in the prior table refer to applicants (of which there may be up to three per application), whereas this table refers to applications where any of up to three applicants have HUP or SDP status.

Code
# Display the summary
flagged_selection_summary %>% kable()
state total_applications total_flagged selected_flagged pct_flagged pct_flagged_selected
north_dakota 445 16 13 0.81 0.03
virginia 510 51 39 0.76 0.08

9 Remaining TO-DO’s

The budget estimates should be considered as exactly that – estimates! There are a few cases where the application files were inconsistent on the acreage within a farm and the total acreage requested, so we anticipate there may be adjustments once the acreage is reviewed/verified by local partners and the producers themselves.

More broadly, as we have continued to evaluate the implications of selection, we have further identified a few more errors that should have been caught in the applications input files before the data file arrived to selection but were not (e.g. inconsistent acreages between columns, a test case making it through to “Recommended” status). As we learn more in this round, we can make improvements that will allow us to move more quickly in subsequent rounds on what types of automatic flags to have before producers are able to submit their files.

10 Manual flags that we had to fix

We will seek to systematize the following checks moving forward:

  1. Sum of practice code animals or acres should equal total farm acres
  2. Practice code animals or acres should not be zero, missing, or negative
  3. Practice code animals or acres should not be greater than total farm acres
  4. Practice code animals or acres should not be greater than 160 per farm
  5. The “has enhancement” column should not be empty when the enhancement is present (or is not NA)
  6. Check to confirm the number of producers in LRP and SDP status per state are selected.