VT Alliance Selection Protocol - Sample

Author

Elinor Benami

Published

May 17, 2024

Code

# Clear the workspace
rm(list = ls())

# load the libraries
library(tidyverse)
library(data.table) # to read in files faster
library(tidylog)
library(janitor)
library(scales) # for comma and formatting
library(knitr) #for kable
library(kableExtra)
library(dtplyr)

# set params ---- 
options(digits = 2) # show two significant digits
download_date <- as.Date("2024-05-01")
today_date <- Sys.Date()
'%nin%' <- function(x, table) !(x %in% table)

## Custom Functions ---
# Define the function to remove columns that are all NA
remove_na_columns <- function(df) {
  # Use select and where to filter out columns that are all NA
  df_selected <- df %>%
    select(where(~ !all(is.na(.))))

  # Return the modified dataframe
  return(df_selected)
}

flag_for_review <- function(df){
  df %>%
    group_by(lastname_entryid, farm_number) %>%
    mutate(
      sum_practice_units = sum(farm_practice_units, farm_practice_animals, na.rm = TRUE),
      over_totalfarm_units = sum_practice_units > farm_totalunits,
      na_count = rowSums(is.na(across(everything()))),
      flag_for_review = FALSE
    ) %>%
    group_by(lastname_entryid, farm_number) %>%
    mutate(
      flag_for_review = ifelse(over_totalfarm_units == TRUE & n() > 1 & na_count == max(na_count), 1, 0)
    ) %>%
    ungroup() %>% 
    select(flag_for_review, na_count, over_totalfarm_units, everything()) 
}

1 Load the Data

The application data is from 2024-05-01.

Code

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Load the data ---
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
data_path <- function(x) paste0(my_file_path, x)

# Read the data, tidy column names, and sort by last name
applications <- 
  fread(data_path(paste0("vt_alliance/application-", download_date,".csv"))) %>% 
  as.data.frame() %>% 
  clean_names() %>% 
  # convert all blank columns to NA's
  na_if("") %>% 
  na_if(" ") %>% 
  # these have an incorrect number of total acres/animal units (using the sum of the two farms instead)
  mutate(total_number_of_acres_or_animal_units = 
           # this entry had plans to enroll 160 but for farm 1 and farm 2 put 0
           ifelse(entry_id == 2324, 160, total_number_of_acres_or_animal_units)) %>%
  group_by(entry_id) %>% 
  # these entries had a total that didn't equal the sum of the two farms; defaulted to sum of two farms
  mutate(total_number_of_acres_or_animal_units = 
           case_when(
             entry_id %in% c(3851, 1853) ~
               sum(acres_or_animal_units_for_farm_number_1, 
                   acres_or_animal_units_for_farm_number_2, na.rm = TRUE),
             TRUE ~ total_number_of_acres_or_animal_units
  )) %>% 
  # this is an ironistic test account -- it also has also has some subfiles
  filter(entry_id %nin% c(2450, 2453, 2457))  %>% 
    mutate(
    primary_applicant_name = paste(primary_applicant_name_first, primary_applicant_name_last, sep = "_"),
    coapplicant1_name = ifelse(!is.na(co_applicant_number_1_name_first), 
                               paste(co_applicant_number_1_name_first, co_applicant_number_1_name_last, sep = "_"), NA),
    coapplicant2_name = ifelse(!is.na(co_applicant_number_2_name_first), 
                               paste(co_applicant_number_2_name_first, co_applicant_number_2_name_last, sep = "_"), NA), 
    state = trimws(gsub(" ", "_", tolower(state)))
  ) %>% 
  ungroup()  #  filter(primary_applicant_name_last != "Lee123")

2 Generate a file that identifies and exports the farm, commodity, practice, and enhancement data

Code

process_applications <- function(applications, farm_number, included_state, excluded_states) {
  included_state <- gsub(" ", "_", tolower(included_state))
  state_title <- included_state

  if(farm_number == 2){ 
    farm_tractnum = "if_known_please_enter_the_tract_numbers_where_the_practices_will_be_installed"
    farm_practice = sym(paste0(included_state, "_planned_conservation_practices_practice_3"))
    farm_practice_units = paste0(included_state, "_planned_conservation_practices_number_of_acres_you_plan_to_enroll_2")
    farm_practice_animals = paste0(included_state, "_planned_conservation_practices_number_of_animals_you_plan_to_enroll_2")
    farm_enhancement_yesno = sym(paste0(included_state, "_planned_conservation_practices_add_practice_enhancement_optional_2"))
  } else if (farm_number == 1){
    farm_tractnum = sym(paste0("farm_number_", farm_number, "_tract_numbers"))
    farm_practice = sym(paste0(included_state, "_planned_conservation_practices_practice"))
    farm_practice_units = paste0(included_state, "_planned_conservation_practices_number_of_acres_you_plan_to_enroll")
    farm_practice_animals = paste0(included_state, "_planned_conservation_practices_number_of_animals_you_plan_to_enroll")
    farm_enhancement_yesno = sym(paste0(included_state, "_planned_conservation_practices_add_practice_enhancement_optional"))
  } else{
    print("Please enter a valid farm number (1 or 2 only)")
  }
  
  # Construct column names dynamically and force evaluation
  rename_list <- list(
    farm_commodity = sym(paste0(included_state, 
                                "_planned_conservation_practices_residue_and_tillage_management_no_till_enhancements_optional_", farm_number * 2)),
    farm_fsa = sym(paste0("please_enter_the_fsa_number_for_farm_number_", farm_number)),
    farm_tractnum = farm_tractnum,
    farm_practice = farm_practice,
    farm_practice_units = farm_practice_units,
    farm_practice_animals = farm_practice_animals,
    farm_totalunits = sym(paste0("acres_or_animal_units_for_farm_number_", farm_number)),
    farm_enhancement_yesno = farm_enhancement_yesno,
    farm_enhancement_yesno2 = sym(paste0(included_state, "_planned_conservation_practices_add_practice_enhancement_optional_yes"))
  )
  
  # Use rename with !!! to splice the list
  renamed_columns <- applications %>% rename(!!!rename_list)
  
  # Adjusting selection and filtering according to the input parameters
  processed_data <- renamed_columns %>%
    select(entry_id, state, contains("name"), contains(c("farm", "enhancements", "pdf")),
           -contains(c("name_middle", "prefix", "suffix"))) %>%
    mutate(
      lastname_entryid = ifelse(!is.na(primary_applicant_name_last), 
                                paste0(primary_applicant_name_last, "_", entry_id), NA_character_)
    ) %>%
    fill(lastname_entryid, contains(c("primary_applicant")), state, .direction = "down") %>%
    select(lastname_entryid, everything(), contains("enhancements"), pdf_alliance_application) %>%
    filter(state == state_title) %>%
    select(-contains(excluded_states)) %>%
    group_by(lastname_entryid) %>%
    fill(contains(c("applicant")), .direction = "down") %>% 
    ungroup() %>% 
    remove_na_columns()
  
  # Additional processing
  long2 <- 
    processed_data %>%
    mutate(
      primary_applicant_name = paste(primary_applicant_name_first, primary_applicant_name_last, sep = "_"),
      coapplicant1_name = ifelse(!is.na(co_applicant_number_1_name_first),
                                 paste(co_applicant_number_1_name_first, co_applicant_number_1_name_last, sep = "_"), NA),
      coapplicant2_name = ifelse(!is.na(co_applicant_number_2_name_first),
                                 paste(co_applicant_number_2_name_first, co_applicant_number_2_name_last, sep = "_"), NA)
    ) %>%
    pivot_longer(
      cols = contains("enhancements"),
      names_to = c("prefix", "enhancement_category"),
      names_sep = paste0(included_state, "_planned_conservation_practices_"),
      values_to = "farm_enhancement"
    ) %>%
    select(-contains(c("prefix", "category", "name_first", "name_last", "entry_id"))) %>%
    select(lastname_entryid, state, primary_applicant_name, coapplicant1_name, coapplicant2_name, 
           farm_commodity, contains("farm_practice"), everything()) %>%
    filter(rowSums(!is.na(select(., -c(1:3)))) > 0) %>%
    group_by(lastname_entryid) %>%
    fill(contains(c("fsa", "totalunits", 
                    "tractnum",
                    "pdf")), .direction = "down") %>%
    ungroup() %>%
    distinct()
  
  # Filter and reorganize
  long2 %>%
    mutate(
      completeness = rowSums(!is.na(select(., -lastname_entryid))),
      # filter_flag = round(farm_practice_units) == round(farm_totalunits),
      has_farm_enhancement = trimws(farm_enhancement_yesno),
      drop_flag = ifelse(has_farm_enhancement == "Yes" & is.na(farm_enhancement), 1, 0)
    ) %>%
    group_by(lastname_entryid) %>%
    filter(drop_flag == 0 | is.na(drop_flag),
           !is.na(farm_commodity)) %>%
    # mutate_add a flag for the farm number as input
    mutate(farm_number = farm_number) %>% 
    select(lastname_entryid, state,
           primary_applicant_name, 
           contains("coapplicant"),
           farm_number,
           farm_fsa, 
           farm_tractnum,
           farm_commodity,
           contains("farm_practice"), farm_totalunits, has_farm_enhancement, farm_enhancement,
           contains("pdf"), completeness)
}
# For Farm 1 in Virginia, exclude North Dakota, Arkansas, and Minnesota
va_farm1 <- process_applications(applications, farm_number = 1, "Virginia", c("north_dakota", "arkansas", "minnesota"))
va_farm2 <- process_applications(applications, farm_number = 2, "Virginia", c("north_dakota", "arkansas", "minnesota"))
va_farms_total <- bind_rows(va_farm1, va_farm2) %>% arrange(lastname_entryid)

nd_farm1 <- process_applications(applications, farm_number = 1, "North Dakota", c("virginia", "arkansas", "minnesota"))
nd_farm2 <- process_applications(applications, farm_number = 2, "North Dakota", c("virginia", "arkansas", "minnesota"))
nd_farms_total <- bind_rows(nd_farm1, nd_farm2) %>% arrange(lastname_entryid)

combine_nd_va <- bind_rows(va_farms_total, nd_farms_total) %>% arrange(state, lastname_entryid)
write.csv(combine_nd_va, data_path(paste0("vt_alliance/combinedfarms_", today_date, ".csv")))

3 View Basic Summary Stats in Advance of Randomization

state	n_applications	acres_or_animal_units
north_dakota	445	81,445
virginia	510	69,588
Total	955	151,033

4 Set the Random Seed and Conduct Selection

Code

# | output: asis

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Here a colleague provides a random seed number to ensure reproducibility ----
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
seed_number <- 20
set.seed(seed_number)

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# 1. Conduct Selection (Simple Version) ---
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Generate Unique ID's for recommended applicants in each state and append a randomized priority
applications <-
  applications %>%
  filter(district_verification_status == "Recommended") %>%
  arrange(primary_applicant_name_last) %>% 
  group_by(state) %>%
  mutate(
    randomized_priority = sample(n(), replace = FALSE)
  ) %>%
  ungroup() %>%
  select(
    randomized_priority, state, entry_id, farm_number_1_address_city, primary_applicant_name_last,
    everything()
  ) %>% 
  arrange(randomized_priority, state, entry_id)

# View the output
applications %>% head(10)

# A tibble: 10 × 612
   randomized_priority state        entry_id farm_number_1_address_city
                 <int> <chr>           <int> <chr>                     
 1                   1 north_dakota     3216 Donnybrook                
 2                   1 virginia         1920 St Stephens Church        
 3                   2 north_dakota     1291 Hazen                     
 4                   2 virginia         2030 Lovingston                
 5                   3 north_dakota     3249 Minot                     
 6                   3 virginia         1705 Kents Store               
 7                   4 north_dakota     2354 Carrington                
 8                   4 virginia          816 New Kent                  
 9                   5 north_dakota     3151 Grassy Butte              
10                   5 virginia         3865 Suffolk City              
# ℹ 608 more variables: primary_applicant_name_last <chr>,
#   primary_applicant_name_prefix <lgl>, primary_applicant_name_first <chr>,
#   primary_applicant_name_middle <lgl>, primary_applicant_name_suffix <lgl>,
#   primary_applicant_email <chr>, primary_applicant_phone <chr>, mobile <chr>,
#   landline <chr>, primary_applicant_address_street_address <chr>,
#   primary_applicant_address_address_line_2 <chr>,
#   primary_applicant_address_city <chr>, …

4.1 Export the Selection

This exports our new application file with the prioritized list for funding based on our randomization.

Code

# Export the list with the ids and priorities to a CSV file ----
fwrite(applications,
       file = data_path(paste0("vt_alliance/selection_files/selection-", today_date,"-final.csv")),
       row.names = FALSE)

5 Evaluate budget, acreage, and producer implications of selection

5.1 How many applications do we have that feature coapplicants?

First we’ll identify all the columns with any text in the last name column as a heuristic for the number of producers. We’ll need this to figure out what our estimated budget is, as applicants differ in the share of the payment requested per farm and whether or not they are eligible for the 25% incremental payment for HUP status.

Code

# | output: asis
# Step 1: Identify columns with that feature last names as our heuristic for distinct producers
identify_applicant_cols <- applications %>% select(ends_with("name_last")) %>% colnames(); identify_applicant_cols

[1] "primary_applicant_name_last"     "co_applicant_number_1_name_last"
[3] "co_applicant_number_2_name_last"

Code

# Step 2: Strip out leading and trailing white space from selected columns and 
# Create new columns with "_flag" suffix indicating whether each cell in the 
# identified cols contains text (1) or not (0), without replacing the original values.
applications <- 
  applications %>%
  mutate(across(all_of(identify_applicant_cols), ~str_squish(.))) %>%
  mutate(across(all_of(identify_applicant_cols), ~if_else(is.na(.x) | .x == "", 0, 1), .names = "{.col}_flag")) %>% 
  # generate a column with the total number of applicants per application
  mutate(total_applicants = rowSums(select(., ends_with("_flag"))))

applications %>%
  tabyl(total_applicants) %>% 
  kable(
    digits = c(1,1,2),
    caption = "Distribution of Applications with Multiple Applicants"
    )

Distribution of Applications with Multiple Applicants
total_applicants	n	percent
1	896	0.94
2	33	0.03
3	26	0.03

The majority of applications have only one applicant, but there are a few with multiple applicants. The total number of applicants is the “total_applicants” times the number of applications with that number of applicants.

5.2 How many total applicants do we have?

Code

# | output: asis

# Step 3: Generate a total count of the number of times we have text in each of those columns by state --- 
total_producers_by_state <- applications %>%
  select(ends_with("_flag"), state) %>%
  group_by(state) %>%
  summarise(across(everything(), ~sum(. == 1, na.rm = TRUE))) %>%
  # Convert to long format for ease of viewing
  pivot_longer(cols = -state, names_to = "category", values_to = "value") %>%
  arrange(desc(state), desc(value)) %>%
  adorn_totals("row")

total_producers_by_state %>% 
  kable(
    caption = "Number of Applicants by State"
    )

Number of Applicants by State
state	category	value
virginia	primary_applicant_name_last_flag	510
virginia	co_applicant_number_1_name_last_flag	24
virginia	co_applicant_number_2_name_last_flag	7
north_dakota	primary_applicant_name_last_flag	445
north_dakota	co_applicant_number_1_name_last_flag	32
north_dakota	co_applicant_number_2_name_last_flag	22
Total	-	1040

Code

total_producers <- total_producers_by_state %>% filter(state == "Total") %>% pull(value)
total_producers_va <- total_producers_by_state %>% filter(state == "virginia") %>% summarise(n = sum(value)) %>% pull(n)
total_producers_nd <- total_producers_by_state %>% filter(state == "north_dakota") %>%  summarise(n= sum(value)) %>% pull(n)

5.3 Generate counts of and flags for historically underserved producers

First we will identify all the columns that relate to our HUP flags and determine who they apply to (primary applicant, the first co-applicant, indicated with “_2”, or the second co-applicant, indicated with “_3”).

Code

# Step 1: Identify columns that classify as HUP
define_hup <- 
  applications %>%
  select(matches("limited_resource_producer|small_producer|beginning_producer|women|
                 female_owned_operation|socially_disadvantaged_producer|veteran_producer")) %>%
  colnames(); define_hup

 [1] "beginning_producer"                "small_producer"                   
 [3] "x100_percent_women_run_operation"  "limited_resource_producer"        
 [5] "socially_disadvantaged_producer"   "veteran_producer"                 
 [7] "beginning_producer_2"              "small_producer_2"                 
 [9] "limited_resource_producer_2"       "socially_disadvantaged_producer_2"
[11] "veteran_producer_2"                "beginning_producer_3"             
[13] "small_producer_3"                  "limited_resource_producer_3"      
[15] "socially_disadvantaged_producer_3" "veteran_producer_3"

Code

define_hup_primary <- define_hup %>% str_subset(pattern = "^(?!.*(_2|_3)).*$"); define_hup_primary

[1] "beginning_producer"               "small_producer"                  
[3] "x100_percent_women_run_operation" "limited_resource_producer"       
[5] "socially_disadvantaged_producer"  "veteran_producer"

Code

define_hup_secondary <- define_hup %>%  str_subset(pattern = "_2"); define_hup_secondary

[1] "beginning_producer_2"              "small_producer_2"                 
[3] "limited_resource_producer_2"       "socially_disadvantaged_producer_2"
[5] "veteran_producer_2"

Code

define_hup_tertiary <- define_hup %>%  str_subset(pattern = "_3"); define_hup_tertiary

[1] "beginning_producer_3"              "small_producer_3"                 
[3] "limited_resource_producer_3"       "socially_disadvantaged_producer_3"
[5] "veteran_producer_3"

Code

# Step 2: Strip out leading and trailing white space from selected columns (str_squish)
# Step 3: Convert all cells without text (including NA's) to 0
# Step 4: Add flags for whether any of the columns in the define_hup lists contain text (1) or not (0)
applications <- 
  applications %>%
  mutate(across(all_of(define_hup), 
                ~if_else(is.na(str_squish(.)) | str_squish(.) == "", 0, 1))) %>% 
  mutate(
    hup_sum = rowSums(select(., define_hup)),     # by application
    # by producer
    hup_primary = rowSums(select(., define_hup_primary)),
    hup_secondary = rowSums(select(., define_hup_secondary)),
    hup_tertiary = rowSums(select(., define_hup_tertiary))
  ) %>%
  mutate(
    hup_any = as.numeric(hup_sum >= 1),     # by application
    # by producer
    hup_primary_any = as.numeric(hup_primary >= 1),
    hup_secondary_any = as.numeric(hup_secondary >= 1),
    hup_tertiary_any = as.numeric(hup_tertiary >= 1)
  ) %>%
  select(
    hup_any, hup_primary_any, hup_secondary_any, hup_tertiary_any,
    hup_sum, hup_primary, hup_secondary, hup_tertiary, define_hup, everything()
  ) %>%
  # order from largest to smallest (desc)
  arrange(across(c(hup_sum, hup_any, hup_primary, hup_primary_any, hup_secondary, hup_secondary_any, 
          hup_tertiary, hup_tertiary_any, define_hup), desc))

applications %>% head(10)

# A tibble: 10 × 624
   hup_any hup_primary_any hup_secondary_any hup_tertiary_any hup_sum
     <dbl>           <dbl>             <dbl>            <dbl>   <dbl>
 1       1               1                 0                0       6
 2       1               1                 1                0       6
 3       1               1                 1                1       6
 4       1               1                 1                1       6
 5       1               1                 0                0       5
 6       1               1                 0                0       5
 7       1               1                 0                0       5
 8       1               1                 0                0       4
 9       1               1                 0                0       4
10       1               1                 0                0       4
# ℹ 619 more variables: hup_primary <dbl>, hup_secondary <dbl>,
#   hup_tertiary <dbl>, beginning_producer <dbl>, small_producer <dbl>,
#   x100_percent_women_run_operation <dbl>, limited_resource_producer <dbl>,
#   socially_disadvantaged_producer <dbl>, veteran_producer <dbl>,
#   beginning_producer_2 <dbl>, small_producer_2 <dbl>,
#   limited_resource_producer_2 <dbl>, socially_disadvantaged_producer_2 <dbl>,
#   veteran_producer_2 <dbl>, beginning_producer_3 <dbl>, …

5.3.1 Summarize HUP status by category and state

Code

# Step 5a: Summarize hup status by category and state (a producer may fall into multiple categories) ----
hup_summary <-
  applications %>%
  select(define_hup, state) %>%
  group_by(state) %>%
  summarise(across(everything(), ~sum(. == 1, na.rm = TRUE))) %>%
  ungroup() %>%
  # convert to long format for ease of viewing
  pivot_longer(cols = -state, names_to = "category", values_to = "count") %>%
  arrange(desc(state), desc(count)) %>%
  adorn_totals("row") %>%
  mutate(pct_producers_per_state = case_when(
    state == "virginia" ~ round(count / total_producers_va, 2),
    state == "north_dakota" ~ round(count / total_producers_nd, 2),
    TRUE ~ NA_real_  # Handle other states if needed
  )) %>% 
  # drop if state is blank or na
  filter(!is.na(state) & state != "")


# Step 5b: Generate a summary of hup ANY status by state (by producer) -----
hup_any_per_state <-
  applications %>% 
  select(hup_primary_any, hup_secondary_any, hup_tertiary_any, state) %>% 
  group_by(state) %>%
  summarise(across(everything(), ~sum(. == 1, na.rm = TRUE))) %>%
  ungroup() %>% 
  # convert to long format
  pivot_longer(cols = -state, 
               names_to = "category", 
               values_to = "count") %>% 
  arrange(desc(state), desc(count)) %>% 
  adorn_totals("row")   %>% 
  mutate(pct_producers_per_state = case_when(
    state == "virginia" ~ round(count / total_producers_va,2),
    state == "north_dakota" ~ round(count / total_producers_nd,2),
    TRUE ~ NA_real_  # Handle other states if needed
  )) %>%  # drop if state is blank or na
  filter(!is.na(state) & state != "")

# scales::percent(hup_any_per_state$pct_producers_per_state, accuracy = 0.01)

# View the outputs
hup_summary %>% kable(title = "Counts of HUP status by producer (note a producer may have multiple flags)")

state	category	count	pct_producers_per_state
virginia	small_producer	227	0.42
virginia	beginning_producer	126	0.23
virginia	x100_percent_women_run_operation	58	0.11
virginia	socially_disadvantaged_producer	35	0.06
virginia	veteran_producer	22	0.04
virginia	limited_resource_producer	17	0.03
virginia	small_producer_2	12	0.02
virginia	beginning_producer_2	6	0.01
virginia	socially_disadvantaged_producer_2	5	0.01
virginia	small_producer_3	3	0.01
virginia	beginning_producer_3	2	0.00
virginia	limited_resource_producer_2	0	0.00
virginia	veteran_producer_2	0	0.00
virginia	limited_resource_producer_3	0	0.00
virginia	socially_disadvantaged_producer_3	0	0.00
virginia	veteran_producer_3	0	0.00
north_dakota	small_producer	118	0.24
north_dakota	beginning_producer	95	0.19
north_dakota	x100_percent_women_run_operation	25	0.05
north_dakota	socially_disadvantaged_producer	14	0.03
north_dakota	beginning_producer_2	7	0.01
north_dakota	veteran_producer	5	0.01
north_dakota	small_producer_2	3	0.01
north_dakota	beginning_producer_3	3	0.01
north_dakota	socially_disadvantaged_producer_3	2	0.00
north_dakota	limited_resource_producer	1	0.00
north_dakota	socially_disadvantaged_producer_2	1	0.00
north_dakota	limited_resource_producer_2	0	0.00
north_dakota	veteran_producer_2	0	0.00
north_dakota	small_producer_3	0	0.00
north_dakota	limited_resource_producer_3	0	0.00
north_dakota	veteran_producer_3	0	0.00
Total	-	787	NA

Code

hup_any_per_state %>% 
  kable(title = "Counts when any producer has any HUP status", booktabs = TRUE)

state	category	count	pct_producers_per_state
virginia	hup_primary_any	307	0.57
virginia	hup_secondary_any	18	0.03
virginia	hup_tertiary_any	3	0.01
north_dakota	hup_primary_any	210	0.42
north_dakota	hup_secondary_any	9	0.02
north_dakota	hup_tertiary_any	5	0.01
Total	-	552	NA

Code

n_hup <- hup_any_per_state %>% filter(state == "Total") %>% pull(count)
pct_hup <- n_hup / total_producers; pct_hup

[1] 0.53

Code

scales::percent(pct_hup)

[1] "53%"

The number of applicants that fall into any HUP category is 552 (rscales::percent(pct_hup))`).

5.4 Generate counts of and flags for limited resource producers

First we will identify all the columns that relate to our LRP flags

Code

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# C. Generate a total count of limited resource and socially disadvantaged producers
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
gen_summary <- function(str_pattern) {
  summary <- 
    applications %>%
    select(matches(str_pattern), state) %>%
    group_by(state) %>%
    summarise(across(matches(str_pattern), ~sum(. == 1, na.rm = TRUE))) %>%
    pivot_longer(cols = -state, names_to = "category", values_to = "count") %>%
    arrange(desc(state), desc(count)) %>%
    adorn_totals("row") %>%
    mutate(pct_producers_state = case_when(
      state == "Virginia" ~ round(count / total_producers_va, 2),
      state == "North Dakota" ~ round(count / total_producers_nd, 2),
      TRUE ~ NA_real_  # Handle other states if needed
    )) %>%
    filter(!is.na(state) & state != "")
  
  return(summary)
}

lrp_summary <- gen_summary("limited_resource_producer"); lrp_summary %>% kable()

state	category	count	pct_producers_state
virginia	limited_resource_producer	17	NA
virginia	limited_resource_producer_2	0	NA
virginia	limited_resource_producer_3	0	NA
north_dakota	limited_resource_producer	1	NA
north_dakota	limited_resource_producer_2	0	NA
north_dakota	limited_resource_producer_3	0	NA
Total	-	18	NA

Code

sdp_summary <- gen_summary("socially_disadvantaged_producer"); sdp_summary %>% kable()

state	category	count	pct_producers_state
virginia	socially_disadvantaged_producer	35	NA
virginia	socially_disadvantaged_producer_2	5	NA
virginia	socially_disadvantaged_producer_3	0	NA
north_dakota	socially_disadvantaged_producer	14	NA
north_dakota	socially_disadvantaged_producer_3	2	NA
north_dakota	socially_disadvantaged_producer_2	1	NA
Total	-	57	NA

The number of producers that are in the LRP and SDP categories are:

Code

# Calculate total LRP producers ----
total_lrp_producers <- lrp_summary %>% filter(state == "Total") %>% pull(count)
total_sdp_producers <- sdp_summary %>% filter(state == "Total") %>% pull(count)

# Calculate percentage of LRP producers ----
pct_lrp <- total_lrp_producers / total_producers; pct_lrp

[1] 0.017

Code

pct_sdp <- total_sdp_producers / total_producers; pct_sdp

[1] 0.055

6 Estimate implications on program targets (producers, acreage, budget)

First we need to account for the fact that there are up to two farms per application but up to three applicants, and each person may have their own HUP status. We also have different payment shares per applicant/coapplicant to account for.

Code

applications_slim <- 
  applications %>% 
  # create payment rate columns for each primary, coapplicant, and coapplicant 1
  # should be 125 if hup is 1 for hup_primary, hup_secondary, or hup_tertiary, else 100
  # bonus payments for lrp should be 500 if limited_resource_producer is 1, else 0
  mutate(
      payment_rate_primary = if_else(hup_primary_any == 1, 125, 100),
      payment_rate_coapplicant1 = if_else(hup_secondary_any == 1, 125, 100),
      payment_rate_coapplicant2 = if_else(hup_tertiary_any == 1, 125, 100), 
      bonus_lrp_primary = if_else(limited_resource_producer == 1, 500, 0),
      bonus_lrp_coapplicant1 = if_else(limited_resource_producer_2 == 1, 500, 0),
      bonus_lrp_coapplicant2 = if_else(limited_resource_producer_3 == 1, 500, 0)
  ) %>% 
  # turn NA's to 0's
  mutate_at(vars(contains("share_of_payment_percent")), ~ifelse(is.na(.), 0, ./100)) %>% 
  group_by(entry_id, state) %>%
  mutate(primary_applicant_number_1_share_of_payment_percent = 
           ifelse(total_applicants == 1, 1, primary_applicant_number_1_share_of_payment_percent)) %>% 
  mutate(budget_farm_1 = 
           primary_applicant_number_1_share_of_payment_percent * acres_or_animal_units_for_farm_number_1 * payment_rate_primary + bonus_lrp_primary +
           co_applicant_number_1_share_of_payment_percent * acres_or_animal_units_for_farm_number_1 * payment_rate_coapplicant1 + bonus_lrp_coapplicant1, 
         budget_farm_2 = 
          # if there is a farm 2, calculate its budget
           ifelse(!is.na(acres_or_animal_units_for_farm_number_2),
           primary_applicant_share_of_payment_percent * acres_or_animal_units_for_farm_number_2 * payment_rate_primary +  bonus_lrp_primary +
           co_applicant_number_2_share_of_payment_percent * acres_or_animal_units_for_farm_number_2 * payment_rate_coapplicant2 + bonus_lrp_coapplicant2, 0)
         ) %>%
  mutate(budget_application = sum(budget_farm_1, budget_farm_2, na.rm = TRUE),
         total_acres = sum(acres_or_animal_units_for_farm_number_1, acres_or_animal_units_for_farm_number_2, na.rm = TRUE)) %>%
  ungroup() %>%
  select(randomized_priority, state, entry_id, total_acres, budget_application, total_applicants, budget_farm_1, budget_farm_2, 
         contains("rate"), contains("bonus"),
         everything()) %>% 
  arrange(randomized_priority, state, entry_id, total_acres, budget_application)

6.1 Summary of Sample Selection Implications Across All States

Code

applications_slim %>%
     arrange(randomized_priority, state)  %>% 
     group_by(state) %>% 
     summarize(
       total_acres_au = sum(total_acres, na.rm = TRUE),
       total_producers = sum(total_applicants, na.rm = TRUE),
       total_budget = sum(budget_application, na.rm = TRUE)
     ) %>% 
  adorn_totals("row") %>% 
  mutate(across(where(is.numeric), ~comma(.))) %>%
  kable(caption = "Summary of Selection Implications Across All States, if all were funded (and assuming all animal units are as stated in application)")

Summary of Selection Implications Across All States, if all were funded (and assuming all animal units are as stated in application)
state	total_acres_au	total_producers	total_budget
north_dakota	100,081	499	7,364,194
virginia	81,864	541	5,728,970
Total	181,945	1,040	13,093,163

6.2 Apply selection thresholds to applications

We’ll first flag the applications that meet the priority criteria (LRP and SDP) and select the first 75% of those flagged. Of the remaining applications, we’ll then select 50% from VA and 75% from ND.

The priority flags are in any of these columns:

Code

# Parameters
priority_flags <- 
  applications %>%
  select(matches("limited_resource_producer|socially_disadvantaged_producer")) %>%
  colnames(); priority_flags

[1] "limited_resource_producer"         "socially_disadvantaged_producer"  
[3] "limited_resource_producer_2"       "socially_disadvantaged_producer_2"
[5] "limited_resource_producer_3"       "socially_disadvantaged_producer_3"

Then apply the given thresholds:

Code

# Define the flagged_fraction as 75% for flagged applications
flagged_fraction <- 0.76

# First, select 75% of all ND applicants and 50% of all VA applicants, including both flagged and unflagged applications
applications_selection_status <- 
  applications_slim %>%
  arrange(randomized_priority, state) %>%
  ungroup() %>% 
  group_by(state) %>%
  mutate(
    # Create a flag column that checks if any priority flags are TRUE for each row
    flag = rowSums(across(all_of(priority_flags)) == TRUE) > 0,
    n_state = n(),
    # Dynamically assign state_fraction based on the state
    state_fraction = case_when(
      state == "virginia" ~ 0.5,
      state == "north_dakota" ~ 0.75,
      TRUE ~ 0.75  # Default value for other states
    ),
    # Compute the row number for selection within each state
    row_num_for_selection = row_number(),
    # Select applicants based on the state_fraction
    selection_status = ifelse(!flag & (row_num_for_selection <= (n_state * state_fraction)), "selected", NA_character_)) %>% 
  mutate(
     # Compute the number of flagged applications to select based on flagged_fraction
    total_flagged = sum(flag),
    select_flagged_count = ceiling(total_flagged * flagged_fraction),
    flag_cumulative = cumsum(flag),
    # Mark selected flagged applications
    selection_status = ifelse(flag & flag_cumulative <= select_flagged_count, "selected", selection_status),
  ) %>%
  ungroup() %>% 
  mutate(selection_status = replace_na(selection_status, "not selected"))

# Display the updated dataset
applications_selection_status %>% head(10)

# A tibble: 10 × 642
   randomized_priority state        entry_id total_acres budget_application
                 <int> <chr>           <int>       <dbl>              <dbl>
 1                   1 north_dakota     3216       320               16000 
 2                   1 virginia         1920        14.2              1775 
 3                   2 north_dakota     1291       126               12600 
 4                   2 virginia         2030       131               16375 
 5                   3 north_dakota     3249       320               16000 
 6                   3 virginia         1705       119.               8162.
 7                   4 north_dakota     2354       320               36000 
 8                   4 virginia          816       320               16000 
 9                   5 north_dakota     3151       294.              13450 
10                   5 virginia         3865       194.              20000 
# ℹ 637 more variables: total_applicants <dbl>, budget_farm_1 <dbl>,
#   budget_farm_2 <dbl>,
#   i_have_not_operated_a_farm_or_ranch_or_i_have_operated_my_farm_or_ranch_for_less_than_10_years <chr>,
#   i_have_not_operated_a_farm_or_ranch_or_i_have_operated_my_farm_or_ranch_for_less_than_10_years_2 <lgl>,
#   i_have_not_operated_a_farm_or_ranch_or_i_have_operated_my_farm_or_ranch_for_less_than_10_years_3 <lgl>,
#   payment_rate_primary <dbl>, payment_rate_coapplicant1 <dbl>,
#   payment_rate_coapplicant2 <dbl>, bonus_lrp_primary <dbl>, …

Code

# Add a check to see how many flagged applications are selected by state
flagged_selection_summary <- applications_selection_status %>%
  group_by(state) %>%
  summarize(
    total_applications = n(),
    total_flagged = sum(flag),
    selected_flagged = sum(flag & selection_status == "selected")
  ) %>%
  mutate(pct_flagged = selected_flagged / total_flagged,
         pct_flagged_selected = selected_flagged / total_applications)

# # Display the summary
# flagged_selection_summary %>% kable()

6.3 Export application file with labels for “selected” or “not selected”

Once have finalized the input data and the relevant conversions, we will export the application file with the “selected” and “not selected” columns in the lefthand most column for ease of viewing. The below shows a sample

Code

applications_selection_status_slim <-
  applications_selection_status %>%
  mutate(primary_applicant_name = paste(primary_applicant_name_first, primary_applicant_name_last, sep = "_"),
         coapplicant1_name = ifelse(!is.na(co_applicant_number_1_name_first), 
                               paste(co_applicant_number_1_name_first, co_applicant_number_1_name_last, sep = "_"), NA),
         coapplicant2_name = ifelse(!is.na(co_applicant_number_2_name_first), 
                               paste(co_applicant_number_2_name_first, co_applicant_number_2_name_last, sep = "_"), NA),
         lastname_entryid = paste0(primary_applicant_name_last, "_", entry_id)) %>% 
  select(selection_status, flag, entry_id, primary_applicant_name_last, state, #randomized_priority, 
         matches(".*primary.*name.*"),  matches(".*primary.*address.*"), matches(".*primary.*email.*"), 
         matches(".*primary.*phone.*"), hup_primary_any, contains("counties"),
         matches(".*farm.*address.*"),   matches(".*coapplicant.*name.*"),  matches(".*co_applicant.*address.*"),
         matches(".*co_applicant.*email.*"), matches(".*co_applicant.*phone.*"), contains("share_of_payment"),
         total_acres, budget_application, contains("zip"), contains("virginia_counties"), contains("north_dakota_counties"), contains("name"), define_hup, contains("hup"), are_you_interested_in_using_comet_farm,
         # contains("comet"),
         pdf_alliance_application) %>% 
  select(-contains(c("name_middle", "prefix", "suffix", "name_first", "name_last", "minnesota", "arkansas"))) %>% 
  select(selection_status, flag, entry_id, state, primary_applicant_address_zip_postal_code, contains("counties"), everything())

head(applications_selection_status_slim)

# A tibble: 6 × 81
  selection_status flag  entry_id state        primary_applicant_address_zip_p…¹
  <chr>            <lgl>    <int> <chr>        <chr>                            
1 selected         FALSE     3216 north_dakota 58734                            
2 selected         FALSE     1920 virginia     23148                            
3 selected         FALSE     1291 north_dakota 58545                            
4 selected         FALSE     2030 virginia     27519                            
5 selected         FALSE     3249 north_dakota 58701                            
6 selected         FALSE     1705 virginia     23084                            
# ℹ abbreviated name: ¹primary_applicant_address_zip_postal_code
# ℹ 76 more variables: north_dakota_counties <chr>, virginia_counties <chr>,
#   north_dakota_counties_2 <chr>, virginia_counties_2 <chr>,
#   primary_applicant_name <chr>,
#   primary_applicant_address_street_address <chr>,
#   primary_applicant_address_address_line_2 <chr>,
#   primary_applicant_address_city <chr>, …

Code

# fwrite(applications_selection_status_slim,
#        file = data_path(paste0("vt_alliance/selection_files/selection_run-",
#                                today_date,"-final-seed-", seed_number,".csv")), row.names = FALSE)

6.4 Merge in the selection data with farm/crop/practice/enhancement information

Code

# Merge in the selection data with the farm/crop/practice/enhancement information
check <- 
  applications_selection_status_slim %>%
  full_join(combine_nd_va, by = c("lastname_entryid", "state", "primary_applicant_name", "coapplicant1_name", "coapplicant2_name", "pdf_alliance_application")) %>% 
  # full_join(va_farms_total_flag, by = c("lastname_entryid", "pdf_alliance_application",) %>% 
  mutate(flag = as.integer(as.logical(flag)),
         counties = coalesce(north_dakota_counties, virginia_counties)) %>% 
  rename(lrp_sdp_priority_flag = flag,
         primary_applicant_number_2_share_of_payment_percent =  primary_applicant_share_of_payment_percent) %>% 
  select(-contains(c
                   ("counties_2", "virginia_counties", "north_dakota_counties", "small", "women", "veteran",
                     "beginning")), 
         -primary_applicant_address_country, 
         -farm_number_1_address_country, -farm_number_2_address_country, -contains("country"), -representative_name, everything()) %>% 
  mutate(farm_number_1_address_state_province = ifelse(state == "virginia", "VA", "ND"),
         farm_number_2_address_state_province = ifelse(state == "virginia", "VA", "ND"))


# Function to concatenate address parts
concatenate_address <- function(street, line2, city, state, zip) {
  address <- paste0(
    street, ", ",
    ifelse(is.na(line2) | line2 == "", "", paste0(line2, " ")),
    city, ", ",
    state, " ",
    zip
  )
  return(address)
}

# List of address prefixes to process
address_prefixes <- c("farm_number_1_address", "farm_number_2_address", 
                      "primary_applicant_address", "co_applicant_number_1_address",  "co_applicant_number_2_address")

# Function to create full address columns
create_full_address <- function(df, prefix) {
  df %>%
    mutate("{prefix}_full_address" := pmap_chr(list(
      df[[paste0(prefix, "_street_address")]],
      df[[paste0(prefix, "_address_line_2")]],
      df[[paste0(prefix, "_city")]],
      df[[paste0(prefix, "_state_province")]],
      df[[paste0(prefix, "_zip_postal_code")]]
    ), concatenate_address))
}

# Apply the function to each prefix and reduce the data frame
addy_check <- 
  reduce(address_prefixes, create_full_address, .init = check)  %>%
  select(-matches("address"), contains("full_address")) %>% 
  select(-contains(c("completeness", "small", "women", "veteran","beginning", "representative", 
                     "virginia_counties", "north_dakota_counties")))

farm1_details <- 
  addy_check %>% 
  filter(farm_number == 1) %>% 
  select(-contains(c("farm_number_2", "coapplicant2", "co_applicant_number_2", "producer_3", "tertiary",
                     "hup_sum", "primary_applicant_number_2_share_of_payment_percent")))

farm2_details <- 
  addy_check %>% 
  filter(farm_number == 2) %>% 
  select(-contains(c("farm_number_1", "coapplicant1", "co_applicant_number_1", "producer_2", "secondary",
                     "hup_sum", "primary_applicant_number_1_share_of_payment_percent")))
  
all_farm_details <- 
  bind_rows(farm1_details, farm2_details) %>% 
    select(-contains("pdf"), everything(), contains("pdf"))

# Combine co_applicant_number_1 and co_applicant_number_2 related columns using coalesce
all_farms_combine <- all_farm_details %>%
  mutate(
    coapplicant_name = coalesce(coapplicant1_name, coapplicant2_name),
    co_applicant_email = coalesce(co_applicant_number_1_email, co_applicant_number_2_email),
    co_applicant_phone = coalesce(co_applicant_number_1_phone, co_applicant_number_2_phone),
    co_applicant_share_of_payment_percent = coalesce(co_applicant_number_1_share_of_payment_percent, co_applicant_number_2_share_of_payment_percent),
    primary_applicant_share_of_payment_percent = coalesce(primary_applicant_number_1_share_of_payment_percent, primary_applicant_number_2_share_of_payment_percent), 
    farm_address = coalesce(farm_number_1_address_full_address, farm_number_2_address_full_address),
    coapplicant_address = coalesce(co_applicant_number_1_address_full_address, co_applicant_number_2_address_full_address),
    hup_coapplicant = coalesce(hup_secondary_any, hup_tertiary_any),
    socially_disadvantaged_coapplicant = coalesce(socially_disadvantaged_producer_2, socially_disadvantaged_producer_3),
    limited_resource_coapplicant = coalesce(limited_resource_producer_2, limited_resource_producer_3)
  ) %>%
  select(
    -coapplicant1_name, -coapplicant2_name,
    -co_applicant_number_1_email, -co_applicant_number_2_email,
    -co_applicant_number_1_phone, -co_applicant_number_2_phone,
    -co_applicant_number_1_share_of_payment_percent, -co_applicant_number_2_share_of_payment_percent,
    -primary_applicant_number_1_share_of_payment_percent, -primary_applicant_number_2_share_of_payment_percent, 
    -farm_number_1_address_full_address, -farm_number_2_address_full_address,
    -co_applicant_number_1_address_full_address, -co_applicant_number_2_address_full_address,
    -hup_primary, -hup_secondary, -hup_tertiary, 
    -socially_disadvantaged_producer_2, -socially_disadvantaged_producer_3,
    -hup_secondary_any, -hup_tertiary_any,
    -limited_resource_producer_2, -limited_resource_producer_3
  ) %>% 
  arrange(desc(selection_status), state, primary_applicant_name, entry_id, coapplicant_name) 
#%>% 
  # filter(!is.na(entry_id))

col_ordering <- 
  c(
  "selection_status", 
  "entry_id", 
  # "lastname_entryid", 
  "state", 
  "primary_applicant_name", 
  "primary_applicant_address_full_address", 
  "primary_applicant_email", 
  "primary_applicant_phone", 
  # underserved status for primary applicant
  "lrp_sdp_priority_flag", 
  "hup_primary_any", 
  "limited_resource_producer", 
  "socially_disadvantaged_producer", 
  # farm information
  "farm_number", 
  "counties", 
  "farm_address", 
  "farm_fsa", 
  "farm_tractnum", 
  "farm_commodity", 
  "farm_practice", 
  "has_farm_enhancement", 
  "farm_enhancement", 
  "farm_practice_units",
  "farm_practice_animals", 
  "farm_totalunits", 
  "total_acres", 
  
  "coapplicant_name", 
  "coapplicant_address", 
  "co_applicant_email", 
  "co_applicant_phone", 
  "limited_resource_coapplicant", 
  "socially_disadvantaged_coapplicant", 
  "hup_coapplicant",

  "budget_application", 
  "primary_applicant_share_of_payment_percent", 
  "co_applicant_share_of_payment_percent", 
  
  # "hup_any", # on the fence about whether to include this
  "are_you_interested_in_using_comet_farm",
  "pdf_alliance_application"
)

# Custom function to convert text to title case except for state abbreviations
to_title_case_except_states <- function(address) {
  # Define a vector of state abbreviations
  state_abbreviations <- c(
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", 
    "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", 
    "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", 
    "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", 
    "VT", "VA", "WA", "WV", "WI", "WY"
  )
  
  # Convert the address to title case
  address_title_case <- str_to_title(address)
  
  # Replace state abbreviations with uppercase versions
  for (state in state_abbreviations) {
    address_title_case <- str_replace_all(
      address_title_case, 
      regex(paste0("\\b", state, "\\b"), ignore_case = TRUE), 
      state
    )
  }
  
  return(address_title_case)
}


all_farms_ordered <- 
  all_farms_combine %>% 
  select(all_of(col_ordering)) %>% 
  mutate(state = ifelse(state == "virginia", "VA", "ND")) %>% 
  group_by(primary_applicant_name) %>% 
  arrange(desc(selection_status), state, primary_applicant_name, farm_number) %>% 
  ungroup() %>% 
  distinct()

# Dynamically identify columns that are likely address columns
address_columns <- names(all_farms_ordered)[grepl("address", names(all_farms_ordered), ignore.case = TRUE)]

# Apply the function to all identified address columns
all_farms_ordered[address_columns] <- lapply(all_farms_ordered[address_columns], function(column) {
  sapply(column, to_title_case_except_states)
})

# Custom function to format names
format_name <- function(name) {
  # Replace underscores with spaces
  name <- str_replace_all(name, "_", " ")
  
  # Convert the name to title case
  name <- str_to_title(name)
  
  # Handle the special case for "de" prefix
  name <- str_replace_all(name, "\\bDe\\b", "de")
  
  return(name)
}

# Custom function to replace full state names with abbreviations
convert_state_abbreviation <- function(address) {
  # Define a named vector of state abbreviations
  state_names <- c(
    "Alabama" = "AL", "Alaska" = "AK", "Arizona" = "AZ", "Arkansas" = "AR", 
    "California" = "CA", "Colorado" = "CO", "Connecticut" = "CT", "Delaware" = "DE", 
    "Florida" = "FL", "Georgia" = "GA", "Hawaii" = "HI", "Idaho" = "ID", 
    "Illinois" = "IL", "Indiana" = "IN", "Iowa" = "IA", "Kansas" = "KS", 
    "Kentucky" = "KY", "Louisiana" = "LA", "Maine" = "ME", "Maryland" = "MD", 
    "Massachusetts" = "MA", "Michigan" = "MI", "Minnesota" = "MN", "Mississippi" = "MS", 
    "Missouri" = "MO", "Montana" = "MT", "Nebraska" = "NE", "Nevada" = "NV", 
    "New Hampshire" = "NH", "New Jersey" = "NJ", "New Mexico" = "NM", "New York" = "NY", 
    "North Carolina" = "NC", "North Dakota" = "ND", "Ohio" = "OH", "Oklahoma" = "OK", 
    "Oregon" = "OR", "Pennsylvania" = "PA", "Rhode Island" = "RI", "South Carolina" = "SC", 
    "South Dakota" = "SD", "Tennessee" = "TN", "Texas" = "TX", "Utah" = "UT", 
    "Vermont" = "VT", "Virginia" = "VA", "Washington" = "WA", "West Virginia" = "WV", 
    "Wisconsin" = "WI", "Wyoming" = "WY"
  )
  
  # Replace full state names with abbreviations
  for (state in names(state_names)) {
    address <- str_replace_all(address, regex(paste0("\\b", state, "\\b"), ignore_case = TRUE), state_names[state])
  }
  
  return(address)
}

# Custom function to capitalize directional calls with specific pattern
capitalize_directionals <- function(address) {
  # Define a vector of directional calls
  directionals <- c("NE", "NW", "SE", "SW", "N", "E", "S", "W")
  
  # Capitalize directional calls only if they have a space before and a comma or space after
  for (dir in directionals) {
    address <- str_replace_all(address, regex(paste0("(?<=\\s)", dir, "(?=\\s|,)"), ignore_case = TRUE), toupper(dir))
  }
  
  return(address)
}

all_farms_ordered_interim <- 
  all_farms_ordered %>% 
  rename(county = counties) %>% 
  mutate(across(contains("name", ignore.case = TRUE), ~ sapply(., format_name))) %>% 
  mutate(across(contains("address", ignore.case = TRUE), ~ sapply(., convert_state_abbreviation))) %>% 
  mutate(across(contains("address", ignore.case = TRUE), ~ sapply(., capitalize_directionals)))

all_farms_ordered_final <-
  all_farms_ordered_interim %>% 
  group_by(primary_applicant_name) %>%
  mutate(selection_status = ifelse(is.na(selection_status), "Not Recommended", selection_status)) %>%
  group_by(primary_applicant_name, farm_number) %>% 
  # identify where the sum of practice acres and animal units does not total the farm total units, and flag with 1 if not equal
  mutate(flag_for_review = ifelse(round(sum(farm_practice_units, farm_practice_animals, na.rm = TRUE)) != round(farm_totalunits), 1, 0)) %>%
  arrange(desc(selection_status), state, primary_applicant_name, farm_number) %>% 
  ungroup()
  

all_farms_ordered_final %>%  
  select(flag_for_review, selection_status, entry_id, state, county,lrp_sdp_priority_flag,  everything()) %>% 
  head()

# A tibble: 6 × 37
  flag_for_review selection_status entry_id state county   lrp_sdp_priority_flag
            <dbl> <chr>               <int> <chr> <chr>                    <int>
1               0 selected             3883 ND    Rolette…                     0
2               0 selected             3883 ND    Rolette…                     0
3               0 selected             2547 ND    Eastern…                     0
4               0 selected             1208 ND    Stark C…                     0
5               0 selected             2878 ND    Stark C…                     0
6               0 selected             2878 ND    Stark C…                     0
# ℹ 31 more variables: primary_applicant_name <chr>,
#   primary_applicant_address_full_address <chr>,
#   primary_applicant_email <chr>, primary_applicant_phone <chr>,
#   hup_primary_any <dbl>, limited_resource_producer <dbl>,
#   socially_disadvantaged_producer <dbl>, farm_number <dbl>,
#   farm_address <chr>, farm_fsa <chr>, farm_tractnum <chr>,
#   farm_commodity <chr>, farm_practice <chr>, has_farm_enhancement <chr>, …

Code

write.csv(all_farms_ordered_final, 
          file = paste0(data_path("vt_alliance/selection_files/all_farms_ordered_final_updated_"),
                        today_date, ".csv"),
          row.names = FALSE)

7 Generate summary of output for selected thresholds

In order to have a credible research design to evaluate the impacts of this funding, we cannot fund all applications in this round. Nor can we choose to fund all applicants but reduce the amount. Either of these options will render future impact evaluations for this program invalid.

We will need a subset of applications that are not selected – at complete random – to have a credible comparison group (called a “counterfactual”). These people will still be eligible for selection in future periods if they choose to re-enter the pool.

Given our understanding of the various target we seek to achieve and the multiple application periods, we propose the following thresholds, which would result in roughly the following amounts:

(Note unless otherwise indicated by the column applications, all numbers refer to producer counts rather than applications.)

Code

generate_summary_table <- function(df, state_input){
  # Step 3: Generate a summary output table
  state = trimws(gsub("", "_", tolower(state_input)))

  summary_table <- 
    df %>%
    filter(state == state_input,
           selection_status == "selected") %>% 
    group_by(hup_any) %>% 
    summarise(
      state = state,  # Display the state as a constant column
      threshold_used = as.character(state_fraction),
      budget_est = sum(budget_application, na.rm = TRUE),
      total_acres_au = sum(total_acres, na.rm = TRUE),
      budget_est = sum(budget_application, na.rm = TRUE),
      total_applications = n(),
      total_producers = sum(total_applicants, na.rm = TRUE),
      total_hup = sum(hup_any, na.rm = TRUE),
      total_lrp = sum(limited_resource_producer, na.rm = TRUE),
      total_sdp = sum(socially_disadvantaged_producer, na.rm = TRUE),
    ) %>% 
    mutate(hup_any = as.character(hup_any)) %>% 
    distinct()
  
  return(summary_table)
}

# Generate summary tables for different thresholds
nd_75 <- generate_summary_table(df = applications_selection_status,
                         state_input = "north_dakota") %>% 
  adorn_totals("row") %>% 
  mutate(across(where(is.numeric), ~comma(.)))

nd_75 %>%  kable(title = "Summary of Budget Implications for ND By HUP Status")

hup_any	state	threshold_used	budget_est	total_acres_au	total_applications	total_producers	total_hup	total_lrp	total_sdp
0	north_dakota	0.75	2,904,560	46,372	181	210	0	0	0
1	north_dakota	0.75	2,624,776	30,610	152	168	152	1	11
Total	-	-	5,529,336	76,981	333	378	152	1	11

Code

va_50 <- 
 generate_summary_table(df = applications_selection_status,
                         state_input = "virginia") %>% 
  adorn_totals("row") %>% 
  mutate(across(where(is.numeric), ~comma(.)))

va_50 %>% kable(title = "Summary of Budget Implications for Virginia By HUP Status")

hup_any	state	threshold_used	budget_est	total_acres_au	total_applications	total_producers	total_hup	total_lrp	total_sdp
0	virginia	0.5	1,395,590	24,676	103	105	0	0	0
1	virginia	0.5	1,620,825	19,411	166	181	166	14	28
Total	-	-	3,016,415	44,087	269	286	166	14	28

Code

nd_75_noformat <- generate_summary_table(df = applications_selection_status,
                         state_input = "north_dakota") %>% adorn_totals("row") 

va_50_noformat <- generate_summary_table(df = applications_selection_status,
                         state_input = "virginia") %>%
  adorn_totals("row") 

# Sum all rows but the ones labeled Total
totals <- 
  rbind(va_50_noformat, nd_75_noformat) %>% 
    mutate(state = ifelse(hup_any == "Total", "state subtotal", state),
         hup_any = ifelse(hup_any == "Total", "-", hup_any))  
 
# Step 2: Calculate new totals for rows previously labeled "Total"
total_row <- 
  totals %>%
  filter(state == "state subtotal") %>%
  summarise(across(where(is.numeric), sum, na.rm = TRUE))

# Adding a label for the total row
total_row$state <- "Total"

# Combining the modified data with the new total row
final_data <- 
  bind_rows(totals, total_row) %>% 
  # filter(state == "Total") %>% 
  mutate(across(everything(), ~ifelse(is.na(.), "-", .))) %>% 
  mutate(across(where(is.numeric), ~comma(as.numeric(.))))

final_data %>% kable(caption =
                     "Summary of Proposed Selection Implications by State and HUP Status") %>%
kable_styling(full_width = F, position = "left", bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )

Summary of Proposed Selection Implications by State and HUP Status
hup_any	state	threshold_used	budget_est	total_acres_au	total_applications	total_producers	total_hup	total_lrp	total_sdp
0	virginia	0.5	1,395,590	24,676	103	105	0	0	0
1	virginia	0.5	1,620,825	19,411	166	181	166	14	28
-	state subtotal	-	3,016,415	44,087	269	286	166	14	28
0	north_dakota	0.75	2,904,560	46,372	181	210	0	0	0
1	north_dakota	0.75	2,624,776	30,610	152	168	152	1	11
-	state subtotal	-	5,529,336	76,981	333	378	152	1	11
-	Total	-	8,545,751	121,068	602	664	318	15	39

8 Check on share of HUP and SDP selected

The flag here indicates whether an application has any applicant with HUP or SDP status. The numbers in the prior table refer to applicants (of which there may be up to three per application), whereas this table refers to applications where any of up to three applicants have HUP or SDP status.

Code

# Display the summary
flagged_selection_summary %>% kable()

state	total_applications	total_flagged	selected_flagged	pct_flagged	pct_flagged_selected
north_dakota	445	16	13	0.81	0.03
virginia	510	51	39	0.76	0.08

9 Remaining TO-DO’s

The budget estimates should be considered as exactly that – estimates! There are a few cases where the application files were inconsistent on the acreage within a farm and the total acreage requested, so we anticipate there may be adjustments once the acreage is reviewed/verified by local partners and the producers themselves.

More broadly, as we have continued to evaluate the implications of selection, we have further identified a few more errors that should have been caught in the applications input files before the data file arrived to selection but were not (e.g. inconsistent acreages between columns, a test case making it through to “Recommended” status). As we learn more in this round, we can make improvements that will allow us to move more quickly in subsequent rounds on what types of automatic flags to have before producers are able to submit their files.

10 Manual flags that we had to fix

We will seek to systematize the following checks moving forward:

Sum of practice code animals or acres should equal total farm acres
Practice code animals or acres should not be zero, missing, or negative
Practice code animals or acres should not be greater than total farm acres
Practice code animals or acres should not be greater than 160 per farm
The “has enhancement” column should not be empty when the enhancement is present (or is not NA)
Check to confirm the number of producers in LRP and SDP status per state are selected.