# Clear the workspacerm(list =ls())# load the librarieslibrary(tidyverse)library(data.table) # to read in files fasterlibrary(tidylog)library(janitor)library(scales) # for comma and formattinglibrary(knitr) #for kablelibrary(kableExtra)library(dtplyr)# set params ---- options(digits =2) # show two significant digitsdownload_date <-as.Date("2024-05-01")today_date <-Sys.Date()'%nin%'<-function(x, table) !(x %in% table)## Custom Functions ---# Define the function to remove columns that are all NAremove_na_columns <-function(df) {# Use select and where to filter out columns that are all NA df_selected <- df %>%select(where(~!all(is.na(.))))# Return the modified dataframereturn(df_selected)}flag_for_review <-function(df){ df %>%group_by(lastname_entryid, farm_number) %>%mutate(sum_practice_units =sum(farm_practice_units, farm_practice_animals, na.rm =TRUE),over_totalfarm_units = sum_practice_units > farm_totalunits,na_count =rowSums(is.na(across(everything()))),flag_for_review =FALSE ) %>%group_by(lastname_entryid, farm_number) %>%mutate(flag_for_review =ifelse(over_totalfarm_units ==TRUE&n() >1& na_count ==max(na_count), 1, 0) ) %>%ungroup() %>%select(flag_for_review, na_count, over_totalfarm_units, everything()) }
1 Load the Data
The application data is from 2024-05-01.
Code
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%# Load the data ---# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%data_path <-function(x) paste0(my_file_path, x)# Read the data, tidy column names, and sort by last nameapplications <-fread(data_path(paste0("vt_alliance/application-", download_date,".csv"))) %>%as.data.frame() %>%clean_names() %>%# convert all blank columns to NA'sna_if("") %>%na_if(" ") %>%# these have an incorrect number of total acres/animal units (using the sum of the two farms instead)mutate(total_number_of_acres_or_animal_units =# this entry had plans to enroll 160 but for farm 1 and farm 2 put 0ifelse(entry_id ==2324, 160, total_number_of_acres_or_animal_units)) %>%group_by(entry_id) %>%# these entries had a total that didn't equal the sum of the two farms; defaulted to sum of two farmsmutate(total_number_of_acres_or_animal_units =case_when( entry_id %in%c(3851, 1853) ~sum(acres_or_animal_units_for_farm_number_1, acres_or_animal_units_for_farm_number_2, na.rm =TRUE),TRUE~ total_number_of_acres_or_animal_units )) %>%# this is an ironistic test account -- it also has also has some subfilesfilter(entry_id %nin%c(2450, 2453, 2457)) %>%mutate(primary_applicant_name =paste(primary_applicant_name_first, primary_applicant_name_last, sep ="_"),coapplicant1_name =ifelse(!is.na(co_applicant_number_1_name_first), paste(co_applicant_number_1_name_first, co_applicant_number_1_name_last, sep ="_"), NA),coapplicant2_name =ifelse(!is.na(co_applicant_number_2_name_first), paste(co_applicant_number_2_name_first, co_applicant_number_2_name_last, sep ="_"), NA), state =trimws(gsub(" ", "_", tolower(state))) ) %>%ungroup() # filter(primary_applicant_name_last != "Lee123")
2 Generate a file that identifies and exports the farm, commodity, practice, and enhancement data
3 View Basic Summary Stats in Advance of Randomization
state
n_applications
acres_or_animal_units
north_dakota
445
81,445
virginia
510
69,588
Total
955
151,033
4 Set the Random Seed and Conduct Selection
Code
# | output: asis# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%# Here a colleague provides a random seed number to ensure reproducibility ----# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%seed_number <-20set.seed(seed_number)# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%# 1. Conduct Selection (Simple Version) ---# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%# Generate Unique ID's for recommended applicants in each state and append a randomized priorityapplications <- applications %>%filter(district_verification_status =="Recommended") %>%arrange(primary_applicant_name_last) %>%group_by(state) %>%mutate(randomized_priority =sample(n(), replace =FALSE) ) %>%ungroup() %>%select( randomized_priority, state, entry_id, farm_number_1_address_city, primary_applicant_name_last,everything() ) %>%arrange(randomized_priority, state, entry_id)# View the outputapplications %>%head(10)
# A tibble: 10 × 612
randomized_priority state entry_id farm_number_1_address_city
<int> <chr> <int> <chr>
1 1 north_dakota 3216 Donnybrook
2 1 virginia 1920 St Stephens Church
3 2 north_dakota 1291 Hazen
4 2 virginia 2030 Lovingston
5 3 north_dakota 3249 Minot
6 3 virginia 1705 Kents Store
7 4 north_dakota 2354 Carrington
8 4 virginia 816 New Kent
9 5 north_dakota 3151 Grassy Butte
10 5 virginia 3865 Suffolk City
# ℹ 608 more variables: primary_applicant_name_last <chr>,
# primary_applicant_name_prefix <lgl>, primary_applicant_name_first <chr>,
# primary_applicant_name_middle <lgl>, primary_applicant_name_suffix <lgl>,
# primary_applicant_email <chr>, primary_applicant_phone <chr>, mobile <chr>,
# landline <chr>, primary_applicant_address_street_address <chr>,
# primary_applicant_address_address_line_2 <chr>,
# primary_applicant_address_city <chr>, …
4.1 Export the Selection
This exports our new application file with the prioritized list for funding based on our randomization.
Code
# Export the list with the ids and priorities to a CSV file ----fwrite(applications,file =data_path(paste0("vt_alliance/selection_files/selection-", today_date,"-final.csv")),row.names =FALSE)
5 Evaluate budget, acreage, and producer implications of selection
5.1 How many applications do we have that feature coapplicants?
First we’ll identify all the columns with any text in the last name column as a heuristic for the number of producers. We’ll need this to figure out what our estimated budget is, as applicants differ in the share of the payment requested per farm and whether or not they are eligible for the 25% incremental payment for HUP status.
Code
# | output: asis# Step 1: Identify columns with that feature last names as our heuristic for distinct producersidentify_applicant_cols <- applications %>%select(ends_with("name_last")) %>%colnames(); identify_applicant_cols
# Step 2: Strip out leading and trailing white space from selected columns and # Create new columns with "_flag" suffix indicating whether each cell in the # identified cols contains text (1) or not (0), without replacing the original values.applications <- applications %>%mutate(across(all_of(identify_applicant_cols), ~str_squish(.))) %>%mutate(across(all_of(identify_applicant_cols), ~if_else(is.na(.x) | .x =="", 0, 1), .names ="{.col}_flag")) %>%# generate a column with the total number of applicants per applicationmutate(total_applicants =rowSums(select(., ends_with("_flag"))))applications %>%tabyl(total_applicants) %>%kable(digits =c(1,1,2),caption ="Distribution of Applications with Multiple Applicants" )
Distribution of Applications with Multiple Applicants
total_applicants
n
percent
1
896
0.94
2
33
0.03
3
26
0.03
The majority of applications have only one applicant, but there are a few with multiple applicants. The total number of applicants is the “total_applicants” times the number of applications with that number of applicants.
5.2 How many total applicants do we have?
Code
# | output: asis# Step 3: Generate a total count of the number of times we have text in each of those columns by state --- total_producers_by_state <- applications %>%select(ends_with("_flag"), state) %>%group_by(state) %>%summarise(across(everything(), ~sum(. ==1, na.rm =TRUE))) %>%# Convert to long format for ease of viewingpivot_longer(cols =-state, names_to ="category", values_to ="value") %>%arrange(desc(state), desc(value)) %>%adorn_totals("row")total_producers_by_state %>%kable(caption ="Number of Applicants by State" )
5.3 Generate counts of and flags for historically underserved producers
First we will identify all the columns that relate to our HUP flags and determine who they apply to (primary applicant, the first co-applicant, indicated with “_2”, or the second co-applicant, indicated with “_3”).
Code
# Step 1: Identify columns that classify as HUPdefine_hup <- applications %>%select(matches("limited_resource_producer|small_producer|beginning_producer|women| female_owned_operation|socially_disadvantaged_producer|veteran_producer")) %>%colnames(); define_hup
# Step 2: Strip out leading and trailing white space from selected columns (str_squish)# Step 3: Convert all cells without text (including NA's) to 0# Step 4: Add flags for whether any of the columns in the define_hup lists contain text (1) or not (0)applications <- applications %>%mutate(across(all_of(define_hup), ~if_else(is.na(str_squish(.)) |str_squish(.) =="", 0, 1))) %>%mutate(hup_sum =rowSums(select(., define_hup)), # by application# by producerhup_primary =rowSums(select(., define_hup_primary)),hup_secondary =rowSums(select(., define_hup_secondary)),hup_tertiary =rowSums(select(., define_hup_tertiary)) ) %>%mutate(hup_any =as.numeric(hup_sum >=1), # by application# by producerhup_primary_any =as.numeric(hup_primary >=1),hup_secondary_any =as.numeric(hup_secondary >=1),hup_tertiary_any =as.numeric(hup_tertiary >=1) ) %>%select( hup_any, hup_primary_any, hup_secondary_any, hup_tertiary_any, hup_sum, hup_primary, hup_secondary, hup_tertiary, define_hup, everything() ) %>%# order from largest to smallest (desc)arrange(across(c(hup_sum, hup_any, hup_primary, hup_primary_any, hup_secondary, hup_secondary_any, hup_tertiary, hup_tertiary_any, define_hup), desc))applications %>%head(10)
# Step 5a: Summarize hup status by category and state (a producer may fall into multiple categories) ----hup_summary <- applications %>%select(define_hup, state) %>%group_by(state) %>%summarise(across(everything(), ~sum(. ==1, na.rm =TRUE))) %>%ungroup() %>%# convert to long format for ease of viewingpivot_longer(cols =-state, names_to ="category", values_to ="count") %>%arrange(desc(state), desc(count)) %>%adorn_totals("row") %>%mutate(pct_producers_per_state =case_when( state =="virginia"~round(count / total_producers_va, 2), state =="north_dakota"~round(count / total_producers_nd, 2),TRUE~NA_real_# Handle other states if needed )) %>%# drop if state is blank or nafilter(!is.na(state) & state !="")# Step 5b: Generate a summary of hup ANY status by state (by producer) -----hup_any_per_state <- applications %>%select(hup_primary_any, hup_secondary_any, hup_tertiary_any, state) %>%group_by(state) %>%summarise(across(everything(), ~sum(. ==1, na.rm =TRUE))) %>%ungroup() %>%# convert to long formatpivot_longer(cols =-state, names_to ="category", values_to ="count") %>%arrange(desc(state), desc(count)) %>%adorn_totals("row") %>%mutate(pct_producers_per_state =case_when( state =="virginia"~round(count / total_producers_va,2), state =="north_dakota"~round(count / total_producers_nd,2),TRUE~NA_real_# Handle other states if needed )) %>%# drop if state is blank or nafilter(!is.na(state) & state !="")# scales::percent(hup_any_per_state$pct_producers_per_state, accuracy = 0.01)# View the outputshup_summary %>%kable(title ="Counts of HUP status by producer (note a producer may have multiple flags)")
state
category
count
pct_producers_per_state
virginia
small_producer
227
0.42
virginia
beginning_producer
126
0.23
virginia
x100_percent_women_run_operation
58
0.11
virginia
socially_disadvantaged_producer
35
0.06
virginia
veteran_producer
22
0.04
virginia
limited_resource_producer
17
0.03
virginia
small_producer_2
12
0.02
virginia
beginning_producer_2
6
0.01
virginia
socially_disadvantaged_producer_2
5
0.01
virginia
small_producer_3
3
0.01
virginia
beginning_producer_3
2
0.00
virginia
limited_resource_producer_2
0
0.00
virginia
veteran_producer_2
0
0.00
virginia
limited_resource_producer_3
0
0.00
virginia
socially_disadvantaged_producer_3
0
0.00
virginia
veteran_producer_3
0
0.00
north_dakota
small_producer
118
0.24
north_dakota
beginning_producer
95
0.19
north_dakota
x100_percent_women_run_operation
25
0.05
north_dakota
socially_disadvantaged_producer
14
0.03
north_dakota
beginning_producer_2
7
0.01
north_dakota
veteran_producer
5
0.01
north_dakota
small_producer_2
3
0.01
north_dakota
beginning_producer_3
3
0.01
north_dakota
socially_disadvantaged_producer_3
2
0.00
north_dakota
limited_resource_producer
1
0.00
north_dakota
socially_disadvantaged_producer_2
1
0.00
north_dakota
limited_resource_producer_2
0
0.00
north_dakota
veteran_producer_2
0
0.00
north_dakota
small_producer_3
0
0.00
north_dakota
limited_resource_producer_3
0
0.00
north_dakota
veteran_producer_3
0
0.00
Total
-
787
NA
Code
hup_any_per_state %>%kable(title ="Counts when any producer has any HUP status", booktabs =TRUE)
6 Estimate implications on program targets (producers, acreage, budget)
First we need to account for the fact that there are up to two farms per application but up to three applicants, and each person may have their own HUP status. We also have different payment shares per applicant/coapplicant to account for.
Code
applications_slim <- applications %>%# create payment rate columns for each primary, coapplicant, and coapplicant 1# should be 125 if hup is 1 for hup_primary, hup_secondary, or hup_tertiary, else 100# bonus payments for lrp should be 500 if limited_resource_producer is 1, else 0mutate(payment_rate_primary =if_else(hup_primary_any ==1, 125, 100),payment_rate_coapplicant1 =if_else(hup_secondary_any ==1, 125, 100),payment_rate_coapplicant2 =if_else(hup_tertiary_any ==1, 125, 100), bonus_lrp_primary =if_else(limited_resource_producer ==1, 500, 0),bonus_lrp_coapplicant1 =if_else(limited_resource_producer_2 ==1, 500, 0),bonus_lrp_coapplicant2 =if_else(limited_resource_producer_3 ==1, 500, 0) ) %>%# turn NA's to 0'smutate_at(vars(contains("share_of_payment_percent")), ~ifelse(is.na(.), 0, ./100)) %>%group_by(entry_id, state) %>%mutate(primary_applicant_number_1_share_of_payment_percent =ifelse(total_applicants ==1, 1, primary_applicant_number_1_share_of_payment_percent)) %>%mutate(budget_farm_1 = primary_applicant_number_1_share_of_payment_percent * acres_or_animal_units_for_farm_number_1 * payment_rate_primary + bonus_lrp_primary + co_applicant_number_1_share_of_payment_percent * acres_or_animal_units_for_farm_number_1 * payment_rate_coapplicant1 + bonus_lrp_coapplicant1, budget_farm_2 =# if there is a farm 2, calculate its budgetifelse(!is.na(acres_or_animal_units_for_farm_number_2), primary_applicant_share_of_payment_percent * acres_or_animal_units_for_farm_number_2 * payment_rate_primary + bonus_lrp_primary + co_applicant_number_2_share_of_payment_percent * acres_or_animal_units_for_farm_number_2 * payment_rate_coapplicant2 + bonus_lrp_coapplicant2, 0) ) %>%mutate(budget_application =sum(budget_farm_1, budget_farm_2, na.rm =TRUE),total_acres =sum(acres_or_animal_units_for_farm_number_1, acres_or_animal_units_for_farm_number_2, na.rm =TRUE)) %>%ungroup() %>%select(randomized_priority, state, entry_id, total_acres, budget_application, total_applicants, budget_farm_1, budget_farm_2, contains("rate"), contains("bonus"),everything()) %>%arrange(randomized_priority, state, entry_id, total_acres, budget_application)
6.1 Summary of Sample Selection Implications Across All States
Code
applications_slim %>%arrange(randomized_priority, state) %>%group_by(state) %>%summarize(total_acres_au =sum(total_acres, na.rm =TRUE),total_producers =sum(total_applicants, na.rm =TRUE),total_budget =sum(budget_application, na.rm =TRUE) ) %>%adorn_totals("row") %>%mutate(across(where(is.numeric), ~comma(.))) %>%kable(caption ="Summary of Selection Implications Across All States, if all were funded (and assuming all animal units are as stated in application)")
Summary of Selection Implications Across All States, if all were funded (and assuming all animal units are as stated in application)
state
total_acres_au
total_producers
total_budget
north_dakota
100,081
499
7,364,194
virginia
81,864
541
5,728,970
Total
181,945
1,040
13,093,163
6.2 Apply selection thresholds to applications
We’ll first flag the applications that meet the priority criteria (LRP and SDP) and select the first 75% of those flagged. Of the remaining applications, we’ll then select 50% from VA and 75% from ND.
# Define the flagged_fraction as 75% for flagged applicationsflagged_fraction <-0.76# First, select 75% of all ND applicants and 50% of all VA applicants, including both flagged and unflagged applicationsapplications_selection_status <- applications_slim %>%arrange(randomized_priority, state) %>%ungroup() %>%group_by(state) %>%mutate(# Create a flag column that checks if any priority flags are TRUE for each rowflag =rowSums(across(all_of(priority_flags)) ==TRUE) >0,n_state =n(),# Dynamically assign state_fraction based on the statestate_fraction =case_when( state =="virginia"~0.5, state =="north_dakota"~0.75,TRUE~0.75# Default value for other states ),# Compute the row number for selection within each staterow_num_for_selection =row_number(),# Select applicants based on the state_fractionselection_status =ifelse(!flag & (row_num_for_selection <= (n_state * state_fraction)), "selected", NA_character_)) %>%mutate(# Compute the number of flagged applications to select based on flagged_fractiontotal_flagged =sum(flag),select_flagged_count =ceiling(total_flagged * flagged_fraction),flag_cumulative =cumsum(flag),# Mark selected flagged applicationsselection_status =ifelse(flag & flag_cumulative <= select_flagged_count, "selected", selection_status), ) %>%ungroup() %>%mutate(selection_status =replace_na(selection_status, "not selected"))# Display the updated datasetapplications_selection_status %>%head(10)
# Add a check to see how many flagged applications are selected by stateflagged_selection_summary <- applications_selection_status %>%group_by(state) %>%summarize(total_applications =n(),total_flagged =sum(flag),selected_flagged =sum(flag & selection_status =="selected") ) %>%mutate(pct_flagged = selected_flagged / total_flagged,pct_flagged_selected = selected_flagged / total_applications)# # Display the summary# flagged_selection_summary %>% kable()
6.3 Export application file with labels for “selected” or “not selected”
Once have finalized the input data and the relevant conversions, we will export the application file with the “selected” and “not selected” columns in the lefthand most column for ease of viewing. The below shows a sample
7 Generate summary of output for selected thresholds
In order to have a credible research design to evaluate the impacts of this funding, we cannot fund all applications in this round. Nor can we choose to fund all applicants but reduce the amount. Either of these options will render future impact evaluations for this program invalid.
We will need a subset of applications that are not selected – at complete random – to have a credible comparison group (called a “counterfactual”). These people will still be eligible for selection in future periods if they choose to re-enter the pool.
Given our understanding of the various target we seek to achieve and the multiple application periods, we propose the following thresholds, which would result in roughly the following amounts:
(Note unless otherwise indicated by the column applications, all numbers refer to producer counts rather than applications.)
Code
generate_summary_table <-function(df, state_input){# Step 3: Generate a summary output table state =trimws(gsub("", "_", tolower(state_input))) summary_table <- df %>%filter(state == state_input, selection_status =="selected") %>%group_by(hup_any) %>%summarise(state = state, # Display the state as a constant columnthreshold_used =as.character(state_fraction),budget_est =sum(budget_application, na.rm =TRUE),total_acres_au =sum(total_acres, na.rm =TRUE),budget_est =sum(budget_application, na.rm =TRUE),total_applications =n(),total_producers =sum(total_applicants, na.rm =TRUE),total_hup =sum(hup_any, na.rm =TRUE),total_lrp =sum(limited_resource_producer, na.rm =TRUE),total_sdp =sum(socially_disadvantaged_producer, na.rm =TRUE), ) %>%mutate(hup_any =as.character(hup_any)) %>%distinct()return(summary_table)}# Generate summary tables for different thresholdsnd_75 <-generate_summary_table(df = applications_selection_status,state_input ="north_dakota") %>%adorn_totals("row") %>%mutate(across(where(is.numeric), ~comma(.)))nd_75 %>%kable(title ="Summary of Budget Implications for ND By HUP Status")
hup_any
state
threshold_used
budget_est
total_acres_au
total_applications
total_producers
total_hup
total_lrp
total_sdp
0
north_dakota
0.75
2,904,560
46,372
181
210
0
0
0
1
north_dakota
0.75
2,624,776
30,610
152
168
152
1
11
Total
-
-
5,529,336
76,981
333
378
152
1
11
Code
va_50 <-generate_summary_table(df = applications_selection_status,state_input ="virginia") %>%adorn_totals("row") %>%mutate(across(where(is.numeric), ~comma(.)))va_50 %>%kable(title ="Summary of Budget Implications for Virginia By HUP Status")
hup_any
state
threshold_used
budget_est
total_acres_au
total_applications
total_producers
total_hup
total_lrp
total_sdp
0
virginia
0.5
1,395,590
24,676
103
105
0
0
0
1
virginia
0.5
1,620,825
19,411
166
181
166
14
28
Total
-
-
3,016,415
44,087
269
286
166
14
28
Code
nd_75_noformat <-generate_summary_table(df = applications_selection_status,state_input ="north_dakota") %>%adorn_totals("row") va_50_noformat <-generate_summary_table(df = applications_selection_status,state_input ="virginia") %>%adorn_totals("row") # Sum all rows but the ones labeled Totaltotals <-rbind(va_50_noformat, nd_75_noformat) %>%mutate(state =ifelse(hup_any =="Total", "state subtotal", state),hup_any =ifelse(hup_any =="Total", "-", hup_any)) # Step 2: Calculate new totals for rows previously labeled "Total"total_row <- totals %>%filter(state =="state subtotal") %>%summarise(across(where(is.numeric), sum, na.rm =TRUE))# Adding a label for the total rowtotal_row$state <-"Total"# Combining the modified data with the new total rowfinal_data <-bind_rows(totals, total_row) %>%# filter(state == "Total") %>% mutate(across(everything(), ~ifelse(is.na(.), "-", .))) %>%mutate(across(where(is.numeric), ~comma(as.numeric(.))))final_data %>%kable(caption ="Summary of Proposed Selection Implications by State and HUP Status") %>%kable_styling(full_width = F, position ="left", bootstrap_options =c("striped", "hover", "condensed", "responsive") )
Summary of Proposed Selection Implications by State and HUP Status
hup_any
state
threshold_used
budget_est
total_acres_au
total_applications
total_producers
total_hup
total_lrp
total_sdp
0
virginia
0.5
1,395,590
24,676
103
105
0
0
0
1
virginia
0.5
1,620,825
19,411
166
181
166
14
28
-
state subtotal
-
3,016,415
44,087
269
286
166
14
28
0
north_dakota
0.75
2,904,560
46,372
181
210
0
0
0
1
north_dakota
0.75
2,624,776
30,610
152
168
152
1
11
-
state subtotal
-
5,529,336
76,981
333
378
152
1
11
-
Total
-
8,545,751
121,068
602
664
318
15
39
8 Check on share of HUP and SDP selected
The flag here indicates whether an application has any applicant with HUP or SDP status. The numbers in the prior table refer to applicants (of which there may be up to three per application), whereas this table refers to applications where any of up to three applicants have HUP or SDP status.
Code
# Display the summaryflagged_selection_summary %>%kable()
state
total_applications
total_flagged
selected_flagged
pct_flagged
pct_flagged_selected
north_dakota
445
16
13
0.81
0.03
virginia
510
51
39
0.76
0.08
9 Remaining TO-DO’s
The budget estimates should be considered as exactly that – estimates! There are a few cases where the application files were inconsistent on the acreage within a farm and the total acreage requested, so we anticipate there may be adjustments once the acreage is reviewed/verified by local partners and the producers themselves.
More broadly, as we have continued to evaluate the implications of selection, we have further identified a few more errors that should have been caught in the applications input files before the data file arrived to selection but were not (e.g. inconsistent acreages between columns, a test case making it through to “Recommended” status). As we learn more in this round, we can make improvements that will allow us to move more quickly in subsequent rounds on what types of automatic flags to have before producers are able to submit their files.
10 Manual flags that we had to fix
We will seek to systematize the following checks moving forward:
Sum of practice code animals or acres should equal total farm acres
Practice code animals or acres should not be zero, missing, or negative
Practice code animals or acres should not be greater than total farm acres
Practice code animals or acres should not be greater than 160 per farm
The “has enhancement” column should not be empty when the enhancement is present (or is not NA)
Check to confirm the number of producers in LRP and SDP status per state are selected.
Source Code
---title: "VT Alliance Selection Protocol - Sample"author: "Elinor Benami"date: "`r format(Sys.time(), '%d %B, %Y')`"date-format: longnumber-sections: truetoc: truetoc-depth: 3format: html: code-fold: true code-summary: "Show the code" code-line-numbers: true code-tools: true embed-resources: true self-contained-math: trueexecute: warning: falseeditor: visual---<styletype="text/css">.main-container {max-width: 100%!important;margin: auto;}</style>```{r setup}# Clear the workspacerm(list =ls())# load the librarieslibrary(tidyverse)library(data.table) # to read in files fasterlibrary(tidylog)library(janitor)library(scales) # for comma and formattinglibrary(knitr) #for kablelibrary(kableExtra)library(dtplyr)# set params ---- options(digits =2) # show two significant digitsdownload_date <-as.Date("2024-05-01")today_date <-Sys.Date()'%nin%'<-function(x, table) !(x %in% table)## Custom Functions ---# Define the function to remove columns that are all NAremove_na_columns <-function(df) {# Use select and where to filter out columns that are all NA df_selected <- df %>%select(where(~!all(is.na(.))))# Return the modified dataframereturn(df_selected)}flag_for_review <-function(df){ df %>%group_by(lastname_entryid, farm_number) %>%mutate(sum_practice_units =sum(farm_practice_units, farm_practice_animals, na.rm =TRUE),over_totalfarm_units = sum_practice_units > farm_totalunits,na_count =rowSums(is.na(across(everything()))),flag_for_review =FALSE ) %>%group_by(lastname_entryid, farm_number) %>%mutate(flag_for_review =ifelse(over_totalfarm_units ==TRUE&n() >1& na_count ==max(na_count), 1, 0) ) %>%ungroup() %>%select(flag_for_review, na_count, over_totalfarm_units, everything()) }``````{r load_presteps}#| echo: falsemy_file_path <-"/Users/elinor/Dropbox/research/csa_stats/data/"```# Load the DataThe application data is from `r download_date`.```{r load_data}# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%# Load the data ---# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%data_path <-function(x) paste0(my_file_path, x)# Read the data, tidy column names, and sort by last nameapplications <-fread(data_path(paste0("vt_alliance/application-", download_date,".csv"))) %>%as.data.frame() %>%clean_names() %>%# convert all blank columns to NA'sna_if("") %>%na_if(" ") %>%# these have an incorrect number of total acres/animal units (using the sum of the two farms instead)mutate(total_number_of_acres_or_animal_units =# this entry had plans to enroll 160 but for farm 1 and farm 2 put 0ifelse(entry_id ==2324, 160, total_number_of_acres_or_animal_units)) %>%group_by(entry_id) %>%# these entries had a total that didn't equal the sum of the two farms; defaulted to sum of two farmsmutate(total_number_of_acres_or_animal_units =case_when( entry_id %in%c(3851, 1853) ~sum(acres_or_animal_units_for_farm_number_1, acres_or_animal_units_for_farm_number_2, na.rm =TRUE),TRUE~ total_number_of_acres_or_animal_units )) %>%# this is an ironistic test account -- it also has also has some subfilesfilter(entry_id %nin%c(2450, 2453, 2457)) %>%mutate(primary_applicant_name =paste(primary_applicant_name_first, primary_applicant_name_last, sep ="_"),coapplicant1_name =ifelse(!is.na(co_applicant_number_1_name_first), paste(co_applicant_number_1_name_first, co_applicant_number_1_name_last, sep ="_"), NA),coapplicant2_name =ifelse(!is.na(co_applicant_number_2_name_first), paste(co_applicant_number_2_name_first, co_applicant_number_2_name_last, sep ="_"), NA), state =trimws(gsub(" ", "_", tolower(state))) ) %>%ungroup() # filter(primary_applicant_name_last != "Lee123") ```# Generate a file that identifies and exports the farm, commodity, practice, and enhancement data```{r gen_clean_commodity_practice_enhancement}process_applications <-function(applications, farm_number, included_state, excluded_states) { included_state <-gsub(" ", "_", tolower(included_state)) state_title <- included_stateif(farm_number ==2){ farm_tractnum ="if_known_please_enter_the_tract_numbers_where_the_practices_will_be_installed" farm_practice =sym(paste0(included_state, "_planned_conservation_practices_practice_3")) farm_practice_units =paste0(included_state, "_planned_conservation_practices_number_of_acres_you_plan_to_enroll_2") farm_practice_animals =paste0(included_state, "_planned_conservation_practices_number_of_animals_you_plan_to_enroll_2") farm_enhancement_yesno =sym(paste0(included_state, "_planned_conservation_practices_add_practice_enhancement_optional_2")) } elseif (farm_number ==1){ farm_tractnum =sym(paste0("farm_number_", farm_number, "_tract_numbers")) farm_practice =sym(paste0(included_state, "_planned_conservation_practices_practice")) farm_practice_units =paste0(included_state, "_planned_conservation_practices_number_of_acres_you_plan_to_enroll") farm_practice_animals =paste0(included_state, "_planned_conservation_practices_number_of_animals_you_plan_to_enroll") farm_enhancement_yesno =sym(paste0(included_state, "_planned_conservation_practices_add_practice_enhancement_optional")) } else{print("Please enter a valid farm number (1 or 2 only)") }# Construct column names dynamically and force evaluation rename_list <-list(farm_commodity =sym(paste0(included_state, "_planned_conservation_practices_residue_and_tillage_management_no_till_enhancements_optional_", farm_number *2)),farm_fsa =sym(paste0("please_enter_the_fsa_number_for_farm_number_", farm_number)),farm_tractnum = farm_tractnum,farm_practice = farm_practice,farm_practice_units = farm_practice_units,farm_practice_animals = farm_practice_animals,farm_totalunits =sym(paste0("acres_or_animal_units_for_farm_number_", farm_number)),farm_enhancement_yesno = farm_enhancement_yesno,farm_enhancement_yesno2 =sym(paste0(included_state, "_planned_conservation_practices_add_practice_enhancement_optional_yes")) )# Use rename with !!! to splice the list renamed_columns <- applications %>%rename(!!!rename_list)# Adjusting selection and filtering according to the input parameters processed_data <- renamed_columns %>%select(entry_id, state, contains("name"), contains(c("farm", "enhancements", "pdf")),-contains(c("name_middle", "prefix", "suffix"))) %>%mutate(lastname_entryid =ifelse(!is.na(primary_applicant_name_last), paste0(primary_applicant_name_last, "_", entry_id), NA_character_) ) %>%fill(lastname_entryid, contains(c("primary_applicant")), state, .direction ="down") %>%select(lastname_entryid, everything(), contains("enhancements"), pdf_alliance_application) %>%filter(state == state_title) %>%select(-contains(excluded_states)) %>%group_by(lastname_entryid) %>%fill(contains(c("applicant")), .direction ="down") %>%ungroup() %>%remove_na_columns()# Additional processing long2 <- processed_data %>%mutate(primary_applicant_name =paste(primary_applicant_name_first, primary_applicant_name_last, sep ="_"),coapplicant1_name =ifelse(!is.na(co_applicant_number_1_name_first),paste(co_applicant_number_1_name_first, co_applicant_number_1_name_last, sep ="_"), NA),coapplicant2_name =ifelse(!is.na(co_applicant_number_2_name_first),paste(co_applicant_number_2_name_first, co_applicant_number_2_name_last, sep ="_"), NA) ) %>%pivot_longer(cols =contains("enhancements"),names_to =c("prefix", "enhancement_category"),names_sep =paste0(included_state, "_planned_conservation_practices_"),values_to ="farm_enhancement" ) %>%select(-contains(c("prefix", "category", "name_first", "name_last", "entry_id"))) %>%select(lastname_entryid, state, primary_applicant_name, coapplicant1_name, coapplicant2_name, farm_commodity, contains("farm_practice"), everything()) %>%filter(rowSums(!is.na(select(., -c(1:3)))) >0) %>%group_by(lastname_entryid) %>%fill(contains(c("fsa", "totalunits", "tractnum","pdf")), .direction ="down") %>%ungroup() %>%distinct()# Filter and reorganize long2 %>%mutate(completeness =rowSums(!is.na(select(., -lastname_entryid))),# filter_flag = round(farm_practice_units) == round(farm_totalunits),has_farm_enhancement =trimws(farm_enhancement_yesno),drop_flag =ifelse(has_farm_enhancement =="Yes"&is.na(farm_enhancement), 1, 0) ) %>%group_by(lastname_entryid) %>%filter(drop_flag ==0|is.na(drop_flag),!is.na(farm_commodity)) %>%# mutate_add a flag for the farm number as inputmutate(farm_number = farm_number) %>%select(lastname_entryid, state, primary_applicant_name, contains("coapplicant"), farm_number, farm_fsa, farm_tractnum, farm_commodity,contains("farm_practice"), farm_totalunits, has_farm_enhancement, farm_enhancement,contains("pdf"), completeness)}# For Farm 1 in Virginia, exclude North Dakota, Arkansas, and Minnesotava_farm1 <-process_applications(applications, farm_number =1, "Virginia", c("north_dakota", "arkansas", "minnesota"))va_farm2 <-process_applications(applications, farm_number =2, "Virginia", c("north_dakota", "arkansas", "minnesota"))va_farms_total <-bind_rows(va_farm1, va_farm2) %>%arrange(lastname_entryid)nd_farm1 <-process_applications(applications, farm_number =1, "North Dakota", c("virginia", "arkansas", "minnesota"))nd_farm2 <-process_applications(applications, farm_number =2, "North Dakota", c("virginia", "arkansas", "minnesota"))nd_farms_total <-bind_rows(nd_farm1, nd_farm2) %>%arrange(lastname_entryid)combine_nd_va <-bind_rows(va_farms_total, nd_farms_total) %>%arrange(state, lastname_entryid)write.csv(combine_nd_va, data_path(paste0("vt_alliance/combinedfarms_", today_date, ".csv")))```# View Basic Summary Stats in Advance of Randomization```{r distribution_acreage, echo = FALSE}# | output: asisapplications %>%group_by(state) %>%filter(district_verification_status =="Recommended") %>%summarise(n_applications =n(),acres_or_animal_units =sum(total_number_of_acres_or_animal_units, na.rm =TRUE)) %>%filter(acres_or_animal_units !=0) %>%adorn_totals() %>%mutate(across(where(is.numeric), ~comma(.))) %>%kable()```# Set the Random Seed and Conduct Selection```{r set_seed_randomize}# | output: asis# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%# Here a colleague provides a random seed number to ensure reproducibility ----# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%seed_number <-20set.seed(seed_number)# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%# 1. Conduct Selection (Simple Version) ---# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%# Generate Unique ID's for recommended applicants in each state and append a randomized priorityapplications <- applications %>%filter(district_verification_status =="Recommended") %>%arrange(primary_applicant_name_last) %>%group_by(state) %>%mutate(randomized_priority =sample(n(), replace =FALSE) ) %>%ungroup() %>%select( randomized_priority, state, entry_id, farm_number_1_address_city, primary_applicant_name_last,everything() ) %>%arrange(randomized_priority, state, entry_id)# View the outputapplications %>%head(10)```## Export the SelectionThis exports our new application file with the prioritized list for funding based on our randomization.```{r export}# Export the list with the ids and priorities to a CSV file ----fwrite(applications,file =data_path(paste0("vt_alliance/selection_files/selection-", today_date,"-final.csv")),row.names =FALSE)```# Evaluate budget, acreage, and producer implications of selection## How many applications do we have that feature coapplicants?First we'll identify all the columns with any text in the last name column as a heuristic for the number of producers. We'll need this to figure out what our estimated budget is, as applicants differ in the share of the payment requested per farm and whether or not they are eligible for the 25% incremental payment for HUP status.```{r total_applicants}# | output: asis# Step 1: Identify columns with that feature last names as our heuristic for distinct producersidentify_applicant_cols <- applications %>%select(ends_with("name_last")) %>%colnames(); identify_applicant_cols# Step 2: Strip out leading and trailing white space from selected columns and # Create new columns with "_flag" suffix indicating whether each cell in the # identified cols contains text (1) or not (0), without replacing the original values.applications <- applications %>%mutate(across(all_of(identify_applicant_cols), ~str_squish(.))) %>%mutate(across(all_of(identify_applicant_cols), ~if_else(is.na(.x) | .x =="", 0, 1), .names ="{.col}_flag")) %>%# generate a column with the total number of applicants per applicationmutate(total_applicants =rowSums(select(., ends_with("_flag"))))applications %>%tabyl(total_applicants) %>%kable(digits =c(1,1,2),caption ="Distribution of Applications with Multiple Applicants" )```The majority of applications have only one applicant, but there are a few with multiple applicants. The total number of applicants is the "total_applicants" times the number of applications with that number of applicants.## How many total applicants do we have?```{r total_producers}# | output: asis# Step 3: Generate a total count of the number of times we have text in each of those columns by state --- total_producers_by_state <- applications %>%select(ends_with("_flag"), state) %>%group_by(state) %>%summarise(across(everything(), ~sum(. ==1, na.rm =TRUE))) %>%# Convert to long format for ease of viewingpivot_longer(cols =-state, names_to ="category", values_to ="value") %>%arrange(desc(state), desc(value)) %>%adorn_totals("row")total_producers_by_state %>%kable(caption ="Number of Applicants by State" )total_producers <- total_producers_by_state %>%filter(state =="Total") %>%pull(value)total_producers_va <- total_producers_by_state %>%filter(state =="virginia") %>%summarise(n =sum(value)) %>%pull(n)total_producers_nd <- total_producers_by_state %>%filter(state =="north_dakota") %>%summarise(n=sum(value)) %>%pull(n)```## Generate counts of and flags for historically underserved producersFirst we will identify all the columns that relate to our HUP flags and determine who they apply to (primary applicant, the first co-applicant, indicated with "\_2", or the second co-applicant, indicated with "\_3").```{r underserved_producers}# Step 1: Identify columns that classify as HUPdefine_hup <- applications %>%select(matches("limited_resource_producer|small_producer|beginning_producer|women| female_owned_operation|socially_disadvantaged_producer|veteran_producer")) %>%colnames(); define_hupdefine_hup_primary <- define_hup %>%str_subset(pattern ="^(?!.*(_2|_3)).*$"); define_hup_primarydefine_hup_secondary <- define_hup %>%str_subset(pattern ="_2"); define_hup_secondarydefine_hup_tertiary <- define_hup %>%str_subset(pattern ="_3"); define_hup_tertiary# Step 2: Strip out leading and trailing white space from selected columns (str_squish)# Step 3: Convert all cells without text (including NA's) to 0# Step 4: Add flags for whether any of the columns in the define_hup lists contain text (1) or not (0)applications <- applications %>%mutate(across(all_of(define_hup), ~if_else(is.na(str_squish(.)) |str_squish(.) =="", 0, 1))) %>%mutate(hup_sum =rowSums(select(., define_hup)), # by application# by producerhup_primary =rowSums(select(., define_hup_primary)),hup_secondary =rowSums(select(., define_hup_secondary)),hup_tertiary =rowSums(select(., define_hup_tertiary)) ) %>%mutate(hup_any =as.numeric(hup_sum >=1), # by application# by producerhup_primary_any =as.numeric(hup_primary >=1),hup_secondary_any =as.numeric(hup_secondary >=1),hup_tertiary_any =as.numeric(hup_tertiary >=1) ) %>%select( hup_any, hup_primary_any, hup_secondary_any, hup_tertiary_any, hup_sum, hup_primary, hup_secondary, hup_tertiary, define_hup, everything() ) %>%# order from largest to smallest (desc)arrange(across(c(hup_sum, hup_any, hup_primary, hup_primary_any, hup_secondary, hup_secondary_any, hup_tertiary, hup_tertiary_any, define_hup), desc))applications %>%head(10) ```### Summarize HUP status by category and state```{r summarize_hup}# Step 5a: Summarize hup status by category and state (a producer may fall into multiple categories) ----hup_summary <- applications %>%select(define_hup, state) %>%group_by(state) %>%summarise(across(everything(), ~sum(. ==1, na.rm =TRUE))) %>%ungroup() %>%# convert to long format for ease of viewingpivot_longer(cols =-state, names_to ="category", values_to ="count") %>%arrange(desc(state), desc(count)) %>%adorn_totals("row") %>%mutate(pct_producers_per_state =case_when( state =="virginia"~round(count / total_producers_va, 2), state =="north_dakota"~round(count / total_producers_nd, 2),TRUE~NA_real_# Handle other states if needed )) %>%# drop if state is blank or nafilter(!is.na(state) & state !="")# Step 5b: Generate a summary of hup ANY status by state (by producer) -----hup_any_per_state <- applications %>%select(hup_primary_any, hup_secondary_any, hup_tertiary_any, state) %>%group_by(state) %>%summarise(across(everything(), ~sum(. ==1, na.rm =TRUE))) %>%ungroup() %>%# convert to long formatpivot_longer(cols =-state, names_to ="category", values_to ="count") %>%arrange(desc(state), desc(count)) %>%adorn_totals("row") %>%mutate(pct_producers_per_state =case_when( state =="virginia"~round(count / total_producers_va,2), state =="north_dakota"~round(count / total_producers_nd,2),TRUE~NA_real_# Handle other states if needed )) %>%# drop if state is blank or nafilter(!is.na(state) & state !="")# scales::percent(hup_any_per_state$pct_producers_per_state, accuracy = 0.01)# View the outputshup_summary %>%kable(title ="Counts of HUP status by producer (note a producer may have multiple flags)") hup_any_per_state %>%kable(title ="Counts when any producer has any HUP status", booktabs =TRUE)``````{r hup_pct_calc}n_hup <- hup_any_per_state %>%filter(state =="Total") %>%pull(count)pct_hup <- n_hup / total_producers; pct_hupscales::percent(pct_hup)```The number of applicants that fall into any HUP category is `r n_hup` (`r `scales::percent(pct_hup))`). ## Generate counts of and flags for limited resource producersFirst we will identify all the columns that relate to our LRP flags```{r limited_resource_producers}# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%# C. Generate a total count of limited resource and socially disadvantaged producers# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%gen_summary <-function(str_pattern) { summary <- applications %>%select(matches(str_pattern), state) %>%group_by(state) %>%summarise(across(matches(str_pattern), ~sum(. ==1, na.rm =TRUE))) %>%pivot_longer(cols =-state, names_to ="category", values_to ="count") %>%arrange(desc(state), desc(count)) %>%adorn_totals("row") %>%mutate(pct_producers_state =case_when( state =="Virginia"~round(count / total_producers_va, 2), state =="North Dakota"~round(count / total_producers_nd, 2),TRUE~NA_real_# Handle other states if needed )) %>%filter(!is.na(state) & state !="")return(summary)}lrp_summary <-gen_summary("limited_resource_producer"); lrp_summary %>%kable()sdp_summary <-gen_summary("socially_disadvantaged_producer"); sdp_summary %>%kable()```The number of producers that are in the LRP and SDP categories are:```{r lrp_pct_calc}# Calculate total LRP producers ----total_lrp_producers <- lrp_summary %>%filter(state =="Total") %>%pull(count)total_sdp_producers <- sdp_summary %>%filter(state =="Total") %>%pull(count)# Calculate percentage of LRP producers ----pct_lrp <- total_lrp_producers / total_producers; pct_lrppct_sdp <- total_sdp_producers / total_producers; pct_sdp```# Estimate implications on program targets (producers, acreage, budget)First we need to account for the fact that there are up to two farms per application but up to three applicants, and each person may have their own HUP status. We also have different payment shares per applicant/coapplicant to account for.```{r budget_estimation}applications_slim <- applications %>%# create payment rate columns for each primary, coapplicant, and coapplicant 1# should be 125 if hup is 1 for hup_primary, hup_secondary, or hup_tertiary, else 100# bonus payments for lrp should be 500 if limited_resource_producer is 1, else 0mutate(payment_rate_primary =if_else(hup_primary_any ==1, 125, 100),payment_rate_coapplicant1 =if_else(hup_secondary_any ==1, 125, 100),payment_rate_coapplicant2 =if_else(hup_tertiary_any ==1, 125, 100), bonus_lrp_primary =if_else(limited_resource_producer ==1, 500, 0),bonus_lrp_coapplicant1 =if_else(limited_resource_producer_2 ==1, 500, 0),bonus_lrp_coapplicant2 =if_else(limited_resource_producer_3 ==1, 500, 0) ) %>%# turn NA's to 0'smutate_at(vars(contains("share_of_payment_percent")), ~ifelse(is.na(.), 0, ./100)) %>%group_by(entry_id, state) %>%mutate(primary_applicant_number_1_share_of_payment_percent =ifelse(total_applicants ==1, 1, primary_applicant_number_1_share_of_payment_percent)) %>%mutate(budget_farm_1 = primary_applicant_number_1_share_of_payment_percent * acres_or_animal_units_for_farm_number_1 * payment_rate_primary + bonus_lrp_primary + co_applicant_number_1_share_of_payment_percent * acres_or_animal_units_for_farm_number_1 * payment_rate_coapplicant1 + bonus_lrp_coapplicant1, budget_farm_2 =# if there is a farm 2, calculate its budgetifelse(!is.na(acres_or_animal_units_for_farm_number_2), primary_applicant_share_of_payment_percent * acres_or_animal_units_for_farm_number_2 * payment_rate_primary + bonus_lrp_primary + co_applicant_number_2_share_of_payment_percent * acres_or_animal_units_for_farm_number_2 * payment_rate_coapplicant2 + bonus_lrp_coapplicant2, 0) ) %>%mutate(budget_application =sum(budget_farm_1, budget_farm_2, na.rm =TRUE),total_acres =sum(acres_or_animal_units_for_farm_number_1, acres_or_animal_units_for_farm_number_2, na.rm =TRUE)) %>%ungroup() %>%select(randomized_priority, state, entry_id, total_acres, budget_application, total_applicants, budget_farm_1, budget_farm_2, contains("rate"), contains("bonus"),everything()) %>%arrange(randomized_priority, state, entry_id, total_acres, budget_application)```## Summary of Sample Selection Implications Across All States```{r budget_summary_all}#| output: asisapplications_slim %>%arrange(randomized_priority, state) %>%group_by(state) %>%summarize(total_acres_au =sum(total_acres, na.rm =TRUE),total_producers =sum(total_applicants, na.rm =TRUE),total_budget =sum(budget_application, na.rm =TRUE) ) %>%adorn_totals("row") %>%mutate(across(where(is.numeric), ~comma(.))) %>%kable(caption ="Summary of Selection Implications Across All States, if all were funded (and assuming all animal units are as stated in application)")```## Apply selection thresholds to applicationsWe'll first flag the applications that meet the priority criteria (LRP and SDP) and select the first 75% of those flagged. Of the remaining applications, we'll then select 50% from VA and 75% from ND.The priority flags are in any of these columns:```{r priority_flags}# Parameterspriority_flags <- applications %>%select(matches("limited_resource_producer|socially_disadvantaged_producer")) %>%colnames(); priority_flags```Then apply the given thresholds:```{r apply_thresholds}# Define the flagged_fraction as 75% for flagged applicationsflagged_fraction <-0.76# First, select 75% of all ND applicants and 50% of all VA applicants, including both flagged and unflagged applicationsapplications_selection_status <- applications_slim %>%arrange(randomized_priority, state) %>%ungroup() %>%group_by(state) %>%mutate(# Create a flag column that checks if any priority flags are TRUE for each rowflag =rowSums(across(all_of(priority_flags)) ==TRUE) >0,n_state =n(),# Dynamically assign state_fraction based on the statestate_fraction =case_when( state =="virginia"~0.5, state =="north_dakota"~0.75,TRUE~0.75# Default value for other states ),# Compute the row number for selection within each staterow_num_for_selection =row_number(),# Select applicants based on the state_fractionselection_status =ifelse(!flag & (row_num_for_selection <= (n_state * state_fraction)), "selected", NA_character_)) %>%mutate(# Compute the number of flagged applications to select based on flagged_fractiontotal_flagged =sum(flag),select_flagged_count =ceiling(total_flagged * flagged_fraction),flag_cumulative =cumsum(flag),# Mark selected flagged applicationsselection_status =ifelse(flag & flag_cumulative <= select_flagged_count, "selected", selection_status), ) %>%ungroup() %>%mutate(selection_status =replace_na(selection_status, "not selected"))# Display the updated datasetapplications_selection_status %>%head(10)# Add a check to see how many flagged applications are selected by stateflagged_selection_summary <- applications_selection_status %>%group_by(state) %>%summarize(total_applications =n(),total_flagged =sum(flag),selected_flagged =sum(flag & selection_status =="selected") ) %>%mutate(pct_flagged = selected_flagged / total_flagged,pct_flagged_selected = selected_flagged / total_applications)# # Display the summary# flagged_selection_summary %>% kable()```## Export application file with labels for "selected" or "not selected"Once have finalized the input data and the relevant conversions, we will export the application file with the "selected" and "not selected" columns in the lefthand most column for ease of viewing. The below shows a sample```{r}applications_selection_status_slim <- applications_selection_status %>%mutate(primary_applicant_name =paste(primary_applicant_name_first, primary_applicant_name_last, sep ="_"),coapplicant1_name =ifelse(!is.na(co_applicant_number_1_name_first), paste(co_applicant_number_1_name_first, co_applicant_number_1_name_last, sep ="_"), NA),coapplicant2_name =ifelse(!is.na(co_applicant_number_2_name_first), paste(co_applicant_number_2_name_first, co_applicant_number_2_name_last, sep ="_"), NA),lastname_entryid =paste0(primary_applicant_name_last, "_", entry_id)) %>%select(selection_status, flag, entry_id, primary_applicant_name_last, state, #randomized_priority, matches(".*primary.*name.*"), matches(".*primary.*address.*"), matches(".*primary.*email.*"), matches(".*primary.*phone.*"), hup_primary_any, contains("counties"),matches(".*farm.*address.*"), matches(".*coapplicant.*name.*"), matches(".*co_applicant.*address.*"),matches(".*co_applicant.*email.*"), matches(".*co_applicant.*phone.*"), contains("share_of_payment"), total_acres, budget_application, contains("zip"), contains("virginia_counties"), contains("north_dakota_counties"), contains("name"), define_hup, contains("hup"), are_you_interested_in_using_comet_farm,# contains("comet"), pdf_alliance_application) %>%select(-contains(c("name_middle", "prefix", "suffix", "name_first", "name_last", "minnesota", "arkansas"))) %>%select(selection_status, flag, entry_id, state, primary_applicant_address_zip_postal_code, contains("counties"), everything())head(applications_selection_status_slim)# fwrite(applications_selection_status_slim,# file = data_path(paste0("vt_alliance/selection_files/selection_run-",# today_date,"-final-seed-", seed_number,".csv")), row.names = FALSE)```## Merge in the selection data with farm/crop/practice/enhancement information```{r merge_selection_data_with_farm_info}# Merge in the selection data with the farm/crop/practice/enhancement informationcheck <- applications_selection_status_slim %>%full_join(combine_nd_va, by =c("lastname_entryid", "state", "primary_applicant_name", "coapplicant1_name", "coapplicant2_name", "pdf_alliance_application")) %>%# full_join(va_farms_total_flag, by = c("lastname_entryid", "pdf_alliance_application",) %>% mutate(flag =as.integer(as.logical(flag)),counties =coalesce(north_dakota_counties, virginia_counties)) %>%rename(lrp_sdp_priority_flag = flag,primary_applicant_number_2_share_of_payment_percent = primary_applicant_share_of_payment_percent) %>%select(-contains(c ("counties_2", "virginia_counties", "north_dakota_counties", "small", "women", "veteran","beginning")), -primary_applicant_address_country, -farm_number_1_address_country, -farm_number_2_address_country, -contains("country"), -representative_name, everything()) %>%mutate(farm_number_1_address_state_province =ifelse(state =="virginia", "VA", "ND"),farm_number_2_address_state_province =ifelse(state =="virginia", "VA", "ND"))# Function to concatenate address partsconcatenate_address <-function(street, line2, city, state, zip) { address <-paste0( street, ", ",ifelse(is.na(line2) | line2 =="", "", paste0(line2, " ")), city, ", ", state, " ", zip )return(address)}# List of address prefixes to processaddress_prefixes <-c("farm_number_1_address", "farm_number_2_address", "primary_applicant_address", "co_applicant_number_1_address", "co_applicant_number_2_address")# Function to create full address columnscreate_full_address <-function(df, prefix) { df %>%mutate("{prefix}_full_address":=pmap_chr(list( df[[paste0(prefix, "_street_address")]], df[[paste0(prefix, "_address_line_2")]], df[[paste0(prefix, "_city")]], df[[paste0(prefix, "_state_province")]], df[[paste0(prefix, "_zip_postal_code")]] ), concatenate_address))}# Apply the function to each prefix and reduce the data frameaddy_check <-reduce(address_prefixes, create_full_address, .init = check) %>%select(-matches("address"), contains("full_address")) %>%select(-contains(c("completeness", "small", "women", "veteran","beginning", "representative", "virginia_counties", "north_dakota_counties")))farm1_details <- addy_check %>%filter(farm_number ==1) %>%select(-contains(c("farm_number_2", "coapplicant2", "co_applicant_number_2", "producer_3", "tertiary","hup_sum", "primary_applicant_number_2_share_of_payment_percent")))farm2_details <- addy_check %>%filter(farm_number ==2) %>%select(-contains(c("farm_number_1", "coapplicant1", "co_applicant_number_1", "producer_2", "secondary","hup_sum", "primary_applicant_number_1_share_of_payment_percent")))all_farm_details <-bind_rows(farm1_details, farm2_details) %>%select(-contains("pdf"), everything(), contains("pdf"))# Combine co_applicant_number_1 and co_applicant_number_2 related columns using coalesceall_farms_combine <- all_farm_details %>%mutate(coapplicant_name =coalesce(coapplicant1_name, coapplicant2_name),co_applicant_email =coalesce(co_applicant_number_1_email, co_applicant_number_2_email),co_applicant_phone =coalesce(co_applicant_number_1_phone, co_applicant_number_2_phone),co_applicant_share_of_payment_percent =coalesce(co_applicant_number_1_share_of_payment_percent, co_applicant_number_2_share_of_payment_percent),primary_applicant_share_of_payment_percent =coalesce(primary_applicant_number_1_share_of_payment_percent, primary_applicant_number_2_share_of_payment_percent), farm_address =coalesce(farm_number_1_address_full_address, farm_number_2_address_full_address),coapplicant_address =coalesce(co_applicant_number_1_address_full_address, co_applicant_number_2_address_full_address),hup_coapplicant =coalesce(hup_secondary_any, hup_tertiary_any),socially_disadvantaged_coapplicant =coalesce(socially_disadvantaged_producer_2, socially_disadvantaged_producer_3),limited_resource_coapplicant =coalesce(limited_resource_producer_2, limited_resource_producer_3) ) %>%select(-coapplicant1_name, -coapplicant2_name,-co_applicant_number_1_email, -co_applicant_number_2_email,-co_applicant_number_1_phone, -co_applicant_number_2_phone,-co_applicant_number_1_share_of_payment_percent, -co_applicant_number_2_share_of_payment_percent,-primary_applicant_number_1_share_of_payment_percent, -primary_applicant_number_2_share_of_payment_percent, -farm_number_1_address_full_address, -farm_number_2_address_full_address,-co_applicant_number_1_address_full_address, -co_applicant_number_2_address_full_address,-hup_primary, -hup_secondary, -hup_tertiary, -socially_disadvantaged_producer_2, -socially_disadvantaged_producer_3,-hup_secondary_any, -hup_tertiary_any,-limited_resource_producer_2, -limited_resource_producer_3 ) %>%arrange(desc(selection_status), state, primary_applicant_name, entry_id, coapplicant_name) #%>% # filter(!is.na(entry_id))col_ordering <-c("selection_status", "entry_id", # "lastname_entryid", "state", "primary_applicant_name", "primary_applicant_address_full_address", "primary_applicant_email", "primary_applicant_phone", # underserved status for primary applicant"lrp_sdp_priority_flag", "hup_primary_any", "limited_resource_producer", "socially_disadvantaged_producer", # farm information"farm_number", "counties", "farm_address", "farm_fsa", "farm_tractnum", "farm_commodity", "farm_practice", "has_farm_enhancement", "farm_enhancement", "farm_practice_units","farm_practice_animals", "farm_totalunits", "total_acres", "coapplicant_name", "coapplicant_address", "co_applicant_email", "co_applicant_phone", "limited_resource_coapplicant", "socially_disadvantaged_coapplicant", "hup_coapplicant","budget_application", "primary_applicant_share_of_payment_percent", "co_applicant_share_of_payment_percent", # "hup_any", # on the fence about whether to include this"are_you_interested_in_using_comet_farm","pdf_alliance_application")# Custom function to convert text to title case except for state abbreviationsto_title_case_except_states <-function(address) {# Define a vector of state abbreviations state_abbreviations <-c("AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY" )# Convert the address to title case address_title_case <-str_to_title(address)# Replace state abbreviations with uppercase versionsfor (state in state_abbreviations) { address_title_case <-str_replace_all( address_title_case, regex(paste0("\\b", state, "\\b"), ignore_case =TRUE), state ) }return(address_title_case)}all_farms_ordered <- all_farms_combine %>%select(all_of(col_ordering)) %>%mutate(state =ifelse(state =="virginia", "VA", "ND")) %>%group_by(primary_applicant_name) %>%arrange(desc(selection_status), state, primary_applicant_name, farm_number) %>%ungroup() %>%distinct()# Dynamically identify columns that are likely address columnsaddress_columns <-names(all_farms_ordered)[grepl("address", names(all_farms_ordered), ignore.case =TRUE)]# Apply the function to all identified address columnsall_farms_ordered[address_columns] <-lapply(all_farms_ordered[address_columns], function(column) {sapply(column, to_title_case_except_states)})# Custom function to format namesformat_name <-function(name) {# Replace underscores with spaces name <-str_replace_all(name, "_", " ")# Convert the name to title case name <-str_to_title(name)# Handle the special case for "de" prefix name <-str_replace_all(name, "\\bDe\\b", "de")return(name)}# Custom function to replace full state names with abbreviationsconvert_state_abbreviation <-function(address) {# Define a named vector of state abbreviations state_names <-c("Alabama"="AL", "Alaska"="AK", "Arizona"="AZ", "Arkansas"="AR", "California"="CA", "Colorado"="CO", "Connecticut"="CT", "Delaware"="DE", "Florida"="FL", "Georgia"="GA", "Hawaii"="HI", "Idaho"="ID", "Illinois"="IL", "Indiana"="IN", "Iowa"="IA", "Kansas"="KS", "Kentucky"="KY", "Louisiana"="LA", "Maine"="ME", "Maryland"="MD", "Massachusetts"="MA", "Michigan"="MI", "Minnesota"="MN", "Mississippi"="MS", "Missouri"="MO", "Montana"="MT", "Nebraska"="NE", "Nevada"="NV", "New Hampshire"="NH", "New Jersey"="NJ", "New Mexico"="NM", "New York"="NY", "North Carolina"="NC", "North Dakota"="ND", "Ohio"="OH", "Oklahoma"="OK", "Oregon"="OR", "Pennsylvania"="PA", "Rhode Island"="RI", "South Carolina"="SC", "South Dakota"="SD", "Tennessee"="TN", "Texas"="TX", "Utah"="UT", "Vermont"="VT", "Virginia"="VA", "Washington"="WA", "West Virginia"="WV", "Wisconsin"="WI", "Wyoming"="WY" )# Replace full state names with abbreviationsfor (state innames(state_names)) { address <-str_replace_all(address, regex(paste0("\\b", state, "\\b"), ignore_case =TRUE), state_names[state]) }return(address)}# Custom function to capitalize directional calls with specific patterncapitalize_directionals <-function(address) {# Define a vector of directional calls directionals <-c("NE", "NW", "SE", "SW", "N", "E", "S", "W")# Capitalize directional calls only if they have a space before and a comma or space afterfor (dir in directionals) { address <-str_replace_all(address, regex(paste0("(?<=\\s)", dir, "(?=\\s|,)"), ignore_case =TRUE), toupper(dir)) }return(address)}all_farms_ordered_interim <- all_farms_ordered %>%rename(county = counties) %>%mutate(across(contains("name", ignore.case =TRUE), ~sapply(., format_name))) %>%mutate(across(contains("address", ignore.case =TRUE), ~sapply(., convert_state_abbreviation))) %>%mutate(across(contains("address", ignore.case =TRUE), ~sapply(., capitalize_directionals)))all_farms_ordered_final <- all_farms_ordered_interim %>%group_by(primary_applicant_name) %>%mutate(selection_status =ifelse(is.na(selection_status), "Not Recommended", selection_status)) %>%group_by(primary_applicant_name, farm_number) %>%# identify where the sum of practice acres and animal units does not total the farm total units, and flag with 1 if not equalmutate(flag_for_review =ifelse(round(sum(farm_practice_units, farm_practice_animals, na.rm =TRUE)) !=round(farm_totalunits), 1, 0)) %>%arrange(desc(selection_status), state, primary_applicant_name, farm_number) %>%ungroup()all_farms_ordered_final %>%select(flag_for_review, selection_status, entry_id, state, county,lrp_sdp_priority_flag, everything()) %>%head()write.csv(all_farms_ordered_final, file =paste0(data_path("vt_alliance/selection_files/all_farms_ordered_final_updated_"), today_date, ".csv"),row.names =FALSE)``````{r runall}```# Generate summary of output for selected thresholdsIn order to have a credible research design to evaluate the impacts of this funding, we cannot fund all applications in this round. Nor can we choose to fund all applicants but reduce the amount. Either of these options will render future impact evaluations for this program invalid.We will need a subset of applications that are not selected -- at complete random -- to have a credible comparison group (called a "counterfactual"). These people will still be eligible for selection in future periods if they choose to re-enter the pool.Given our understanding of the various target we seek to achieve and the multiple application periods, we propose the following thresholds, which would result in roughly the following amounts:(Note unless otherwise indicated by the column applications, all numbers refer to producer counts rather than applications.)```{r budget_summary_acreage_producers_rev}generate_summary_table <-function(df, state_input){# Step 3: Generate a summary output table state =trimws(gsub("", "_", tolower(state_input))) summary_table <- df %>%filter(state == state_input, selection_status =="selected") %>%group_by(hup_any) %>%summarise(state = state, # Display the state as a constant columnthreshold_used =as.character(state_fraction),budget_est =sum(budget_application, na.rm =TRUE),total_acres_au =sum(total_acres, na.rm =TRUE),budget_est =sum(budget_application, na.rm =TRUE),total_applications =n(),total_producers =sum(total_applicants, na.rm =TRUE),total_hup =sum(hup_any, na.rm =TRUE),total_lrp =sum(limited_resource_producer, na.rm =TRUE),total_sdp =sum(socially_disadvantaged_producer, na.rm =TRUE), ) %>%mutate(hup_any =as.character(hup_any)) %>%distinct()return(summary_table)}# Generate summary tables for different thresholdsnd_75 <-generate_summary_table(df = applications_selection_status,state_input ="north_dakota") %>%adorn_totals("row") %>%mutate(across(where(is.numeric), ~comma(.)))nd_75 %>%kable(title ="Summary of Budget Implications for ND By HUP Status")va_50 <-generate_summary_table(df = applications_selection_status,state_input ="virginia") %>%adorn_totals("row") %>%mutate(across(where(is.numeric), ~comma(.)))va_50 %>%kable(title ="Summary of Budget Implications for Virginia By HUP Status")nd_75_noformat <-generate_summary_table(df = applications_selection_status,state_input ="north_dakota") %>%adorn_totals("row") va_50_noformat <-generate_summary_table(df = applications_selection_status,state_input ="virginia") %>%adorn_totals("row") # Sum all rows but the ones labeled Totaltotals <-rbind(va_50_noformat, nd_75_noformat) %>%mutate(state =ifelse(hup_any =="Total", "state subtotal", state),hup_any =ifelse(hup_any =="Total", "-", hup_any)) # Step 2: Calculate new totals for rows previously labeled "Total"total_row <- totals %>%filter(state =="state subtotal") %>%summarise(across(where(is.numeric), sum, na.rm =TRUE))# Adding a label for the total rowtotal_row$state <-"Total"# Combining the modified data with the new total rowfinal_data <-bind_rows(totals, total_row) %>%# filter(state == "Total") %>% mutate(across(everything(), ~ifelse(is.na(.), "-", .))) %>%mutate(across(where(is.numeric), ~comma(as.numeric(.))))final_data %>%kable(caption ="Summary of Proposed Selection Implications by State and HUP Status") %>%kable_styling(full_width = F, position ="left", bootstrap_options =c("striped", "hover", "condensed", "responsive") )```# Check on share of HUP and SDP selectedThe flag here indicates whether an application has any applicant with HUP or SDP status. The numbers in the prior table refer to applicants (of which there may be up to three per application), whereas this table refers to applications where any of up to three applicants have HUP or SDP status.```{r flagged_selected_summary}# Display the summaryflagged_selection_summary %>%kable()```# Remaining TO-DO'sThe budget estimates should be considered as exactly that -- estimates! There are a few cases where the application files were inconsistent on the acreage within a farm and the total acreage requested, so we anticipate there may be adjustments once the acreage is reviewed/verified by local partners and the producers themselves.More broadly, as we have continued to evaluate the implications of selection, we have further identified a few more errors that should have been caught in the applications input files before the data file arrived to selection but were not (e.g. inconsistent acreages between columns, a test case making it through to "Recommended" status). As we learn more in this round, we can make improvements that will allow us to move more quickly in subsequent rounds on what types of automatic flags to have *before* producers are able to submit their files.# Manual flags that we had to fix We will seek to systematize the following checks moving forward:1. Sum of practice code animals or acres should equal total farm acres2. Practice code animals or acres should not be zero, missing, or negative3. Practice code animals or acres should not be greater than total farm acres4. Practice code animals or acres should not be greater than 160 per farm5. The "has enhancement" column should not be empty when the enhancement is present (or is not NA)6. Check to confirm the number of producers in LRP and SDP status per state are selected.