# Using the US Census Geocoder in R

The US Census Bureau is kind enough to provide its geocoding service to the public. The service lets you retrieve coordinates for a given postal address, and vice versa. For this post we're going to look at using the batch job endpoint to retrieve coordinates for a list of addresses. I'll be using the curl package in R, which is essentially the nuts and bolts underneath the more popular and easier-to-use httr package.

We'll start by gathering a set of addresses. We'll use the voter rolls from the Vinton County, Ohio Board of Elections.

```r
# Example voter data structure - you would load your actual data here
voters <- data.frame(
  sosidnum = c("12345", "67890", "11111"),
  stnum    = c("123", "456", "789"),
  stname   = c("Main St", "Oak Ave", "Pine Rd"),
  city     = c("Athens", "Nelsonville", "McArthur"),
  zip      = c("45701", "45764", "45651"),
  stringsAsFactors = FALSE
)

# Combine street number and street name into a single address column
voters$street <- paste(voters$stnum, voters$stname)
voters <- voters[, c("sosidnum", "street", "city", "zip")]

# Normalize capitalization in the street and city fields
for (col in c("street", "city")) {
  voters[[col]] <- stringr::str_to_title(voters[[col]])
}

head(voters)
```
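As an aside, the service also exposes a single-address endpoint for one-off lookups. Here's a minimal sketch of that flavor; the example address is hypothetical, and parsing the JSON response assumes the jsonlite package:

```r
library(curl)
library(jsonlite)

# One-off lookup against the single-address endpoint
# (the address below is hypothetical, purely for illustration)
url <- paste0(
  "https://geocoding.geo.census.gov/geocoder/locations/onelineaddress",
  "?address=", curl_escape("100 Main St, McArthur, OH 45651"),
  "&benchmark=Public_AR_Current&format=json"
)
res <- curl_fetch_memory(url)

# Matches come back under result$addressMatches;
# coordinates$x is longitude, coordinates$y is latitude
fromJSON(rawToChar(res$content))$result$addressMatches$coordinates
```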
Now that we've got some address data, let's look at the Census API documentation:
> Geocoding can be accomplished in batch mode with submission of a .CSV, .TXT, .DAT, .XLS, or .XLSX formatted file. The file needs to be included as part of the HTTP request. The file must be formatted in the following way: Unique ID, Street address, City, State, ZIP. If a component is missing from the dataset, it must still retain the delimited format with a null value. Unique ID and Street address are required fields. If there are commas that are part of one of the fields, the whole field needs to be enclosed in quote marks for proper parsing. There is currently an upper limit of 10,000 records per batch file.
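Concretely, each record in the uploaded file ends up looking something like this (values are illustrative; note the quoted field containing a comma, and the empty-but-still-delimited ZIP in the last row):

```
"1","123 Main St","Athens","OH","45701"
"2","456 Oak Ave, Apt 2","Nelsonville","OH","45764"
"3","789 Pine Rd","McArthur","OH",""
```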
So we want to keep each request under 10,000 records, and each request is an uploaded CSV with particular columns. Let's put that into action with our voter registration data.
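Vinton County's voter roll is comfortably below that ceiling, so a single request will do here. For a larger file you'd split it into batches first; a minimal sketch, assuming a data frame df with one row per address:

```r
# Break row indices into chunks of at most 10,000 records
chunks <- split(seq_len(nrow(df)), ceiling(seq_len(nrow(df)) / 10000))

# Each chunk would then be written out and POSTed as shown below
for (idx in chunks) {
  batch <- df[idx, ]
  # ... write `batch` to its own CSV and submit it ...
}
```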
```r
library(curl)

# Extract the relevant columns from the 'voters' dataset
to_send <- voters[, c("sosidnum", "street", "city", "zip")]

# Rename the columns to match the batch format
to_send <- setNames(to_send, c("Unique ID", "Street Address", "City", "Zip"))

# Add a column for the state
to_send$State <- "OH"

# Rearrange the columns into the order the geocoder expects
to_send <- to_send[, c("Unique ID", "Street Address", "City", "State", "Zip")]

# Write the data to a temporary CSV file
t <- tempfile(tmpdir = getwd(), fileext = ".csv")
write.csv(to_send, t, row.names = FALSE)
```
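Before uploading, it doesn't hurt to peek at the file and confirm the layout matches what the documentation asks for:

```r
# Quick sanity check on the file we're about to upload
readLines(t, n = 3)
```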
```r
# Create a handle that attaches the CSV as a multipart form upload
h <- new_handle() |>
  handle_setform(
    addressFile = form_file(t),
    benchmark = "Public_AR_Current"
  )

# Send a POST request to geocode the address batch
x <- curl_fetch_memory(
  "https://geocoding.geo.census.gov/geocoder/locations/addressbatch",
  handle = h
)

# Remove the temporary file
file.remove(t) |> invisible()
```
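One caveat: curl_fetch_memory doesn't raise an error on HTTP failures, it just hands back whatever the server returned. A minimal guard before parsing:

```r
# Anything other than HTTP 200 means the batch wasn't processed,
# and x$content won't contain geocoding results
if (x$status_code != 200) {
  stop("Geocoder request failed with HTTP status ", x$status_code)
}
```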
```r
# Create a new temporary file to hold the response
t <- tempfile(tmpdir = getwd(), fileext = ".csv")

# Write the raw response content out as text
x$content |> rawToChar() |> cat(file = t)

# Read the response into the 'results' data frame
results <- read.csv(
  t, header = FALSE,
  col.names = c("uid", "address", "match", "match_type",
                "matched_address", "latlon", "tigerid", "tigerside")
)

# Remove the temporary file
file.remove(t) |> invisible()

head(results)
```
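The match column gives a quick read on quality: the geocoder flags each record as Match, No_Match, or Tie. Tallying it, and keeping only the matched rows, looks like this:

```r
# Tally match outcomes, then keep only successfully matched records
table(results$match)
matched <- results[results$match == "Match", ]
```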
And that’s that. From there we can use this data for analysis.
```r
# Parse longitude and latitude from the results
# (the geocoder returns coordinates as "longitude,latitude")
results$lon <- apply(results, 1, \(x) {
  as.numeric(strsplit(x[["latlon"]], ",")[[1]][1])
})
results$lat <- apply(results, 1, \(x) {
  as.numeric(strsplit(x[["latlon"]], ",")[[1]][2])
})
```
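Row-wise apply is fine at this scale; if the file were large, an equivalent vectorized version splits the column once instead:

```r
# Equivalent vectorized parse: split each latlon value once, then
# take the pieces; unmatched rows simply come through as NA
parts <- strsplit(as.character(results$latlon), ",")
results$lon <- as.numeric(sapply(parts, `[`, 1))
results$lat <- as.numeric(sapply(parts, `[`, 2))
```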
```r
# Visualize the geocoded addresses; 39163 is the FIPS code for
# Vinton County, Ohio
library(ggplot2)

usmap::plot_usmap(regions = "counties", include = c(39163)) +
  geom_point(data = usmap::usmap_transform(results[!is.na(results$lat), ]),
             aes(x = x, y = y)) +
  labs(title = "Vinton County")
```