Chapter 14 Hurricane Ida
Install packages
Load packages
Download and load Hurricane Ida data, combining all three queries into a single data frame.
# Download the three Hurricane Ida query files and combine them into a single
# data frame, `ida_raw`.
# NOTE(review): `url` is blank in this copy; set it to the base URL that hosts
# the tevent_raw*.RDS files before running.
url <- ""
ida_files <- paste0("tevent_raw", seq_len(3), ".RDS")

ida_parts <- lapply(ida_files, function(file) {
  path <- here("data_private", file)
  # RDS files are binary, so write them in binary mode; text mode can corrupt
  # the download on Windows.
  download.file(paste0(url, file), destfile = path, mode = "wb")
  readRDS(path)
})

# Bind once, starting from an empty data.frame as the original accumulator did,
# instead of growing the data frame inside a loop.
ida_raw <- bind_rows(data.frame(), ida_parts)
Is every status_id unique? Is there any redundancy in the data?
## [1] 504850
## [1] 281981
It appears that there is redundancy. Let’s find the tweets with duplicated status_id values to confirm that the repeats are true duplicates that we can safely discard.
# Find the ten status IDs that occur most often in the combined data.
duplicateIDs <- ida_raw |>
  count(status_id, sort = TRUE) |>
  head(10)

# Show the text of the tweets behind those IDs, most-repeated IDs first, so
# the repeated rows can be compared by eye. Only the first ten rows are shown.
ida_raw |>
  inner_join(duplicateIDs, by = "status_id") |>
  arrange(desc(n), status_id) |>
  select(status_id, text) |>
  head(10)
## status_id
## 1 1433150471464706048
## 2 1433150471464706048
## 3 1433150471464706048
## 4 1433150471464706048
## 5 1433150471464706048
## 6 1433150520181547009
## 7 1433150520181547009
## 8 1433150520181547009
## 9 1433150520181547009
## 10 1433172773766156290
## text
## 1 #StCharles Parish #LA Distribution site #Ida \nWest Bank Bridge Park. \nSeptember 1 12pm-5pm , 8am-5pm from September 2 on\nIce, Water and MREs provided. \nAccess from I-310 to River Rd . Do Not Use the Levee\n
## 2 #StCharles Parish #LA Distribution site #Ida \nWest Bank Bridge Park. \nSeptember 1 12pm-5pm , 8am-5pm from September 2 on\nIce, Water and MREs provided. \nAccess from I-310 to River Rd . Do Not Use the Levee\n
## 3 #StCharles Parish #LA Distribution site #Ida \nWest Bank Bridge Park. \nSeptember 1 12pm-5pm , 8am-5pm from September 2 on\nIce, Water and MREs provided. \nAccess from I-310 to River Rd . Do Not Use the Levee\n
## 4 #StCharles Parish #LA Distribution site #Ida \nWest Bank Bridge Park. \nSeptember 1 12pm-5pm , 8am-5pm from September 2 on\nIce, Water and MREs provided. \nAccess from I-310 to River Rd . Do Not Use the Levee\n
## 5 #StCharles Parish #LA Distribution site #Ida \nWest Bank Bridge Park. \nSeptember 1 12pm-5pm , 8am-5pm from September 2 on\nIce, Water and MREs provided. \nAccess from I-310 to River Rd . Do Not Use the Levee\n
## 6 @cjfaison Same here in Pennsylvania. Ida is kicking our butts with all this heavy rain and tornado warnings.
## 7 @cjfaison Same here in Pennsylvania. Ida is kicking our butts with all this heavy rain and tornado warnings.
## 8 @cjfaison Same here in Pennsylvania. Ida is kicking our butts with all this heavy rain and tornado warnings.
## 9 @cjfaison Same here in Pennsylvania. Ida is kicking our butts with all this heavy rain and tornado warnings.
## 10 I want the whole video and the follow up. Was the cow ok? Watch Workers Rescue A Cow Trapped In A Tree After Hurricane Ida
It looks like we can keep the first instance of every unique status ID.
Passing `status_id` to `distinct()` keeps one row for each unique status_id value, but on its own it would return only that one column. Adding `.keep_all = TRUE` keeps all of the columns, using the values from the first instance of each status_id.
Check the number of rows and number of distinct rows in each data frame
## [1] 281981
## [1] 281981
Now all the tweets are unique! Save this combined and unique dataset.
Load Ida merged data
Which profiles should we geocode? Unnest geo_coords first, so that we can avoid geocoding tweets that already have coordinates.
Then, filter for tweets with no geo_coords and insufficiently precise place types.
# Count the free-text profile `location` strings of the tweets that still need
# geocoding, most frequent first.
locations <- ida_merged |>
# NOTE(review): this filter() call is incomplete as shown (its condition and
# closing parenthesis are missing). Per the accompanying text it should keep
# tweets with no geo_coords; restore the condition before running.
filter( |>
# Drop tweets whose place_type is already "poi", "neighborhood", or "city".
filter(!(place_type %in% c("poi", "neighborhood", "city"))) |>
select(location) |>
count(location, sort = TRUE)
# Preview the 20 most common location strings.
head(locations, 20)
## # A tibble: 20 × 2
## location n
## <chr> <int>
## 1 United States 13033
## 2 New Orleans, LA 11230
## 3 New York, NY 6443
## 4 Washington, DC 5969
## 5 USA 5051
## 6 Baton Rouge, LA 3784
## 7 Houston, TX 3764
## 8 Louisiana, USA 3688
## 9 Atlanta, GA 3188
## 10 Florida, USA 2591
## 11 New York 2558
## 12 New Orleans 2540
## 13 New York City 2445
## 14 New York, USA 2427
## 15 Philadelphia, PA 2354
## 16 Chicago, IL 2304
## 17 Miami, FL 2304
## 18 California, USA 2131
## 19 Dallas, TX 1956
## 20 New Jersey, USA 1883
There is a lot of junk in the locations data, but it appears that there is also a lot of good data. There is also a pattern in the good data: a city followed by a comma and a state or country. Let’s first strip country suffixes such as “, USA” so that a trailing country is not mistaken for a state, and then keep only the locations that still contain a comma.
# Clean the free-text profile locations into `city` and `state` columns, then
# count the distinct city/state pairs that need geocoding.
locationsclean <- locations |>
# Lower-case the text, strip country suffixes, and normalise a few spellings.
mutate(locclean = tolower(location),
# NOTE(review): the dots are unescaped wildcards, the trailing "a?" is
# optional, and the pattern is not anchored to the end of the string, so this
# can also remove text that is not "usa" / "u.s.a" — confirm that is acceptable.
locclean = str_remove(locclean, "[ ,]+u.?s.?a?"),
locclean = str_remove(locclean, "[ ,]+united states"),
locclean = str_remove(locclean, "[ ,]+canada"),
# NOTE(review): in a regex the parentheses form a group, so this removes the
# first "usa" substring anywhere (e.g. inside "sausalito"), not the literal
# text "(usa)". Use fixed("(usa)") if the literal was intended.
locclean = str_remove(locclean, "(usa)"),
# locclean was already lower-cased in the first step of this mutate().
locclean = tolower(locclean),
# NOTE(review): unescaped dots match any character, so "d.c." matches any
# four-character run of the form d?c?; fixed("d.c.") would match it literally.
locclean = str_replace(locclean, "d.c.", "dc"),
# NOTE(review): the replacement drops the comma and space, so
# "brooklyn, new york" becomes "brooklynny" and then fails the comma filter
# below. Confirm whether ", ny" was intended here and on the next line.
locclean = str_replace(locclean, ", new york", "ny"),
locclean = str_replace(locclean, ", nyc", "ny")) |>
# Keep only locations that still contain a comma.
filter(str_detect(locclean, ",")) |>
# Split on commas into columns split_1, split_2, split_3, ...
mutate(split = str_split(locclean, ",")) |>
unnest_wider(split, names_sep="_") |>
mutate(split_3 = str_trim(split_3),
split_1 = str_trim(split_1),
split_2 = str_trim(split_2),
# NOTE(review): the next two ifelse() calls are incomplete as shown (the first
# half of each condition and some result arguments are missing). They appear to
# choose which split_* pieces become city and state depending on whether
# split_3 is a two-character string — restore from the original source.
city = ifelse( | !(str_length(split_3) == 2),
state = ifelse( | !(str_length(split_3) == 2),
split_3)) |>
# NOTE(review): this filter() is also incomplete as shown (its first condition
# and the left-hand side of `|` are missing).
filter(, | str_length(split_3) == 2) |>
# Drop rows whose state is one of these country names.
filter(!state %in% c("canada",
"estados unidos"
)) |>
# Remove digits and punctuation from state, and digits from city.
mutate(state = str_remove_all(state, "[:digit:]"),
state = str_trim(str_remove_all(state, "[:punct:]")),
city = str_trim(str_remove_all(city, "[:digit:]"))) |>
# Keep cities of at least three characters whose state is in the allowed list.
# NOTE(review): both tolower() calls have lost their arguments as shown;
# presumably these were vectors of state abbreviations and names — confirm.
filter(str_length(city) >=3,
state %in% c(tolower(, tolower(, "dc", "puerto rico", "pr"))
# One row per distinct city/state pair, most frequent first.
locations_to_code <- locationsclean |> count(city, state, sort = TRUE)
Try to geocode and save the results (so that this process never needs to run again).
# Geocode each distinct city/state pair, then cache the result on disk so the
# geocoding step does not have to be repeated.
locations_coded <- locations_to_code |>
  geocode(city = city, state = state)
saveRDS(locations_coded, here("data_public", "geocodedLocations.RDS"))
Load geocoding results.
Join geocoded locations back to location text and save as profile_geocodes.RDS
# Match every cleaned profile location to its geocoded city/state pair and keep
# only the original location text plus its coordinates, renamed prof_lat and
# prof_long.
locations_xy <- left_join(
  locationsclean,
  locations_coded,
  by = c("city", "state")
) |>
  select(location, prof_lat = lat, prof_long = long)

# Save the location-to-coordinate lookup table.
saveRDS(locations_xy, here("data_public", "profile_geocodes.RDS"))
Join geocoded locations back to tweets
# Attach the profile-derived coordinates (prof_lat, prof_long) to each tweet by
# matching on the free-text `location` field.
# NOTE(review): this joins onto `ida_unique`, whereas the locations were
# derived from `ida_merged` — confirm which data frame is intended.
ida_merged_geo <- ida_unique |>
left_join(locations_xy, by = "location")
# NOTE(review): the filter() condition below is incomplete as shown (only `!`
# remains); restore it before running. The pipeline ends in a row count.
ida_merged_geo |> filter(! |> nrow()
## [1] 157579
Now, the Hurricane Ida data can be mapped with geographic coordinates, place name, or users’ location name, so long as the place is more specific than a county and the name has a city and state separated by a comma.