library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Read in data and check.
raw_data <- readxl::read_xlsx("190605_example_data.xlsx") %>%
mutate(row_id = 1:nrow(.)) # add a row identifier
raw_data
## # A tibble: 5 x 18
## species latitude longitude temp_1 temp_2 temp_3 temp_4 temp_5 temp_6
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Magnol… 31.0 -91.5 9.05 11.1 15.5 19.6 23.1 26.3
## 2 Magnol… 35.7 -93.2 2.45 4.89 10.2 15.5 19.5 23.8
## 3 Magnol… 35.7 -93.2 2.45 4.89 10.2 15.5 19.5 23.8
## 4 Magnol… 43.2 -76.3 -5.33 -4.55 0.98 7.42 13.7 18.5
## 5 Magnol… 35.6 -92.9 2.45 4.89 10.2 15.5 19.5 23.8
## # … with 9 more variables: temp_7 <dbl>, temp_8 <dbl>, temp_9 <dbl>,
## # temp_10 <dbl>, temp_11 <dbl>, temp_12 <dbl>, valid_cells <dbl>,
## # warmth_index <dbl>, row_id <int>
I’m reshaping the data from wide to long because having it in a tidy long format makes it easier to reason about unique observations and also to use dplyr
to analyse subsets. See the vignette on tidy data for more info. So what I’ve done is pulled the 12 monthly columns into two columns, one indicating the month and one the temperature. I’ve also created a warm
column which is a logical (TRUE/FALSE
) index of months that were warmer than 5^o
reshaped_data <- raw_data %>%
tidyr::gather(key = "month", value = "temp", temp_1:temp_12) %>%
mutate(month = stringr::str_remove(month, "temp_") %>% readr::parse_number(),
warm = case_when(temp > 5 ~ TRUE,
TRUE ~ FALSE))
reshaped_data
## # A tibble: 60 x 9
## species latitude longitude valid_cells warmth_index row_id month temp
## <chr> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
## 1 Magnol… 31.0 -91.5 12 170. 1 1 9.05
## 2 Magnol… 35.7 -93.2 9 123. 2 1 2.45
## 3 Magnol… 35.7 -93.2 9 123. 3 1 2.45
## 4 Magnol… 43.2 -76.3 7 73.8 4 1 -5.33
## 5 Magnol… 35.6 -92.9 9 123. 5 1 2.45
## 6 Magnol… 31.0 -91.5 12 170. 1 2 11.1
## 7 Magnol… 35.7 -93.2 9 123. 2 2 4.89
## 8 Magnol… 35.7 -93.2 9 123. 3 2 4.89
## 9 Magnol… 43.2 -76.3 7 73.8 4 2 -4.55
## 10 Magnol… 35.6 -92.9 9 123. 5 2 4.89
## # … with 50 more rows, and 1 more variable: warm <lgl>
warmth_index
functionNext I wrote a function to calculate warmth index given a logical vector of warmth
and a vector of temperatures temp
.
warmth_index <- function(warm, temp){
warm_months <- sum(warm)
temp_sum <- sum(warm * temp) # when multiplied, the warm logical vector becomes 0 & 1
temp_sum - (5 * warm_months)
}
row_id
Now I can use dplyr
s group_by
and summarise
function to calculate warmth index for each row_id
in a new column called warmth_index_r
.
warmth_index_df <- reshaped_data %>%
group_by(row_id) %>%
summarise(warmth_index_r = warmth_index(warm, temp))
warmth_index_df
## # A tibble: 5 x 2
## row_id warmth_index_r
## <int> <dbl>
## 1 1 170.
## 2 2 123.
## 3 3 123.
## 4 4 73.8
## 5 5 123.
I can now join the results to the original raw_data
.
results <- left_join(raw_data, warmth_index_df, by = "row_id")
You can inspect that the calculation worked as expected by comparing the two warmth_index
columns
results %>% select(warmth_index, warmth_index_r, everything())
## # A tibble: 5 x 19
## warmth_index warmth_index_r species latitude longitude temp_1 temp_2
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 170. 170. Magnol… 31.0 -91.5 9.05 11.1
## 2 123. 123. Magnol… 35.7 -93.2 2.45 4.89
## 3 123. 123. Magnol… 35.7 -93.2 2.45 4.89
## 4 73.8 73.8 Magnol… 43.2 -76.3 -5.33 -4.55
## 5 123. 123. Magnol… 35.6 -92.9 2.45 4.89
## # … with 12 more variables: temp_3 <dbl>, temp_4 <dbl>, temp_5 <dbl>,
## # temp_6 <dbl>, temp_7 <dbl>, temp_8 <dbl>, temp_9 <dbl>, temp_10 <dbl>,
## # temp_11 <dbl>, temp_12 <dbl>, valid_cells <dbl>, row_id <int>
If you prefer, ie if you want to continue using the tidy (long) version of the data, you can join the results to that instead and carry on working. It just repeats the calculated value for each month of the same row_id
.
tidy_results <- left_join(reshaped_data, warmth_index_df, by = "row_id")
tidy_results %>% arrange(row_id)
## # A tibble: 60 x 10
## species latitude longitude valid_cells warmth_index row_id month temp
## <chr> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
## 1 Magnol… 31.0 -91.5 12 170. 1 1 9.05
## 2 Magnol… 31.0 -91.5 12 170. 1 2 11.1
## 3 Magnol… 31.0 -91.5 12 170. 1 3 15.5
## 4 Magnol… 31.0 -91.5 12 170. 1 4 19.6
## 5 Magnol… 31.0 -91.5 12 170. 1 5 23.1
## 6 Magnol… 31.0 -91.5 12 170. 1 6 26.3
## 7 Magnol… 31.0 -91.5 12 170. 1 7 27.6
## 8 Magnol… 31.0 -91.5 12 170. 1 8 27.3
## 9 Magnol… 31.0 -91.5 12 170. 1 9 25.0
## 10 Magnol… 31.0 -91.5 12 170. 1 10 19.7
## # … with 50 more rows, and 2 more variables: warm <lgl>,
## # warmth_index_r <dbl>