library(gregbio)
<- blev_født(danmark) %>% my_life
library(gregbio)
my_life <- blev_født(danmark) %>%
voksede_op(USA,
state = “Pennsylvania”,
city = “Philadelphia) %>%
undergrad_degree(film_major) %>%
PhD(education_policy) %>%
career = case_when(
job = faculty ~ FIU, (set_location as Miami, FL),
job = data_analyst ~ UC Berkeley & SFSU,
(set_location as SF Bay Area)
library(gregbio)
my_life <- blev_født(danmark) %>%
voksede_op(USA,
state = “Pennsylvania”,
city = “Philadelphia) %>%
undergrad_degree(film_major) %>%
PhD(education_policy) %>%
career = case_when(
job = faculty ~ FIU, (set_location as Miami, FL),
job = data_analyst ~ UC Berkeley & SFSU,
(set_location as SF Bay Area),
job = career_swerve1 ~ freelance_ESL_teacher,
(set_location as Lyon, FR)
library(gregbio)
my_life <- blev_født(danmark) %>%
voksede_op(USA,
state = “Pennsylvania”,
city = “Philadelphia) %>%
undergrad_degree(film_major) %>%
PhD(education_policy) %>%
career = case_when(
job = faculty ~ FIU, (set_location as Miami, FL),
job = data_analyst ~ UC Berkeley & SFSU,
(set_location as SF Bay Area),
job = career_swerve1 ~ freelance_ESL_teacher,
(set_location as Lyon, FR)
job = career_swerve2 ~
study_abroad_student_services,
(set_location as København, DK)))
rStrava
package.DataExplorer
package.gt
gt
tables next to each other.ggplot
code to make some pretty charts.sträva
rStrava
packagelibrary(tidyverse) # to do tidyverse things
library(tidylog) # to get a log of what's happening to the data
library(janitor) # tools for data cleaning
strava_activities <- readr::read_csv("data/activities.csv") %>%
clean_names() %>%
as_tibble() %>%
rename(elapsed_time = elapsed_time_6, distance = distance_7, max_heart_rate = max_heart_rate_8,
relative_effort = relative_effort_9, commute = commute_10, elapsed_time2 = elapsed_time_16,
distance2 = distance_18, relative_effort2 = relative_effort_38, commute2 = commute_51)
mutate(activity_date = str_replace(activity_date, "Jan ", "January "))
separate('activity_date', paste("date", 1:3, sep="_"), sep=",", extra="drop")
mutate(activity_md = str_trim(date_1))
separate('activity_md', paste("activity_md", 1:2, sep="_"), sep=" ", extra="drop")
mutate(activity_mdy = paste0(date_1, ",", date_2))
mutate(activity_ymd = lubridate::mdy(activity_mdy))
mutate(activity_tz = case_when(activity_ymd >= "2022-06-28" ~ "Europe/Copenhagen",
TRUE ~ "US/Pacific"))
rStrava
package - AuthorizationIn a separate file called stoken.r, I create the access token to call in the script that pulls the data.
library(rStrava)
app_name <- 'Year of Riding Danishly' # chosen by user
app_client_id <- '---' # an integer, assigned by Strava
app_secret <- '---' # an alphanumeric secret, assigned by Strava
# create the authentication token - cache = TRUE saves it in the working directory
stoken <- httr::config(token = strava_oauth(
app_name, app_client_id, app_secret,
app_scope="activity:read_all", cache = TRUE))
stoken object is a list:
rStrava
package - Get the data## call the OAuth access token
stoken <- httr::config(token = readRDS('.httr-oauth')[[1]])
## invoke stoken to get data
myact <- get_activity_list(stoken)
Returns a list of activities requested
rStrava
package - Get the data act_data <- compile_activities(myact) %>%
as_tibble() %>%
glimpse()
act_data <- compile_activities(myact) %>%
as_tibble() %>%
mutate(gear_name = case_when(gear_id == "b6298198" ~ "Univega",
gear_id == "b11963967" ~ "Commute bike", TRUE ~ "Not a bike ride")) %>%
mutate(activity_date = lubridate::as_datetime(start_date_local)) %>%
mutate(activity_date_p = as.Date(start_date_local)) %>%
mutate(activity_year = lubridate::year(start_date_local),
activity_month = lubridate::month(start_date_local),
activity_month_t = lubridate::month(start_date_local, label = TRUE, abbr = FALSE),
activity_day = lubridate::day(start_date_local),
activity_md = paste0(activity_month_t, " ", activity_day),
activity_wday = wday(activity_date_p, label = TRUE, abbr = FALSE),
activity_hour = lubridate::hour(activity_date),
activity_min = lubridate::minute(activity_date),
activity_hmt = paste0(activity_hour, ":", activity_min),
activity_hm = hm(activity_hmt),
moving_time_hms = hms::hms(moving_time),
elapsed_time_hms = hms::hms(elapsed_time)) %>%
mutate(location_country = case_when(
timezone == "(GMT+01:00) Europe/Copenhagen" ~ "Denmark",
timezone == "(GMT+01:00) Europe/Paris" ~ "France", TRUE ~ "United States")) %>%
## random edits
mutate(commute = ifelse((activity_year == 2023 & activity_md == "June 14" & name == "Morning commute"),
TRUE, commute))
## merge with variables from CSV but not in API
act_data_csv_ext <- act_data_csv %>%
select(activity_id, calories, average_grade, max_grade, average_elapsed_speed, elevation_loss)
strava_activities_final <- act_data %>%
merge(act_data_csv_ext) %>%
select(activity_id:max_speed, average_elapsed_speed, elevation_gain, elevation_loss, elevation_high, elevation_low,
average_grade, max_grade, location_country:lng_end, average_watts, calories, kilojoules)
library(DataExplorer) # for EDA
plot_intro(strava_data)
plot_missing(strava_data)
Some are obvious (distance in km), these maybe less so…
strava_data %>%
select(distance_km, elapsed_time, moving_time, max_speed, average_speed, elevation_gain, elevation_loss, elevation_low,
elevation_high, average_grade, max_grade, average_watts, calories, kilojoules) %>%
filter(!is.na(average_watts)) %>%
filter(!is.na(calories)) %>%
plot_correlation(maxcat = 5L, type = "continuous", geom_text_args = list("size" = 4))
So what do we see here?
Most of the relationships are positive, some with expectedly near 1:1 relationships, such as distance (in km) and total time for the ride.
Average speed is positively correlated with distance but the relationship is only at 0.14, the weakest of all positive associations with distance. Average speed correlations are low…near 0, for total elevation gain and negative the higher the average grade of the ride.
Averge watts, or weighted power output for the ride, has mostly negative correlations. Longer rides in time or distance meant a lower average power per ride segment.
We’ll keep these correlations in mind when looking at the scatterplots and then later considering the regression results.
DataExplorer
has functionality for scatterplots, but each call only allows for one comparison y-axis variable & not much customization to the output, like a smoothing line.plot_scatterplot(strava_data_filter, by = "distance_km", nrow = 6L)
plot_scatter_lm <- function(data, var1, var2, pointsize = 2, transparency = .5, color = "") {
## check if inputs are valid
if (!exists(substitute(data))) stop("data needs to be a data frame.")
if (!is.data.frame(data)) stop("data needs to be a data frame.")
if (!is.numeric(pull(data[var1]))) stop("Column var1 needs to be of type numeric, passed as string.")
if (!is.numeric(pull(data[var2]))) stop("Column var2 needs to be of type numeric, passed as string.")
if (!is.numeric(pointsize)) stop("pointsize needs to be of type numeric.")
if (!is.numeric(transparency)) stop("transparency needs to be of type numeric.")
if (color != "") { if (!color %in% names(data)) stop("Column color needs to be a column of data, passed as string.") }
g <-
ggplot(data, aes(x = !!sym(var1), y = !!sym(var2))) +
geom_point(aes(color = !!sym(color)), size = pointsize, alpha = transparency) +
geom_smooth(aes(color = !!sym(color), color = after_scale(prismatic::clr_darken(color, .3))),
method = "lm", se = FALSE) +
theme_minimal() +
theme(panel.grid.minor = element_blank(), legend.position = "top")
if (color != "") {
if (is.numeric(pull(data[color]))) {
g <- g + scale_color_viridis_c(direction = -1, end = .85) +
guides(color = guide_colorbar(
barwidth = unit(12, "lines"), barheight = unit(.6, "lines"), title.position = "top"))
} else {g <- g + scale_color_brewer(palette = "Set2")}
}
return(g)
}
## 1st plot call - distance as y axis
patchwork::wrap_plots(
map2(c("elapsed_time", "moving_time", "average_speed","average_watts", "calories", "kilojoules"),
c("distance_km", "distance_km", "distance_km", "distance_km", "distance_km", "distance_km"),
~plot_scatter_lm(data = strava_activities_rides, var1 = .x, var2 = .y, pointsize = 3.5) +
theme(plot.margin = margin(rep(15, 4)))))
Confirms what we saw in the correlation heatmap & displays ride distributions.
Positive and almost 1:1 relationships between distance and both time measures, elapsed and moving.
Negative association with watts that we saw in the correlations. Making a note to take a closer look at how much an effect watts has later on in the regression section.
Note the outlier ride of 60km and an elapsed time of more than 15,000 seconds…more about that one later.
gt
library(gt)
sumtable %>%
select(rides, km_total, elev_total, time_total1, time_total2, cal_total, kiloj_total) %>%
gt() %>%
fmt_number(columns = c(km_total, elev_total, cal_total, kiloj_total), decimals = 0) %>%
cols_label(rides = "Total Rides", km_total = "Total Kilometers",
elev_total = md("Total Elevation *(meters)*"),
time_total1 = md("Total Time *(hours/min/sec)*"),
time_total2 = md("Total Time *(days/hours/min/sec)*"),
cal_total = "Total Calories", kiloj_total = "Total Kilojoules") %>%
cols_align(align = "center", columns = everything()) %>%
tab_style(style = cell_fill(color = "grey"), locations = cells_body(rows = seq(1, 1, 1))) %>%
tab_style( style = cell_text(align = "center"), locations = cells_column_labels(
columns = c(rides, km_total, elev_total, time_total1, time_total2, cal_total, kiloj_total))) %>%
tab_header(title = md("My Year of Riding Danishly<br>*Ride Totals*"))
gt
ggplot2
library(patchwork)
ridesplot1 <- rides_mth_type %>%
ggplot(aes(activity_month_abbv, rides_by_month)) +
geom_col(fill = "#C8102E") +
geom_text(aes(label= rides_by_month), color = "white", size = 5, vjust = 1.5) +
labs(x = "", y = "", title = "Spring & Summer Weather = More Rides",
subtitle = glue::glue("*Average Rides / Month = {round(mean(rides_mth_type$rides_by_month, 3))}*")) +
theme_minimal() +
theme(panel.grid = element_blank(), plot.title = element_text(hjust = 0.5),
plot.subtitle = element_markdown(hjust = 0.5),
axis.text.y = element_blank())
ridesplot2 <- rides_mth_type %>%
group_by(ride_type) %>%
mutate(rides_by_type = sum(ride_type_n)) %>%
ungroup() %>%
distinct(rides_by_type, .keep_all = TRUE) %>%
mutate(ride_type_pct = rides_by_type / sum(rides_by_type)) %>%
{. ->> tmp} %>%
ggplot(aes(ride_type, ride_type_pct)) +
geom_col(fill = "#C8102E") +
scale_x_discrete(labels = paste0(tmp$ride_type, "<br>Total Rides = ", tmp$rides_by_type, "")) +
geom_text(data = subset(tmp, ride_type != "Workout"),
aes(label= scales::percent(round(ride_type_pct, 2))), color = "white", size = 5, vjust = 1.5) +
geom_text(data = subset(tmp, ride_type == "Workout"),
aes(label= scales::percent(round(ride_type_pct, 2))), color = "#C8102E", size = 5, vjust = -.5) +
labs(x = "", y = "", title = "Lots of Riding to Work or Danish Class") +
theme_minimal() +
theme(panel.grid = element_blank(), plot.title = element_text(hjust = 0.5),
axis.text.y = element_blank(), axis.text.x = element_markdown())
rm(tmp)
ridesplot1 + ridesplot2
ggplot2
ggplot2
rides_mth_type %>%
ggplot(aes(activity_month_t, ride_type_pct, fill = ride_type)) +
geom_bar(stat = "identity") +
geom_text(data = subset(rides_mth_type, ride_type != "Workout"),
aes(label = scales::percent(round(ride_type_pct, 2))),
position = position_stack(vjust = 0.5), color= "white", vjust = 1, size = 5) +
labs(x = "", y = "", title = "Most Rides Each Month Were Commutes to/from Work or Danish Class") +
scale_fill_manual(values = c("#0072B2", "#E69F00", "#CC79A7"), labels = c("Commute/<br>Studieskolen", "Other", "Workout")) +
theme_minimal()+
theme(legend.position = "bottom", legend.spacing.x = unit(0, 'cm'),
legend.text = element_markdown(),
legend.key.width = unit(1.5, 'cm'), legend.title = element_blank(),
axis.text.y = element_blank(), plot.title = element_text(hjust = 0.5),
panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
guides(fill = guide_legend(label.position = "bottom"))
ggplot2
ggplot2
ggplot2
strava_data %>%
filter(activity_year == 2023) %>%
count(ride_type, activity_hour) %>%
{. ->> tmp} %>%
ggplot(aes(activity_hour, y = n, fill = ride_type)) +
geom_bar(stat = "identity") +
scale_x_continuous(limits = c(0, 24), breaks = seq(0, 24)) +
geom_text(data = subset(tmp, ride_type == "Commute/Studieskolen" & n > 20),
aes(label= n), color = "white", size = 4) +
coord_polar(start = 0) +
theme_minimal() +
scale_fill_manual(values = c("#0072B2", "#E69F00", "#CC79A7"), labels = c("Commute/<br>Studieskolen", "Other", "Workout")) +
labs(x = "", y = "",
title = "Most Rides During Morning and Evening Commuting Hours",
subtitle = "*Numbers Correspond to Hour of Day on a 24 hr clock*") +
theme(legend.text = element_markdown(),
axis.text.y = element_blank(),
legend.title = element_blank(),
plot.title = element_text(size = 10, hjust = 0.5),
plot.subtitle = element_markdown(hjust = 0.5, size = 9))
rm(tmp)
activty_ampm %>%
ggplot(aes(activity_min, y = activity_min_n, fill = ampm)) +
geom_col(position = position_stack(reverse = TRUE)) +
scale_x_continuous(limits = c(-1, 60), breaks = seq(0, 59), labels = seq(0, 59)) +
geom_text(data = subset(activty_ampm, activity_min_n > 5),
aes(label= activity_min_n), color = "white", size = 4, position = position_nudge(y = -1)) +
coord_polar(start = 0) +
theme_minimal() +
scale_fill_manual(values = c("#E57A77", "#7CA1CC"), labels = c("AM", "PM")) +
labs(x = "", y = "",
title = "Most Morning Rides Started Between 12 & 30 Past the Hour <br>
Evening Rides More Evenly Spaced Through the Hour",
subtitle = "*Numbers Correspond to Minutes of the Hour*") +
theme(legend.text = element_markdown(),
axis.text.y = element_blank(),
legend.title = element_blank(),
plot.title = element_markdown(size = 10, hjust = 0.5),
plot.subtitle = element_markdown(hjust = 0.5, size = 9))
modelsummary
After the EDA and charts & tables, time to run some regressions to see the effect of the variables on explaining how long a ride took, how many average watts I was producing, and how much energy I burned.
The modelsummary
package is very helpful for wrapping a bunch of functions together to help display regression statistics.
Good support for running multiple models…pass the models into a list, and modelsummary
does the rest when you call the object.
library(modelsummary)
strava_models <- strava_data %>%
filter(activity_year == 2023)
ride_models <- list(
"Time" = lm(moving_time ~ distance_km + average_speed + elevation_gain + average_grade + average_watts,
data = strava_models),
"Watts" = lm(average_watts ~ moving_time + distance_km +average_speed + elevation_gain + average_grade + kilojoules,
data = strava_models),
"Kilojoules" = lm(kilojoules ~ moving_time + distance_km +average_speed + elevation_gain + average_grade + average_watts,
data = strava_models))
modelsummary(ride_models, stars = TRUE, gof_omit = "IC|Adj|F|RMSE|Log", output = "gt")
modelplot(ride_models, coef_omit = "Interc")
In the time model, each extra kilometer per ride added almost 190 seconds, or 3 minutes, to the ride. Each km/hour or average speed took about 2 minutes off a ride. It’s a bit counter-intuitive however that steeper average grades resulted in shorter rides. I’d need to dig deeper into the data to see why that might be happening.
For the watts model the largest effect size was average_grade, which makes sense…the steeper the ride the more power I needed to do it. Though oddly, grade had a negative effect on kilojoules burned.
Overall the models were robust, each explaining well over 80% of variance.
But before we’re fully satisfied with the models, let’s do a quick check for colinearity…
car
# create the data objects for the models
colin_time <- stack(car::vif(ride_models$time))
colin_time <- stack(car::vif(ride_models$time))
colin_watts <- stack(car::vif(ride_models$watts))
colin_joules <- stack(car::vif(ride_models$kilojoules))
# show the results using gt (only showing the call for time model)
colin_time %>%
gt() %>%
tab_header(title = "Colinearity - Time Model")
There’s some problematic colinearity with moving_time and distance_km in the watts and kilojoules models, so let’s redo models removing the variables with most colinearity.