R Process for Cyclistic's Bike-Share Project

By Carol Addassi

Abstract

These are the cleaning and analysis steps in R.

Date

August 10, 2021

Time

12:00 AM

Cyclistic_Exercise_Full_Year_Analysis

This analysis is based on the Divvy case study “Sophisticated, Clear, and Polished’: Divvy and Data Visualization” written by Kevin Hartman. The purpose of this script is to consolidate downloaded Divvy data into a single dataframe and then conduct simple analysis to help answer the key question: “In what ways do members and casual riders use Divvy bikes differently?”

Load required packages

library(tidyverse) #helps wrangle data library(lubridate) #helps wrangle date attributes library(ggplot2) #helps visualize data
library(openxlsx) #read in xlsx data files getwd() #displays your working directory setwd("/Users/caroladdassi/RProjects/Case_Study_Cyclistics/data/e_xlsx") #sets your working directory to simplify calls to data … make sure to use your OWN username instead of mine.)

#=====================

Step 1: Collect Data

#=====================

Upload datasets (xlsx files transformed in Excel) here

m1_07_2020 <- read.xlsx(“D2020_07_e.xlsx”) m2_08_2020 <- read.xlsx(“D2020_08_e.xlsx”) m3_09_2020 <- read.xlsx(“D2020_09_e.xlsx”) m4_10_2020 <- read.xlsx(“D2020_10_e.xlsx”) m5_11_2020 <- read.xlsx(“D2020_11_e.xlsx”) m6_12_2020 <- read.xlsx(“D2020_12_e.xlsx”) m7_01_2021 <- read.xlsx(“D2021_01_e.xlsx”) m8_02_2021 <- read.xlsx(“D2021_02_e.xlsx”) m9_03_2021 <- read.xlsx(“D2021_03_e.xlsx”) m10_04_2021 <- read.xlsx(“D2021_04_e.xlsx”) m11_05_2021 <- read.xlsx(“D2021_05_e.xlsx”) m12_06_2021 <- read.xlsx(“D2021_06_e.xlsx”)

#====================================================

Step 2: Wrangle Data and Combine Into a Single File

#====================================================

Compare column names each of the files

While the names don’t have to be in the same order, they DO need to match perfectly before we can use a command to join them into one file

colnames(m1_07_2020) colnames(m2_08_2020) colnames(m3_09_2020) colnames(m4_10_2020) colnames(m5_11_2020) colnames(m6_12_2020) colnames(m7_01_2021) colnames(m8_02_2021) colnames(m9_03_2021) colnames(m10_04_2021) colnames(m11_05_2021) colnames(m12_06_2021)

Rename columns to make them consisent with q1_2020 (as this will be the supposed going-forward table design for Divvy. Turns out it wasn’t as the data I’m using is well after q1_2020.)

Note there are currently 3 additonal columns in my files after some data transforming in Excel.

(m1_07_2020 <- rename(m1_07_2020 ,trip_id = ride_id ,bikeid = rideable_type ,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name ,from_station_id = start_station_id ,to_station_name = end_station_name ,to_station_id = end_station_id ,usertype = member_casual))

(m2_08_2020 <- rename(m2_08_2020 ,trip_id = ride_id ,bikeid = rideable_type ,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name ,from_station_id = start_station_id ,to_station_name = end_station_name ,to_station_id = end_station_id ,usertype = member_casual))

(m3_09_2020 <- rename(m3_09_2020 ,trip_id = ride_id ,bikeid = rideable_type ,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name ,from_station_id = start_station_id ,to_station_name = end_station_name ,to_station_id = end_station_id ,usertype = member_casual))

(m4_10_2020 <- rename(m4_10_2020 ,trip_id = ride_id ,bikeid = rideable_type ,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name ,from_station_id = start_station_id ,to_station_name = end_station_name ,to_station_id = end_station_id ,usertype = member_casual))

(m5_11_2020 <- rename(m5_11_2020 ,trip_id = ride_id ,bikeid = rideable_type ,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name ,from_station_id = start_station_id ,to_station_name = end_station_name ,to_station_id = end_station_id ,usertype = member_casual))

(m6_12_2020 <- rename(m6_12_2020 ,trip_id = ride_id ,bikeid = rideable_type ,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name ,from_station_id = start_station_id ,to_station_name = end_station_name ,to_station_id = end_station_id ,usertype = member_casual))

(m7_01_2021 <- rename(m7_01_2021 ,trip_id = ride_id ,bikeid = rideable_type ,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name ,from_station_id = start_station_id ,to_station_name = end_station_name ,to_station_id = end_station_id ,usertype = member_casual))

(m8_02_2021 <- rename(m8_02_2021 ,trip_id = ride_id ,bikeid = rideable_type ,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name ,from_station_id = start_station_id ,to_station_name = end_station_name ,to_station_id = end_station_id ,usertype = member_casual))

(m9_03_2021 <- rename(m9_03_2021 ,trip_id = ride_id ,bikeid = rideable_type ,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name ,from_station_id = start_station_id ,to_station_name = end_station_name ,to_station_id = end_station_id ,usertype = member_casual))

(m10_04_2021 <- rename(m10_04_2021 ,trip_id = ride_id ,bikeid = rideable_type ,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name ,from_station_id = start_station_id ,to_station_name = end_station_name ,to_station_id = end_station_id ,usertype = member_casual))

(m11_05_2021 <- rename(m11_05_2021 ,trip_id = ride_id ,bikeid = rideable_type ,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name ,from_station_id = start_station_id ,to_station_name = end_station_name ,to_station_id = end_station_id ,usertype = member_casual))

(m12_06_2021 <- rename(m12_06_2021 ,trip_id = ride_id ,bikeid = rideable_type ,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name ,from_station_id = start_station_id ,to_station_name = end_station_name ,to_station_id = end_station_id ,usertype = member_casual))

Inspect the dataframes and look for incongruencies

str(m1_07_2020) str(m2_08_2020) str(m3_09_2020) str(m4_10_2020) str(m5_11_2020) str(m6_12_2020) str(m7_01_2021) str(m8_02_2021) str(m9_03_2021) str(m10_04_2021) str(m11_05_2021) str(m12_06_2021)

Convert ride_id and rideable_type to character so that they can stack correctly

Convert trip_id and bike_type to character so that they can stack correctly

m1_07_2020 <- mutate(m1_07_2020, trip_id = as.character(trip_id) ,bikeid = as.character(bikeid)) m2_08_2020 <- mutate(m2_08_2020, trip_id = as.character(trip_id) ,bikeid = as.character(bikeid)) m3_09_2020 <- mutate(m3_09_2020, trip_id = as.character(trip_id) ,bikeid = as.character(bikeid)) m4_10_2020 <- mutate(m4_10_2020, trip_id = as.character(trip_id) ,bikeid = as.character(bikeid)) m5_11_2020 <- mutate(m5_11_2020, trip_id = as.character(trip_id) ,bikeid = as.character(bikeid)) m6_12_2020 <- mutate(m6_12_2020, trip_id = as.character(trip_id) ,bikeid = as.character(bikeid)) m7_01_2021 <- mutate(m7_01_2021, trip_id = as.character(trip_id) ,bikeid = as.character(bikeid)) m8_02_2021 <- mutate(m8_02_2021, trip_id = as.character(trip_id) ,bikeid = as.character(bikeid)) m9_03_2021 <- mutate(m9_03_2021, trip_id = as.character(trip_id) ,bikeid = as.character(bikeid)) m10_04_2021 <- mutate(m10_04_2021, trip_id = as.character(trip_id) ,bikeid = as.character(bikeid)) m11_05_2021 <- mutate(m11_05_2021, trip_id = as.character(trip_id) ,bikeid = as.character(bikeid)) m12_06_2021 <- mutate(m12_06_2021, trip_id = as.character(trip_id) ,bikeid = as.character(bikeid))

Convert ride_length to double so that they can stack correctly

m1_07_2020 <- mutate(m1_07_2020, ride_length = as.double(ride_length))
m2_08_2020 <- mutate(m2_08_2020, ride_length = as.double(ride_length))
m3_09_2020 <- mutate(m3_09_2020, ride_length = as.double(ride_length))
m4_10_2020 <- mutate(m4_10_2020, ride_length = as.double(ride_length))
m5_11_2020 <- mutate(m5_11_2020, ride_length = as.double(ride_length))
m6_12_2020 <- mutate(m6_12_2020, ride_length = as.double(ride_length))
m7_01_2021 <- mutate(m7_01_2021, ride_length = as.double(ride_length))
m8_02_2021 <- mutate(m8_02_2021, ride_length = as.double(ride_length))
m9_03_2021 <- mutate(m9_03_2021, ride_length = as.double(ride_length))
m10_04_2021 <- mutate(m10_04_2021, ride_length = as.double(ride_length))
m11_05_2021 <- mutate(m11_05_2021, ride_length = as.double(ride_length))
m12_06_2021 <- mutate(m12_06_2021, ride_length = as.double(ride_length))

Convert from_station_id and to_station_id to double so that they can stack correctly.

m1_07_2020 <- mutate(m1_07_2020, from_station_id = as.double(from_station_id) ,to_station_id = as.double(to_station_id))
m2_08_2020 <- mutate(m2_08_2020, from_station_id = as.double(from_station_id) ,to_station_id = as.double(to_station_id))
m3_09_2020 <- mutate(m3_09_2020, from_station_id = as.double(from_station_id) ,to_station_id = as.double(to_station_id))
m4_10_2020 <- mutate(m4_10_2020, from_station_id = as.double(from_station_id) ,to_station_id = as.double(to_station_id))
m5_11_2020 <- mutate(m5_11_2020, from_station_id = as.double(from_station_id) ,to_station_id = as.double(to_station_id))
m6_12_2020 <- mutate(m6_12_2020, from_station_id = as.double(from_station_id) ,to_station_id = as.double(to_station_id))
m7_01_2021 <- mutate(m7_01_2021, from_station_id = as.double(from_station_id) ,to_station_id = as.double(to_station_id))
m8_02_2021 <- mutate(m8_02_2021, from_station_id = as.double(from_station_id) ,to_station_id = as.double(to_station_id))
m9_03_2021 <- mutate(m9_03_2021, from_station_id = as.double(from_station_id) ,to_station_id = as.double(to_station_id))
m10_04_2021 <- mutate(m10_04_2021, from_station_id = as.double(from_station_id) ,to_station_id = as.double(to_station_id))
m11_05_2021 <- mutate(m11_05_2021, from_station_id = as.double(from_station_id) ,to_station_id = as.double(to_station_id))
12_06_2021 <- mutate(m12_06_2021, from_station_id = as.double(from_station_id) ,to_station_id = as.double(to_station_id))

Stack individual month’s data frames into one big data frame

all_trips <- bind_rows(m1_07_2020, m2_08_2020, m3_09_2020, m4_10_2020, m5_11_2020, m6_12_2020, m7_01_2021, m8_02_2021, m9_03_2021, m10_04_2021, m11_05_2021, m12_06_2021)

#======================================================

Step 3: Clean Up and Add Data to Prepare for Analysis

#======================================================

Inspect the new table that has been created

Good to do this again after doing it on the individual files in Excel.

colnames(all_trips) #List of column names
nrow(all_trips) #How many rows are in data frame?
dim(all_trips) #Dimensions of the data frame?
head(all_trips) #See the first 6 rows of data frame. Also tail(qs_raw)
str(all_trips) #See list of columns and data types (numeric, character, etc)
summary(all_trips) #Statistical summary of data. Mainly for numerics

There are a few problems we may need to fix:

all_trips %>% group_by(usertype) %>% summarise()

This yields only the 2 user types our analysis concerns: members and casual riders.

table(all_trips$usertype)

This shows there are 2529695 members and 1928711 casual riders which equal the total number of observations in the file, 4458406.

(1) The data can only be aggregated at the ride-level, which is too granular. We will want to add some additional columns of data – such as day, month, year – that provide additional opportunities to aggregate the data.

Add columns that list the date, month, day, and year of each ride

This will allow us to aggregate ride data for each month, day, or year … before completing these operations we could only aggregate at the ride level. Check out more on date formats in R.

all_trips$date <- as.Date(all_trips$start_time, origin=“1899-12-30”) #The default format is yyyy-mm-dd #Used origin=“1899-12-30” since earlier had converted date info with Excel’s time format with 37:30:55.
all_trips$month <- format(as.Date(all_trips$date), “%m”)
all_trips$day <- format(as.Date(all_trips$date), “%d”)
all_trips$year <- format(as.Date(all_trips$date), “%Y”)
all_trips$day_of_week <- format(as.Date(all_trips$date), “%A”)

(2) We will want to add a calculated field for length of ride since the 2020Q1 data did not have the “tripduration” column. We will add “ride_length” to the entire dataframe for consistency.

This was already done in the file from the analysis in Excel after formatting started_at and ended_at with Excel’s time format with 37:30:55. I’m doing it again here for comparison to the calcuations in Excel.

###The two columns (ride_length and ride_length_r are the same. all_trips_v2$ride_length_r <- as.difftime(all_trips_v2$end_time,tz, units = c(“secs))

Inspect the structure of the columns

str(all_trips)

(3) There are some rides where tripduration shows up as negative, including several hundred rides where Divvy took bikes out of circulation for Quality Control reasons. We will want to delete these rides. (I’ve previously deleted the rides in Excel where they showed up as ############# in the field prior to loading the individual files in R, and noted this in the changelog. I left the rows in the individual files with actual negative numbers to delete in R.) Will delete these in both ride_length (created in Excel) and ride_length_2, created in R.

We will create a new version of the dataframe (v2) since data is being removed

all_trips_v2 <- all_trips[!(all_trips$from_station_id == “HQ QR” | all_trips$ride_length<0),]

8,127 rows were deleted.

Will create another new version of the dataframe (v3) since data is being removed.

all_trips_v3 <- all_trips[!(all_trips$from_station_id == “HQ QR” | all_trips$ride_length_r<0),]

#=====================================

Step 4: Conduct Descriptive Analysis

#=====================================

Descriptive analysis on ride_length (all figures in seconds)

mean(all_trips_v3$ride_length_r, na.rm = TRUE) #straight average (total ride length / rides)
median(all_trips_v3$ride_length_r, na.rm = TRUE) #midpoint number in the ascending array of ride lengths
max(all_trips_v3$ride_length_r, na.rm = TRUE) #longest ride
min(all_trips_v3$ride_length_r, na.rm = TRUE) #shortest ride

You can condense the four lines above to one line using summary() on the specific attribute

summary(all_trips_v3$ride_length_r, na.rm = TRUE)

Compare members and casual users

aggregate(all_trips_v3$ride_length_r ~ all_trips_v3$usertype, FUN = mean)
aggregate(all_trips_v3$ride_length_r ~ all_trips_v3$usertype, FUN = median)
aggregate(all_trips_v3$ride_length_r ~ all_trips_v3$usertype, FUN = max)
aggregate(all_trips_v3$ride_length_r ~ all_trips_v3$usertype, FUN = min)

See the average ride time by each day for members vs casual users

aggregate(all_trips_v3$ride_length_r ~ all_trips_v3$usertype + all_trips_v3$day_of_week, FUN = mean)

Notice that the days of the week are out of order. Let’s fix that.

all_trips_v3$day_of_week <- ordered(all_trips_v3$day_of_week, levels=c(“Sunday”, “Monday”, “Tuesday”, “Wednesday”, “Thursday”, “Friday”, “Saturday”))

Now, let’s run the average ride time by each day for members vs casual users

aggregate(all_trips_v3$ride_length_r ~ all_trips_v3$usertype + all_trips_v2$day_of_week, FUN = mean)

Analyze ridership data by type and weekday

Let’s visualize the number of rides by rider type

all_trips_v3 %>% mutate(weekday = wday(start_time, label = TRUE)) %>% group_by(usertype, weekday) %>% summarise(number_of_rides = n() ,average_duration = mean(ride_length_r)) %>% arrange(usertype, weekday) %>% ggplot(aes(x = weekday, y = number_of_rides, fill = usertype)) + geom_col(position = “dodge”)

Let’s create a visualization for average duration

all_trips_v3 %>% mutate(weekday = wday(start_time, label = TRUE)) %>% group_by(usertype, weekday) %>% summarise(number_of_rides = n() ,average_duration = mean(ride_length_r)) %>% arrange(usertype, weekday) %>% ggplot(aes(x = weekday, y = average_duration, fill = usertype)) + geom_col(position = “dodge”)

#=================================================

Step 5: Export Summary File for Further Analysis

#=================================================

Create a csv file that we can visualize in Excel or Tableau

To export the data as a csv file you can read more here.

counts <- aggregate(all_trips_v3$ride_length_r ~ all_trips_v3$usertype + all_trips_v3$day_of_week, FUN = mean) write.csv(counts, file = ‘/Users/caroladdassi/RProjects/Case_Study_Cyclistics/data/avg_ride_length.csv’)

Save dataframe as .csv file

write.csv(all_trips_v3,'/Users/caroladdassi/RProjects/Case_Study_Cyclistics/data/all_trips_v3.csv')

#=================================================

Step 6: Further Analysis Continued

#=================================================

Visualization for the number of roundtrip rides at stations having more than 800 roundtrip rides per year by user type. (The distance between station is 0 and the ride_length is > 0 since we deleted all rows with negative ride_length earlier.)

all_trips_v3 %>% filter(distance_traveled_mi == 0) %>% group_by(usertype, from_station_name) %>% summarise(number_of_rides = n()) %>% filter(number_of_rides > 800) %>% arrange(-number_of_rides) %>% ggplot(aes(x = number_of_rides, y = from_station_name, fill = usertype)) + geom_col(position = “stack”) + labs(title= “Most Popular Stations for Roundtrip Rides”, subtitle = “Roundtrip Rides by Casual Riders and Members \nJuly 2020 to June 2021, Chicago \n(Less than 801 per year not shown.)”, caption = “Public data has been made available by Motivate International Inc.") + xlab(“Number of roundtrip rides”) + ylab(“Start / End Station”) + theme(axis.text.y = element_text(size = 6))

Visualization of the most popular start stations to begin rides (over 10,000 rides/year), including unknown start stations.

all_trips_v3 %>% group_by(usertype, from_station_name) %>% summarise(number_of_rides = n()) %>% filter(number_of_rides > 10000) %>% arrange(-number_of_rides, rm.na = TRUE) %>% ggplot(aes(x = from_station_name, y = number_of_rides, fill = usertype)) + geom_col(position = “stack”)+ scale_y_continuous(name=“Number of Rides”, labels = scales::comma) + labs(title= “Most Popular Start Stations”, subtitle = “Stations With Over 10,000 Rides from July 2020 - June 2021, Chicago, \nUnknown Start Stations Shown”, caption = “Public data has been made available by Motivate International Inc.") + xlab(“Station Names”) + coord_flip() + theme(axis.text.y = element_text(size = 6))

Visualization of the most popular start stations to begin rides (over 10,000 rides/year), unknown stations omitted.

all_trips_v3 %>% na_if("") %>% na.omit %>% group_by(usertype, from_station_name) %>% summarise(number_of_rides = n()) %>% filter(number_of_rides > 10000) %>% arrange(-number_of_rides, rm.na = TRUE) %>% ggplot(aes(x = from_station_name, y = number_of_rides, fill = usertype)) + geom_col(position = “stack”)+ scale_y_continuous(name=“Number of Rides”, labels = scales::comma) + labs(title= “Most Popular Start Stations”, subtitle = “Stations With Over 10,000 Rides from July 2020 - June 2021, Chicago,\nUnknown Start Stations Omitted”, caption = “Public data has been made available by Motivate International Inc.") + xlab(“Station Names”) + coord_flip() + theme(axis.text.y = element_text(size = 6))

Visualization of the number of rides by rider type including unknown users

all_trips_v3 %>% mutate(weekday = wday(start_time, label = TRUE)) %>% group_by(usertype, weekday) %>% summarise(number_of_rides = n()) %>% arrange(usertype, weekday) %>% ggplot(aes(x = weekday, y = number_of_rides, fill = usertype)) + geom_col(position = “dodge”)+ scale_y_continuous(name=“Number of Rides”, labels = scales::comma) + labs(title= “Number of Rides by Type of Rider: Member or Casual”, subtitle = “Total Rides from July 2020 - June 2021 per Day, Chicago, \nIncluding Unknown Users”, caption = “Public data has been made available by Motivate International Inc.") + xlab(“Day of Week”)

Visualization of the number of rides by rider type including unknown users by month

all_trips_v3 %>% group_by(month, usertype) %>% summarise(number_of_rides = n()) %>% arrange(usertype, month) %>% ggplot(aes(x = month, y = number_of_rides, fill = usertype)) + geom_col(position = “dodge”)+ scale_y_continuous(name=“Number of Rides”, labels = scales::comma) + labs(title= “Number of Rides by Type of Rider: Member or Casual”, subtitle = “Total Rides from July 2020 - June 2021 per Month, Chicago, \nIncluding Unknown Users”, caption = “Public data has been made available by Motivate International Inc.") + xlab(“Month”)

Visualization of the number of rides by rider type exluding unknown users

all_trips_v3 %>% na_if("") %>% na.omit %>% mutate(weekday = wday(start_time, label = TRUE)) %>% group_by(usertype, day_of_week) %>% summarise(number_of_rides = n()) %>% arrange(usertype, day_of_week) %>% ggplot(aes(x = day_of_week, y = number_of_rides, fill = usertype)) + geom_col(position = “dodge”)+ scale_y_continuous(name=“Number of Rides”, labels = scales::comma) + labs(title= “Number of Rides by Type of Rider: Member or Casual”, subtitle = “Total Rides from July 2020 - June 2021 per Day, Chicago, \nExcluding Unknown Users”, caption = “Public data has been made available by Motivate International Inc.") + xlab(“Day of Week”)+ ylab(“Number of Rides”)

Visualization for average duration including unknown users

all_trips_v3 %>% mutate(weekday = wday(start_time, label = TRUE)) %>% group_by(usertype, weekday) %>% summarise(number_of_rides = n() ,average_duration = mean(ride_length_r)) %>% arrange(usertype, weekday) %>% ggplot(aes(x = weekday, y = average_duration, fill = usertype)) + geom_col(position = “dodge”) + labs(title= “Average Ride Length by Type of Rider: Member or Casual”, subtitle = “July 2020 - June 2021 by Day of Week in Seconds, Chicago”, caption = “Public data has been made available by Motivate International Inc.") + xlab(“Day of Week”) + ylab(“Average Ride Duration in Seconds”)

Posted on:: August 10, 2021

Length:: 12 minute read, 2547 words

Tags:: case-study

See Also:: Workflow Exercise: BigQuery in Kaggle to RStudio; Increase Cyclistic's Bike-Share Membership: How do annual members and casual riders differ?; Changelog for Cyclistic's Bike-Share Project