#load packages
library(kableExtra)
library(readxl)
library(janitor)
library(tidyverse)
library(lubridate)
library(scales)
library(viridis)
#set default chunk options
::opts_chunk$set(message=F,warning=F) knitr
Data Exercise, Week 5
This week’s data exercise introduces some new tools. There are five objectives for this exercise:
- reinforce techniques and make sure you’re comfortable with the skills we’ve used so far;
- learn about loading and using rdata files;
- learn how to make bar plots;
- using factors to arrange data labels in graphs;
- learn how to group and summarize data.
Use the template that I gave you for Assignment 1 if you want a clean RMD to start with, and then anything you see in a code chunk on this page, you should be able to use to populate your file and see if you can get it to run. Add one thing at a time, knit your file, and make sure you understand what each bit is doing. If not, re-read the notes here.
As always start with a code chunk for your packages. I also use knitr::opts_chunk$set(message=F,warning=F)
to set the default for all code chunks in my document to suppress any R warnings or messages in my output. This will clean up your HTML a lot, but it also makes it harder for me to see things that might be causing you issues in your code.
Downloading this week’s data
This week, we’re going to use some Alberta government royalty disclosure data. One of the recommendations of the Alberta Royalty Review in 2015-16 was more transparency with respect to oil sands costs, profits, production and royalty payments. The Alberta Government has followed that recommendation and makes project-level data available on their open data site. I’ve built my own scraper to compile those data and provide some basic graphs that you might find useful. For this exercise, I’ve created a data set for you to use based on these data.
The data are already in R format here and so you can download and read those data directly into R. The data contain project names, year, project type, an indicator for large (>30,000 bbl/d) projects, production, revenues, and costs and an indicator for royalty payout status (we’ll get to that!). The naming conventions should be obvious, especially given that I’ll provide you with some extra code in here.
As before, you can speed up your code by including a chunk option to cache the data so it won’t download every time, using something like this:
{r read data,cache=T}
.
#download the data - remember to use the mode="wb" so that you get the windows binary file
if(!file.exists("os_data.rdata"))
download.file("https://econ366.aleach.ca/resources/os_data.rdata",destfile = "os_data.rdata",mode="wb")
load("os_data.rdata")
After you run this chunk, you should see that os_data
is an object in your global environment.
Examining the data
As with the other exercises, a good habit to get into is to use a code chunk at this point (you can delete it later) to have a look at your data and make sure you have what you’re expecting. For example, we have multiple years of data:
%>% select(reporting_year) %>% distinct() os_data
# A tibble: 7 × 1
reporting_year
<dbl>
1 2016
2 2017
3 2018
4 2019
5 2020
6 2021
7 2022
And we have projects sorted by type using TRUE/FALSE values:
%>% select(mine) %>% distinct() os_data
# A tibble: 2 × 1
mine
<lgl>
1 FALSE
2 TRUE
This last one is a new feature for some of you: a logical variable. You’ll see later on how we can use this really easily in filters and other selection commands.
Column / bar plots
For these data, there are many pieces of project-level data, and so it’s best to think about a graph type that works well for these types of data: column or bar plots. With ggplot
, we use geom_col
to make these sorts of graphs. Let’s start with a basic plot for production (cleaned_crude_bitumen_at_rcp_barrels
) from mining projects last year:
ggplot(os_data %>% filter(mine,reporting_year==2022),
aes(project_name,production/365/1000))+
geom_col(size=0.25,position = position_dodge(width = .25),color="black")+
theme_minimal()+
theme(axis.text.x = element_text(hjust = 0.5,vjust=0.5)
+
)labs(x=NULL,y="Bitumen Production (thousands of barrels per day)",
title="2022 Bitumen Production, Oil Sands Mining Projects",
#subtitle="Excluding Electricity,by NAICS 4-Digit Code",
caption="Data via Government of Alberta, graph by @andrew_leach")
So, that’s a basic plot. Now, we can use the skills we learned in the last data assignment to use a faceted plot to show production by year, by project. Here, I’m using a one command you’ll want to note: I flipped the x-axis labels and aligned them with the center of each bar using theme(axis.text.x = element_text(angle=90,hjust = 0.5,vjust=0.5))
.
ggplot(os_data %>% filter(mine),
aes(reporting_year,production/365/1000))+
geom_col(size=0.25,position = position_dodge(width = .25),color="black")+
scale_x_continuous(breaks = pretty_breaks(6),expand=c(0,0))+
facet_wrap(~project_name,nrow = 1)+
theme_minimal()+
theme(axis.text.x = element_text(angle=90,hjust = 0.5,vjust=0.5)
+
)labs(x=NULL,y="Bitumen Production (thousands of barrels per day)",
title="Bitumen Production, Oil Sands Mining Projects",
#subtitle="Excluding Electricity,by NAICS 4-Digit Code",
caption="Data via Government of Alberta, graph by @andrew_leach")
We can do a similar graph for the larger in-situ projects, which I’ve defined in the data as those with production greater than 30,000 barrels per day. In this graph, you’ll see the power of the logical variables in a filter. I only need to specify the variable name, and it defaults to select based on the value being true, and I can reverse that (e.g. with the filter(!mine)
) to select for projects that are not mines:
ggplot(os_data %>% filter(big_project,!mine),
aes(reporting_year,production/365/1000))+
geom_col(size=0.25,position = position_dodge(width = .25),color="black")+
scale_x_continuous(breaks = pretty_breaks(6),expand=c(0,0))+
facet_wrap(~project_name,nrow = 1)+
theme_minimal()+
theme(axis.text.x = element_text(angle=90,hjust = 0.5,vjust=0.5)
+
)labs(x=NULL,y="Bitumen Production (thousands of barrels per day)",
title="Bitumen Production, Larger Oil Sands In Situ Projects",
#subtitle="Excluding Electricity,by NAICS 4-Digit Code",
caption="Data via Government of Alberta, graph by @andrew_leach")
Average Project Attributes
One of the more powerful features in R is grouping and summarizing. Imagine I wanted to know the key characteristics of different types (mines and in situ) of projects over time. One way that I could do this is by grouping and either mutatating or summarizing. Unlike mutate
, the summarize
command creates a new data set. Let’s look at some examples.
The data contain measures of production, revenues, and costs per year by project, but if we’re interested in summaries by type, we can make that happen using groups. I’ll want to group
by one or more variable, and then either summarize or mutate, and then it’s generally best to ungroup
your data. Let’s suppose I want to know average (production-weighted) royalties paid by project type and year. This is also an opportinity for me to introduce you to pivot_wider
, the opposite of pivot_longer
.
%>%
os_data select(reporting_year,mine,production,royalty_payable)%>%#pick the variables I need
group_by(mine,reporting_year)%>% #group by project type and year
summarize(royalties=sum(royalty_payable,na.rm=T)/sum(production,na.rm = T))%>% #royalties per barrel
ungroup()%>%
#the na.rm = T means that you don't count any missing values in your summation. REALLY useful.
pivot_wider(names_from=reporting_year,values_from = royalties)%>%
#push data to wide format with one column for each year
#create labels for the mine column
mutate(mine=ifelse(mine,"Mine","In Situ"))%>%
#rename the mine column
rename("Project Type"=mine)%>%
#make a table
kbl(escape = FALSE,digits=2,align=c("l",rep('c', 7))) %>%
kable_styling(fixed_thead = T,bootstrap_options = c("hover", "condensed","responsive"),full_width = T)%>%
add_header_above(header = c("Royalties payable ($/bbl) by Project Type"=8))%>%
I()
Project Type | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 |
---|---|---|---|---|---|---|---|
In Situ | 1.40 | 2.65 | 2.66 | 5.15 | 1.47 | 7.89 | 18.2 |
Mine | 0.35 | 2.27 | 2.20 | 3.61 | 0.53 | 5.52 | 14.4 |
Test yourself: see if you can make one for profit per barrel (gross revenue-capital_costs-operating_costs-other_costs+other_net_proceeds-royalty_payable)
:
Project Type | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | 2022 |
---|---|---|---|---|---|---|---|
In Situ | 1.08 | 12.49 | 7.49 | 19.9 | 4.30 | 25.9 | 35.7 |
Mine | -19.88 | -4.67 | -1.49 | 7.4 | -8.39 | 11.8 | 20.7 |
We’ll make a lot more use of summarize
and mutate
with group_by
in the weeks ahead. Test yourself and make ones for operating costs, capital costs, and revenues.
Graphing costs and revenues
In this section, we’re going to make use of some of the techniques we’ve learned so far to build a really neat graph of before-tax profits for these projects. But, to do this and make it look good, you’re going to want to use factors to get the labels in the right order. When we’re done, you’ll produce something that looks like this:
So, how did we get here?
Let’s take a quick look at some of the underlying trends we’re looking at, using a facet_grid
plot (note the use of the option scales="free_y
to let each row in the grid have different y axis breaks:
%>%
os_data group_by(mine,reporting_year)%>% #group by project type and year
summarize(revenue=sum(gross_revenue,na.rm=T),
production=sum(production,na.rm=T),
royalties=sum(royalty_payable,na.rm=T),
capital_costs=sum(capital_costs,na.rm=T),
operating_costs=sum(operating_costs,na.rm=T),
other=sum(other_costs-other_net_proceeds,na.rm=T),
#profit_or_loss=sum(gross_revenue,na.rm=T)-sum(royalty_payable,na.rm=T)-sum(capital_costs,na.rm=T)-
# sum(operating_costs,na.rm=T)-sum(other_costs-other_net_proceeds,na.rm=T)
%>%
)ungroup()%>%
pivot_longer(cols=-c(mine,reporting_year,production),names_to = "item")%>%
mutate(value=value/production)%>%
mutate(item=stringr::str_to_title(gsub("_"," ",item)))%>%
mutate(mine=ifelse(mine,"Mine","In Situ"))%>%
ggplot() +
facet_grid(rows=vars(mine),cols=vars(item),scales = "free_y")+
geom_col(aes(reporting_year,value,group=item),size=0.25,position = "stack",color="black",fill="dodgerblue")+
#geom_col( aes(reporting_year,op_costs_bbl,fill="Operating Costs"),size=0.25,position = "stack",color="black")+
scale_x_continuous(breaks = pretty_breaks(6),expand=c(0,0))+
scale_y_continuous(breaks = pretty_breaks(6),expand=c(0,0))+
theme_minimal()+
scale_fill_viridis("",discrete = T,option="magma",begin =0.4,direction=-1)+
theme(axis.text.x = element_text(angle=90,hjust = 0.5,vjust=0.5),
panel.spacing = unit(2, "lines"),
legend.position = "none")+
labs(x=NULL,y="Costs and Revenue (dollars per barrel, before taxes)",
title="Oil Sands Project Trends",
caption="Data via Government of Alberta, graph by @andrew_leach")
Now, back to that graph of everything together. First, we need to summarize and clean the data. The first part of my code for this creates sums of production and all costs as well as operating pre-tax profits by type:
#test<-
%>%
os_data group_by(mine,reporting_year)%>% #group by project type and year
summarize(production=sum(production,na.rm=T),
royalties=sum(royalty_payable,na.rm=T),
capital_costs=sum(capital_costs,na.rm=T),
operating_costs=sum(operating_costs,na.rm=T),
other=sum(other_costs-other_net_proceeds,na.rm=T),
profit_or_loss=sum(gross_revenue,na.rm=T)-sum(royalty_payable,na.rm=T)-sum(capital_costs,na.rm=T)-
sum(operating_costs,na.rm=T)-sum(other_costs-other_net_proceeds,na.rm=T))%>%
ungroup()
Having done that, I’m going to pivot to a longer data frame, but note that I’m keeping the project type,reporting year, and production separate. That way, I have a column called value
that has all my costs and revenues, and a column called production
that has annual bitumen production. I can then just divide value by production and get the per-barrel averages. The fact that I’m summing up all the costs and all the production by type, and then calculating the average means that I’m doing a production-weighted average:
pivot_longer(cols=-c(mine,reporting_year,production),names_to = "item")%>%
mutate(value=value/production)
Next, I am going to clean up my labels by formatting my item column, first to remove the underscores and then to convert to title case:
mutate(item=stringr::str_to_title(gsub("_"," ",item)))%>%
mutate(mine=ifelse(mine,"Mine","In Situ"))
And, finally, I am going to use factors and levels to order my graph. A ‘factor’ is a category variable (like item or mine in this case) and they allow you to do some pretty powerful things, but they are also frustrating. We’ll have more on factors coming up in the next few assignments, but here’s a quick taste:
mutate(item=as_factor(item),#convert item to a factor. This will be stored as a number with a corresponding label
#use fct_relevel to order the factor levels into the order I want for my graph
item=fct_relevel(item,c("Profit Or Loss","Royalties","Capital Costs","Operating Costs","Other")))
Once I do that, I will overcome the ggplot()
default to put things in alphabetical order. If I combine all of those steps, I get this graph:
#test<-
%>%
os_data group_by(mine,reporting_year)%>% #group by project type and year
summarize(production=sum(production,na.rm=T),
royalties=sum(royalty_payable,na.rm=T),
capital_costs=sum(capital_costs,na.rm=T),
operating_costs=sum(operating_costs,na.rm=T),
other=sum(other_costs-other_net_proceeds,na.rm=T),
profit_or_loss=sum(gross_revenue,na.rm=T)-sum(royalty_payable,na.rm=T)-sum(capital_costs,na.rm=T)-
sum(operating_costs,na.rm=T)-sum(other_costs-other_net_proceeds,na.rm=T))%>%
ungroup()%>%
pivot_longer(cols=-c(mine,reporting_year,production),names_to = "item")%>%
mutate(value=value/production)%>%
mutate(item=stringr::str_to_title(gsub("_"," ",item)))%>%
mutate(item=as_factor(item),item=fct_relevel(item,c("Profit Or Loss","Royalties","Capital Costs","Operating Costs","Other")))%>%
mutate(mine=ifelse(mine,"Mine","In Situ"))%>%
ggplot() +
facet_wrap(~mine,nrow = 1)+
geom_col(aes(reporting_year,value,fill=item,group=item),size=0.25,position = "stack",color="black")+
#geom_col( aes(reporting_year,op_costs_bbl,fill="Operating Costs"),size=0.25,position = "stack",color="black")+
scale_x_continuous(breaks = pretty_breaks(6),expand=c(0,0))+
scale_y_continuous(breaks = pretty_breaks(6),expand=c(0,0))+
expand_limits(y=c(0,80))+
theme_minimal()+
scale_fill_viridis("",discrete = T,option="magma",begin =0.4,direction=-1)+
theme(axis.text.x = element_text(angle=90,hjust = 0.5,vjust=0.5),
panel.spacing = unit(2, "lines"))+
labs(x=NULL,y="Costs and Operating Net Revenue (dollars per barrel, before taxes)",
title="Oil Sands Project Profitability",
caption="Data via Government of Alberta, graph by @andrew_leach")
Want to test your understanding? Try to make this graph:
Hints:
- use
filter(!mine,big_project)
- you don’t need to group in this case, because you’re reporting project level data
- you won’t need to use `sum(…,na.rm=T), since you’d be summing up a whole column of data if you did.
I hope you’re noticing just how good a year 2022 was for the oil sands industry: record production, record royalty payments, record profits.