% index.tex

% Options for packages loaded elsewhere
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
\PassOptionsToPackage{dvipsnames,svgnames,x11names}{xcolor}
%
\documentclass[
  letterpaper,
  DIV=11,
  numbers=noendperiod]{scrreprt}

\usepackage{amsmath,amssymb}
\usepackage{iftex}
\ifPDFTeX
  \usepackage[T1]{fontenc}
  \usepackage[utf8]{inputenc}
  \usepackage{textcomp} % provide euro and other symbols
\else % if luatex or xetex
  \usepackage{unicode-math}
  \defaultfontfeatures{Scale=MatchLowercase}
  \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
\usepackage{lmodern}
\ifPDFTeX\else  
    % xetex/luatex font selection
\fi
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\IfFileExists{microtype.sty}{% use microtype if available
  \usepackage[]{microtype}
  \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
\makeatletter
\@ifundefined{KOMAClassName}{% if non-KOMA class
  \IfFileExists{parskip.sty}{%
    \usepackage{parskip}
  }{% else
    \setlength{\parindent}{0pt}
    \setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
  \KOMAoptions{parskip=half}}
\makeatother
\usepackage{xcolor}
\setlength{\emergencystretch}{3em} % prevent overfull lines
\setcounter{secnumdepth}{5}
% Make \paragraph and \subparagraph free-standing.
% Each command is wrapped so that an empty \mbox{} follows the heading; this
% ends the run-in heading and lets the following text start on its own line.
% The wrappers dispatch on \@ifstar so that the starred forms
% (\paragraph*{...}, \subparagraph*{...}) are forwarded correctly instead of
% having the `*` consumed as the heading text.
% NOTE(review): the optional short-title form \paragraph[short]{long} is still
% not supported by these wrappers.
\makeatletter
\ifx\paragraph\undefined\else
  \let\oldparagraph\paragraph
  \renewcommand{\paragraph}{%
    \@ifstar{\xxxParagraphStar}{\xxxParagraphNoStar}}
  \newcommand{\xxxParagraphStar}[1]{\oldparagraph*{#1}\mbox{}}
  \newcommand{\xxxParagraphNoStar}[1]{\oldparagraph{#1}\mbox{}}
\fi
\ifx\subparagraph\undefined\else
  \let\oldsubparagraph\subparagraph
  \renewcommand{\subparagraph}{%
    \@ifstar{\xxxSubParagraphStar}{\xxxSubParagraphNoStar}}
  \newcommand{\xxxSubParagraphStar}[1]{\oldsubparagraph*{#1}\mbox{}}
  \newcommand{\xxxSubParagraphNoStar}[1]{\oldsubparagraph{#1}\mbox{}}
\fi
\makeatother

% Code-listing infrastructure: verbatim environments in which \, { and }
% stay active, so the colouring macros can be used inside listings.
\usepackage{color}
\usepackage{fancyvrb}
% Literal vertical bar.
\newcommand{\VerbBar}{|}
% Inline verbatim (\Verb) with \, { and } kept as command characters.
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
% Block environment for highlighted code; same command characters as \VERB.
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
% Add ',fontsize=\small' for more characters per line
\usepackage{framed}
% Colour named `shadecolor`, defined here as a light grey-blue.
\definecolor{shadecolor}{RGB}{241,243,245}
% Shaded: wrapper around framed's snugshade environment.
\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}}
% Syntax-highlighting token macros, used inside the Highlighting environment.
% Each takes one argument (the token text) and typesets it in a fixed rgb
% colour via \textcolor; \CommentVarTok, \DocumentationTok and \WarningTok
% additionally set the text in italics.
\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.40,0.45,0.13}{#1}}
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\BuiltInTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}}
\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}}
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\ExtensionTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.28,0.35,0.67}{#1}}
\newcommand{\ImportTok}[1]{\textcolor[rgb]{0.00,0.46,0.62}{#1}}
\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\NormalTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\RegionMarkerTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.07,0.07,0.07}{#1}}
\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}}

% \tightlist: used at the top of compact lists to remove the vertical space
% between items (zeroes \itemsep and \parskip). Defined only if not already
% provided elsewhere.
\providecommand{\tightlist}{%
  \setlength{\itemsep}{0pt}%
  \setlength{\parskip}{0pt}%
}
% Table support: multi-page tables, publication-quality rules, extended
% column specifications.
\usepackage{longtable,booktabs,array}
\usepackage{calc} % for calculating minipage widths
% Correct order of tables after \paragraph or \subparagraph
\usepackage{etoolbox}
\makeatletter
\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{}
\makeatother
% Allow footnotes in longtable head/foot
\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}}
\makesavenoteenv{longtable}
\usepackage{graphicx}
\makeatletter
% \maxwidth: the image's natural width (\Gin@nat@width), capped at \linewidth.
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
% \maxheight: the image's natural height (\Gin@nat@height), capped at \textheight.
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
% Set default figure placement to htbp
\makeatletter
\def\fps@figure{htbp}
\makeatother

\KOMAoption{captions}{tableheading}
\makeatletter
\makeatother
\makeatletter
\@ifpackageloaded{bookmark}{}{\usepackage{bookmark}}
\makeatother
\makeatletter
% Load caption only if it has not been loaded already.
\@ifpackageloaded{caption}{}{\usepackage{caption}}
% At \begin{document}, set the fixed names used for the table of contents,
% the lists of figures/tables and the figure/table caption labels. Each name
% is renewed when it already exists and created otherwise.
\AtBeginDocument{%
\ifdefined\contentsname
  \renewcommand*\contentsname{Table of contents}
\else
  \newcommand\contentsname{Table of contents}
\fi
\ifdefined\listfigurename
  \renewcommand*\listfigurename{List of Figures}
\else
  \newcommand\listfigurename{List of Figures}
\fi
\ifdefined\listtablename
  \renewcommand*\listtablename{List of Tables}
\else
  \newcommand\listtablename{List of Tables}
\fi
\ifdefined\figurename
  \renewcommand*\figurename{Figure}
\else
  \newcommand\figurename{Figure}
\fi
\ifdefined\tablename
  \renewcommand*\tablename{Table}
\else
  \newcommand\tablename{Table}
\fi
}
% Load float only if it has not been loaded already.
\@ifpackageloaded{float}{}{\usepackage{float}}
% Define a `codelisting` float in the `ruled` style: default placement `h`,
% list-file extension `lop`, and numbered within chapters when a chapter
% counter exists.
\floatstyle{ruled}
\@ifundefined{c@chapter}{\newfloat{codelisting}{h}{lop}}{\newfloat{codelisting}{h}{lop}[chapter]}
% Caption label for codelisting floats.
\floatname{codelisting}{Listing}
% \listoflistings: prints the list of codelisting floats under the heading
% "List of Listings".
\newcommand*\listoflistings{\listof{codelisting}{List of Listings}}
\makeatother
\makeatletter
\@ifpackageloaded{caption}{}{\usepackage{caption}}
\@ifpackageloaded{subcaption}{}{\usepackage{subcaption}}
\makeatother
\makeatletter
\@ifpackageloaded{tcolorbox}{}{\usepackage[skins,breakable]{tcolorbox}}
\makeatother
\makeatletter
% Fallback definition of the colour `shadecolor` (light grey).
% \@ifundefined takes three arguments (name, branch if undefined, branch if
% defined); the empty third group is required, otherwise the next token in
% the file (\makeatother) is swallowed as the "defined" branch.
% NOTE(review): \@ifundefined{shadecolor} tests the control sequence
% \shadecolor, not the colour named `shadecolor`, so this probably always
% runs and overrides any earlier \definecolor{shadecolor} -- confirm whether
% a colour-existence test (e.g. xcolor's \@ifundefinedcolor) was intended.
\@ifundefined{shadecolor}{\definecolor{shadecolor}{rgb}{.97, .97, .97}}{}
\makeatother
\makeatletter
\makeatother
\makeatletter
\makeatother
\ifLuaTeX
  \usepackage{selnolig}  % disable illegal ligatures
\fi
\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
\urlstyle{same} % disable monospaced font for URLs
\hypersetup{
  pdftitle={Quantitative Methods 2},
  pdfauthor={Ollie Ballinger},
  colorlinks=true,
  linkcolor={blue},
  filecolor={Maroon},
  citecolor={Blue},
  urlcolor={Blue},
  pdfcreator={LaTeX via pandoc}}

\title{Quantitative Methods 2}
\author{Ollie Ballinger}
\date{2024-10-10}

\begin{document}
\maketitle
% If a Shaded environment exists, redefine it as a frameless, breakable
% tcolorbox with a 3pt left border drawn in `shadecolor`.
% (Resolved merge conflict: the two sides differed only in the order of the
% tcolorbox options; the HEAD ordering is kept.)
\ifdefined\Shaded\renewenvironment{Shaded}{\begin{tcolorbox}[frame hidden, enhanced, sharp corners, breakable, interior hidden, boxrule=0pt, borderline west={3pt}{0pt}{shadecolor}]}{\end{tcolorbox}}\fi

\renewcommand*\contentsname{Table of contents}
{
\hypersetup{linkcolor=}
\setcounter{tocdepth}{2}
\tableofcontents
}
\bookmarksetup{startatroot}

\hypertarget{welcome}{%
\chapter*{Welcome}\label{welcome}}
\addcontentsline{toc}{chapter}{Welcome}

\markboth{Welcome}{Welcome}

\hypertarget{welcome-to-basc0005---quantitative-methods-data-science-and-visualisation}{%
\section*{Welcome to BASC0005 - Quantitative Methods: Data Science and
Visualisation}\label{welcome-to-basc0005---quantitative-methods-data-science-and-visualisation}}
\addcontentsline{toc}{section}{Welcome to BASC0005 - Quantitative
Methods: Data Science and Visualisation}

\markright{Welcome to BASC0005 - Quantitative Methods: Data Science and
Visualisation}

This course teaches quantitative skills, with an emphasis on the context
and use of data. Students learn to focus on datasets which will allow
them to explore questions in society -- in arts, humanities, sports,
criminal justice, economics, inequality, or policy. Students are
expected to work with Python to carry out data manipulation (cleaning
and segmentation), analysis (for example, deriving descriptive
statistics) and visualisation (graphing, mapping and other forms of
visualisation). They will engage with literatures around a topic and
connect their datasets and analyses to explore and decide wider
arguments, and link their results to these contextual considerations.
Below is an outline of the course:

\includegraphics{outline.png}

\bookmarksetup{startatroot}

\hypertarget{python-recap}{%
\chapter{Python Recap}\label{python-recap}}

\hypertarget{workshop-1-open-in-colab}{%
\section[\emph{Workshop 1} ]{\texorpdfstring{\emph{Workshop 1}
\href{https://colab.research.google.com/github/oballinger/QM2/blob/main/notebooks/W01.\%20Python\%20Recap.ipynb}{\protect\includegraphics{index_files/mediabag/colab-badge.png}}}{Workshop 1 Open In Colab}}\label{workshop-1-open-in-colab}}

\hypertarget{registering-a-github-account}{%
\section{Registering a GitHub
account}\label{registering-a-github-account}}

Before we get started, we need to set a few things up. GitHub is a
platform for software development and version control using Git,
allowing developers to store and manage their code. Think of it as
Google Docs but for code -- it will be very useful for collaborating on
your group projects later in the term, and in your future as a data
analyst.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Use \href{https://github.com/join}{this link} to register for a GitHub
  account if you don't already have one.
\item
  Once that's done, \href{https://github.com/new}{create a new github
  repository} called ``QM2''.
\item
  In this notebook, click ``File'' and then ``Save a copy in GitHub''.
\end{enumerate}

Voila! You now have a version of this notebook saved to your own GitHub
account. \emph{You will need to do step 3 for all the workshops!} Now,
on to python.

\hypertarget{using-python}{%
\section{Using Python}\label{using-python}}

In this course, we'll make extensive use of \emph{Python}, a programming
language used widely in scientific computing and on the web. We will be
using Python as a way to manipulate, plot and analyse data. This isn't a
course about learning Python, it's about working with data - but we'll
be learning a little bit of programming along the way.

By now, you should have done the prerequisites for the module, and
understand a bit about how Python is structured, what different commands
do, and so on - this is a bit of a refresher to remind you of what we
need at the beginning of term.

The particular flavour of Python we're using is \emph{iPython}, which,
as we've seen, allows us to combine text, code, images, equations and
figures in a \emph{Notebook}. This is a \emph{cell}, written in
\emph{markdown} - a way of writing nice text. Contrast this with a
\emph{code} cell, which executes a bit of Python:

\begin{Shaded}
\begin{Highlighting}[]
\BuiltInTok{print}\NormalTok{(}\DecValTok{2}\OperatorTok{+}\DecValTok{2}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
4
\end{verbatim}

The Notebook format allows you to engage in what Don Knuth describes as
\href{http://en.wikipedia.org/wiki/Literate_programming}{Literate
Programming}:

\begin{quote}
{[}\ldots{]} Instead of writing code containing documentation, the
literate programmer writes documentation containing code. No longer does
the English commentary injected into a program have to be hidden in
comment delimiters at the top of the file, or under procedure headings,
or at the end of lines. Instead, it is wrenched into the daylight and
made the main focus. The ``program'' then becomes primarily a document
directed at humans, with the code being herded between ``code
delimiters'' from where it can be extracted and shuffled out sideways to
the language system by literate programming tools.
\href{http://www.literateprogramming.com/lpquotes.html}{Ross Williams}
\end{quote}

\hypertarget{libraries}{%
\section{Libraries}\label{libraries}}

We will work with a number of \emph{libraries}, which provide additional
functions and techniques to help us to carry out our tasks.

These include:

\emph{Pandas:} we'll use this a lot to slice and dice data

\emph{matplotlib}: this is our basic graphing software, and we'll also
use it for mapping

\emph{nltk}: The Natural Language Tool Kit will help us work with text

We aren't doing all this to learn to program. We could spend a whole
term learning how to use Python and never look at any data, maps,
graphs, or visualisations. But we do need to understand a few basics to
use Python for working with data. So let's revisit a few concepts that
you should have covered in your prerequisites.

\hypertarget{variables}{%
\section{Variables}\label{variables}}

Python can broadly be divided into verbs and nouns: things which \emph{do}
things, and things which \emph{are} things. In Python, the verbs can be
\emph{commands}, \emph{functions}, or \emph{methods}. We won't worry too
much about the distinction here - suffice it to say, they are the parts
of code which manipulate data, calculate values, or show things on the
screen.

The simplest proper noun object in Python is the \emph{variable}.
Variables are given names and store information. This can be, for
example, numeric, text, or boolean (true/false). These are all
statements setting up variables:

\texttt{n = 1}

\texttt{t = "hi"}

\texttt{b = True}

Now let's try this in code:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{n }\OperatorTok{=} \DecValTok{1}

\NormalTok{t }\OperatorTok{=} \StringTok{"hi"}

\NormalTok{b }\OperatorTok{=} \VariableTok{True}
\end{Highlighting}
\end{Shaded}

Note that each command is on a new line; other than that, the
\emph{syntax} of Python should be fairly clear. We're setting these
variables equal to the letters and numbers and phrases and booleans.
\textbf{What's a boolean?}

The value of this is that we now have values tied to these variables - so
every time we want to use a value, we can refer to its variable:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{n}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{t}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
'hi'
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{b}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
True
\end{verbatim}

Because we've defined these variables in the early part of the notebook,
we can use them later on.

\emph{\textbf{Advanced}: where do \textbf{classes} fit into this
noun/verb picture of variables and commands?}

\hypertarget{where-is-my-data}{%
\section{Where is my data?}\label{where-is-my-data}}

When we work in Excel and text editors, we're used to seeing the data
onscreen - and if we manipulate the data in some way (averaging or
summing up), we see both the inputs and outputs on screen. The big
difference in working with Python is that we don't see our variables all
of the time, or the effect we're having on them. They're there in the
background, but it's usually worth checking in on them from time to
time, to see whether our processes are doing what we think they're
doing.

This is pretty easy to do - we can just type the variable name, or
``print(\emph{variable name})'':

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{n }\OperatorTok{=}\NormalTok{ n}\OperatorTok{+}\DecValTok{1}
\BuiltInTok{print}\NormalTok{(n)}
\BuiltInTok{print}\NormalTok{(t)}
\BuiltInTok{print}\NormalTok{(b)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
2
hi
True
\end{verbatim}

\hypertarget{flow}{%
\section{Flow}\label{flow}}

Python, in common with all programming languages, executes commands in a
sequence - we might refer to this as the ``ineluctable march of the
machines'', but it's more commonly referred to as the \emph{flow} of the
code (we'll use the word ``code'' a lot - it just means commands written
in the programming language). In most cases, code just executes in the
order it's written. This is true within each \emph{cell} (each block of
text in the notebook), and it's true when we execute the cells in order;
that's why we can refer back to the variables we defined earlier:

\begin{Shaded}
\begin{Highlighting}[]
\BuiltInTok{print}\NormalTok{(n)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
2
\end{verbatim}

If we make a change to one of these variables, say n:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{n }\OperatorTok{=} \DecValTok{3}
\end{Highlighting}
\end{Shaded}

and execute the above ``print n'' command, you'll see that it has
changed n to 3. So if we go out of order, the obvious flow of the code
is confused. For this reason, try to write your code so it executes in
order, one cell at a time. At least for the moment, this will make it
easier to follow the logic of what you're doing to data.

\emph{Advanced}: what happens to this flow when you write
\emph{functions} to automate common tasks?

\textbf{\emph{Exercise - Setting up variables}}:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Create a new cell.
\item
  Create the variable ``name'' and assign your name to it.
\item
  Create a variable ``Python'' and assign a score out of 10 to how much
  you like Python.
\item
  Create a variable ``prior'' and if you've used Python before, assign
  True; otherwise assign False to the variable
\item
  Print these out to the screen
\end{enumerate}

\hypertarget{downloading-data}{%
\section{Downloading Data}\label{downloading-data}}

Let's fetch the data we will be using for this session. There are two
ways in which you can upload data to the Colab notebook. You can use the
following code to upload a CSV or similar data file.

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{from}\NormalTok{ google.colab }\ImportTok{import}\NormalTok{ files}
\NormalTok{uploaded }\OperatorTok{=}\NormalTok{ files.upload()}
\end{Highlighting}
\end{Shaded}

Or you can use the following cell to fetch the data directly from the
QM2 server.

Let's create a folder in which we can store all our data for this session:

\begin{Shaded}
\begin{Highlighting}[]
\OperatorTok{!}\NormalTok{mkdir data}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\OperatorTok{!}\NormalTok{mkdir .}\OperatorTok{/}\NormalTok{data}\OperatorTok{/}\NormalTok{wk1}
\OperatorTok{!}\NormalTok{curl https:}\OperatorTok{//}\NormalTok{s3.eu}\OperatorTok{{-}}\NormalTok{west}\OperatorTok{{-}}\FloatTok{2.}\ErrorTok{amazonaws}\NormalTok{.com}\OperatorTok{/}\NormalTok{qm2}\OperatorTok{/}\NormalTok{wk1}\OperatorTok{/}\NormalTok{data.csv }\OperatorTok{{-}}\NormalTok{o .}\OperatorTok{/}\NormalTok{data}\OperatorTok{/}\NormalTok{wk1}\OperatorTok{/}\NormalTok{data.csv}
\OperatorTok{!}\NormalTok{curl https:}\OperatorTok{//}\NormalTok{s3.eu}\OperatorTok{{-}}\NormalTok{west}\OperatorTok{{-}}\FloatTok{2.}\ErrorTok{amazonaws}\NormalTok{.com}\OperatorTok{/}\NormalTok{qm2}\OperatorTok{/}\NormalTok{wk1}\OperatorTok{/}\NormalTok{sample\_group.csv }\OperatorTok{{-}}\NormalTok{o .}\OperatorTok{/}\NormalTok{data}\OperatorTok{/}\NormalTok{wk1}\OperatorTok{/}\NormalTok{sample\_group.csv}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   203  100   203    0     0   2872      0 --:--:-- --:--:-- --:--:--  3029
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   297  100   297    0     0   1844      0 --:--:-- --:--:-- --:--:--  1879
\end{verbatim}

\hypertarget{storing-and-importing-data}{%
\section{Storing and importing data}\label{storing-and-importing-data}}

Typically, data we look at won't be just one number, or one bit of text.
Python has a lot of different ways of dealing with a bunch of numbers:
for example, a list of values is called a \textbf{list}:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{listy }\OperatorTok{=}\NormalTok{ [}\DecValTok{1}\NormalTok{,}\DecValTok{2}\NormalTok{,}\DecValTok{3}\NormalTok{,}\DecValTok{6}\NormalTok{,}\DecValTok{9}\NormalTok{]}
\BuiltInTok{print}\NormalTok{(listy)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
[1, 2, 3, 6, 9]
\end{verbatim}

A set of values \emph{linked} to an index (or key) is called a
\textbf{dictionary}; for example:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{dicty }\OperatorTok{=}\NormalTok{ \{}\StringTok{\textquotesingle{}Bob\textquotesingle{}}\NormalTok{: }\FloatTok{1.2}\NormalTok{, }\StringTok{\textquotesingle{}Mike\textquotesingle{}}\NormalTok{: }\FloatTok{1.2}\NormalTok{, }\StringTok{\textquotesingle{}Coop\textquotesingle{}}\NormalTok{: }\FloatTok{1.1}\NormalTok{, }\StringTok{\textquotesingle{}Maddy\textquotesingle{}}\NormalTok{: }\FloatTok{1.3}\NormalTok{, }\StringTok{\textquotesingle{}Giant\textquotesingle{}}\NormalTok{: }\FloatTok{2.1}\NormalTok{\}}
\BuiltInTok{print}\NormalTok{(dicty)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
{'Bob': 1.2, 'Mike': 1.2, 'Coop': 1.1, 'Maddy': 1.3, 'Giant': 2.1}
\end{verbatim}

Notice that the list uses square brackets with values separated by
commas, and the dict uses curly brackets with pairs separated by commas,
and colons (:) to link a \emph{key} (index or address) with a value.

(You might notice that they haven't printed out in the order you entered
them)

\emph{\textbf{Advanced}: Print out 1) the third element of \textbf{listy},
and 2) the element of \textbf{dicty} relating to Giant.}

We'll discuss different ways of organising data again soon, but for now
we'll look at \emph{dataframes} - the way our data-friendly
\emph{library} \textbf{Pandas} works with data. We'll be using Pandas a
lot this term, so it's good to get started with it early.

Let's start by importing pandas. We'll also import another library, but
we're not going to worry about that too much at the moment.

If you see a warning about `Building Font Cache' don't worry - this is
normal.

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ pandas}

\ImportTok{import}\NormalTok{ matplotlib}
\OperatorTok{\%}\NormalTok{matplotlib inline}
\end{Highlighting}
\end{Shaded}

Let's import a simple dataset and show it in pandas. We'll use a
pre-prepared ``.csv'' file, which needs to be in the same folder as our
code.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{data }\OperatorTok{=}\NormalTok{ pandas.read\_csv(}\StringTok{\textquotesingle{}./data/wk1/data.csv\textquotesingle{}}\NormalTok{)}
\NormalTok{data.head()}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llllll@{}}
\toprule\noalign{}
& Name & First Appearance & Approx height & Gender & Law Enforcement \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & Bob & 1.2 & 6.0 & Male & False \\
1 & Mike & 1.2 & 5.5 & Male & False \\
2 & Coop & 1.1 & 6.0 & Male & True \\
3 & Maddy & 1.3 & 5.5 & Female & False \\
4 & Giant & 2.1 & 7.5 & Male & False \\
\end{longtable}

What we've done here is read in a .csv file into a dataframe, the object
pandas uses to work with data, and one that has lots of methods for
slicing and dicing data, as we will see over the coming weeks. The
head() command tells iPython to show the first few rows of the
data, so we can start to get a sense of what the data looks like and
what sort of objects it represents.

A common first step for exploring our data is to sort it. In Pandas,
this can be done easily with the \texttt{sort\_values()} function. We
can specify which column to sort the data by, and whether we want to
sort in ascending or descending order, using the optional arguments
\texttt{by} and \texttt{ascending}, respectively. In the example below,
we're sorting in \emph{descending} order of height:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{data.sort\_values(by}\OperatorTok{=}\StringTok{\textquotesingle{}Approx height\textquotesingle{}}\NormalTok{, ascending}\OperatorTok{=}\VariableTok{False}\NormalTok{).head()}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llllll@{}}
\toprule\noalign{}
& Name & First Appearance & Approx height & Gender & Law Enforcement \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
4 & Giant & 2.1 & 7.5 & Male & False \\
0 & Bob & 1.2 & 6.0 & Male & False \\
2 & Coop & 1.1 & 6.0 & Male & True \\
1 & Mike & 1.2 & 5.5 & Male & False \\
3 & Maddy & 1.3 & 5.5 & Female & False \\
\end{longtable}

\bookmarksetup{startatroot}

\hypertarget{supplementary-kaggle-exercises}{%
\chapter{Supplementary: Kaggle
exercises}\label{supplementary-kaggle-exercises}}

If you've gotten this far, congratulations! To further hone your skills,
try working your way through the five
\href{https://www.kaggle.com/learn/intro-to-programming}{intro to
programming notebooks on Kaggle}. These cover a range of skills that
we'll be using throughout the term. Kaggle is a very useful resource for
learning data science, so making an account may not be a bad idea!

\bookmarksetup{startatroot}

\hypertarget{assessed-question}{%
\chapter{Assessed Question}\label{assessed-question}}

The URL below contains a dataset of the most streamed songs on Spotify
in 2023: \url{https://storage.googleapis.com/qm2/wk1/spotify-2023.csv}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Download the dataset and save it in the \texttt{./data/wk1/}
  directory.
\item
  Load the dataset as a pandas dataframe, and inspect it. Two of the
  column names have accidentally been swapped around. Use common sense
  to figure out which ones these are before proceeding with your
  analysis.
\item
  Filter the dataset to only contain songs in the key of C sharp.
\item
  Sort the dataframe in descending order of streams.
\end{enumerate}

QUESTION: which artist has the song with the highest number of streams?

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# use this code cell to answer the question}
\end{Highlighting}
\end{Shaded}

\bookmarksetup{startatroot}

\hypertarget{intro-to-pandas}{%
\chapter{Intro to Pandas}\label{intro-to-pandas}}

\hypertarget{workshop-2-open-in-colab}{%
\section[\emph{Workshop 2} ]{\texorpdfstring{\emph{Workshop 2}
\href{https://colab.research.google.com/github/oballinger/QM2/blob/main/notebooks/W02.\%20Pandas.ipynb}{\protect\includegraphics{notebooks/../colab-badge.png}}}{Workshop 2 Open In Colab}}\label{workshop-2-open-in-colab}}

In this workshop, our aim is to get used to working with more complex
data that we've imported from external files. We'll start to graph it,
and to slice and dice it, to select the bits we're interested in.

We will work with \emph{pandas} to manipulate the data, and to derive
measures and graphs that tell us a bit more than what the source data
files tell us.

\hypertarget{aims}{%
\subsection{Aims}\label{aims}}

\begin{itemize}
\tightlist
\item
  Learn to import data to python using pandas
\item
  Learn how to access specific rows, columns and cells
\item
  Plot the data
\item
  Tidy up graphs to include axes
\end{itemize}

\hypertarget{introduction}{%
\section{Introduction}\label{introduction}}

We are going to work with some UK income data. The income data is
packaged as a .csv file. The Pandas package knows how to handle this and
put the data in a DataFrame, as we've seen. Let's examine the data and
start to see what we can say about it. First of all, we have to find
data - I'm interested in looking at data with a wide spread, so I looked
for data on income in the UK.

This data is collected by the Office for National Statistics (ONS):
\url{http://www.ons.gov.uk/ons/datasets-and-tables/index.html?pageSize=50&sortBy=none&sortDirection=none&newquery=income+percentile}
- but the exact data I want to see, income by percentile, is tricky to
find.

I ended up using data from 2011, generated from a study called the
Family Resources Survey and collated and tweaked by an independent
research unit called the Institute for Fiscal Studies (IFS). The
``tweaking'' they do tends to be around the size of the family unit, and
other factors which create economies of scale - hence they
``equivalise'' it. The IFS is quoted in UK Government documents, so we
can have some trust in their impartiality, or at least accuracy - of
course, if we were publishing research about this, that's not really
good enough and we'd want to reproduce, or at least understand and
critique, their methodology rather than just trusting it!

e.g.:

\url{http://www.ifs.org.uk/wheredoyoufitin/about.php}

\url{https://en.wikipedia.org/wiki/Equivalisation}

\hypertarget{downloading-the-data}{%
\section{Downloading the Data}\label{downloading-the-data}}

Let's grab our income data from our course website and save it into our
data folder. If you've not already created a data folder then do so
using the following command. Don't worry if it generates an error, that
means you've already got a data folder.

\begin{Shaded}
\begin{Highlighting}[]
\OperatorTok{!}\NormalTok{mkdir data}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
mkdir: data: File exists
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\OperatorTok{!}\NormalTok{mkdir data}\OperatorTok{/}\NormalTok{wk2}
\OperatorTok{!}\NormalTok{curl https:}\OperatorTok{//}\NormalTok{s3.eu}\OperatorTok{{-}}\NormalTok{west}\OperatorTok{{-}}\FloatTok{2.}\ErrorTok{amazonaws}\NormalTok{.com}\OperatorTok{/}\NormalTok{qm2}\OperatorTok{/}\NormalTok{wk2}\OperatorTok{/}\NormalTok{incomes.csv }\OperatorTok{{-}}\NormalTok{o .}\OperatorTok{/}\NormalTok{data}\OperatorTok{/}\NormalTok{wk2}\OperatorTok{/}\NormalTok{incomes.csv}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
mkdir: data/wk2: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 15154  100 15154    0     0   135k      0 --:--:-- --:--:-- --:--:--  143k
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ pandas}
\ImportTok{import}\NormalTok{ pylab}
\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
\CommentTok{\# make the plots a little wider by default}
\OperatorTok{\%}\NormalTok{matplotlib inline}
\NormalTok{plt.style.use(}\StringTok{\textquotesingle{}ggplot\textquotesingle{}}\NormalTok{)}

\NormalTok{pylab.rcParams[}\StringTok{\textquotesingle{}figure.figsize\textquotesingle{}}\NormalTok{] }\OperatorTok{=}\NormalTok{ (}\FloatTok{10.}\NormalTok{, }\FloatTok{8.}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{data\_path }\OperatorTok{=} \StringTok{"./data/wk2/incomes.csv"}

\NormalTok{income }\OperatorTok{=}\NormalTok{  pandas.read\_csv(data\_path, index\_col}\OperatorTok{=}\DecValTok{0}\NormalTok{)}
\NormalTok{income.head()}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llllllllllllllll@{}}
\toprule\noalign{}
& Net equivalised household income in 2010-11, week & Childless couple,
annual income & Couple, two children under 14 & Couple, three children
under 14 & Couple with one child under 14 & Couple with two children
aged 15 to 18 & Couple, two children under 14 plus dependent adult &
Single adult & Lone parent, one child under 14 & Lone parent, two
children under 14 & Lone parent, two children aged 15-18 & ANNOTATIONS &
1979 to 1996-97 & 1996-97 to 2009-10 & 1996-97 to 2010-11 \\
Percentile Point & & & & & & & & & & & & & & & \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
1 & 33.50 & 1,746.92 & 2,445.69 & 2,795.08 & 2,096.31 & 2,899.89 &
3,022.18 & 1,170.44 & 1,519.82 & 1,869.21 & 2,323.41 & NaN & NaN & NaN &
NaN \\
2 & 98.60 & 5,141.01 & 7,197.41 & 8,225.61 & 6,169.21 & 8,534.07 &
8,893.95 & 3,444.48 & 4,472.68 & 5,500.88 & 6,837.54 & NaN & -0.20\% &
-1.30\% & -0.50\% \\
3 & 128.56 & 6,703.11 & 9,384.36 & 10,724.98 & 8,043.74 & 11,127.17 &
11,596.39 & 4,491.09 & 5,831.71 & 7,172.33 & 8,915.14 & NaN & 0.40\% &
0.10\% & 0.10\% \\
4 & 151.05 & 7,875.75 & 11,026.05 & 12,601.20 & 9,450.90 & 13,073.75 &
13,625.05 & 5,276.75 & 6,851.90 & 8,427.05 & 10,474.75 & NaN & 0.50\% &
0.80\% & 0.60\% \\
5 & 166.32 & 8,671.91 & 12,140.68 & 13,875.06 & 10,406.30 & 14,395.38 &
15,002.41 & 5,810.18 & 7,544.57 & 9,278.95 & 11,533.65 & NaN & 0.70\% &
1.00\% & 0.90\% \\
\end{longtable}

This is a simple dataframe - we see the percentile and an income. Note
that I've told pandas to use the first column (the Percentile) as the
index to make life easier.

The percentile tells us how people on that income rank - so the final
category, 99\% (which is really binned, so 99\%\textless n\(\leq\)
100\%), is telling us how much ``the 1\%'' earn. Let's find out:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{income.tail()}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llllllllllllllll@{}}
\toprule\noalign{}
& Net equivalised household income in 2010-11, week & Childless couple,
annual income & Couple, two children under 14 & Couple, three children
under 14 & Couple with one child under 14 & Couple with two children
aged 15 to 18 & Couple, two children under 14 plus dependent adult &
Single adult & Lone parent, one child under 14 & Lone parent, two
children under 14 & Lone parent, two children aged 15-18 & ANNOTATIONS &
1979 to 1996-97 & 1996-97 to 2009-10 & 1996-97 to 2010-11 \\
Percentile Point & & & & & & & & & & & & & & & \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
95 & 1075.73 & 56,088.56 & 78,523.99 & 89,741.70 & 67,306.27 & 93,107.01
& 97,033.21 & 37,579.34 & 48,797.05 & 60,014.76 & 74,597.79 & NaN &
2.90\% & 2.00\% & 1.30\% \\
96 & 1174.48 & 61,237.18 & 85,732.05 & 97,979.49 & 73,484.61 &
101,653.72 & 105,940.32 & 41,028.91 & 53,276.35 & 65,523.78 & 81,445.45
& NaN & 3.00\% & 2.00\% & 1.40\% \\
97 & 1302.74 & 67,925.07 & 95,095.10 & 108,680.12 & 81,510.09 &
112,755.62 & 117,510.37 & 45,509.80 & 59,094.81 & 72,679.83 & 90,340.35
& NaN & 3.20\% & 2.20\% & 1.60\% \\
98 & 1523.31 & 79,425.23 & 111,195.32 & 127,080.36 & 95,310.27 &
131,845.88 & 137,405.64 & 53,214.90 & 69,099.95 & 84,984.99 & 105,635.55
& NaN & 3.20\% & 2.70\% & 1.70\% \\
99 & 2090.35 & 108,990.74 & 152,587.04 & 174,385.19 & 130,788.89 &
180,924.64 & 188,553.99 & 73,023.80 & 94,821.95 & 116,620.10 &
144,957.69 & NaN & NaN & NaN & NaN \\
\end{longtable}

Well, there we have it - the 1\% earn, on average, about £2000 a week.
How does that compare to people at the 90th percentile? We can access
particular \emph{rows} in a dataframe using \textbf{.loc{[}row
index{]}}; because our index is the percentile point, we can just read
it off:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{income.loc[}\DecValTok{90}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Net equivalised household income in 2010-11, week        845.54
Childless couple, annual income                       44,086.54
Couple, two children under 14                         61,721.15
Couple, three children under 14                       70,538.46
Couple with one child under 14                        52,903.85
Couple with two children aged 15 to 18                73,183.65
Couple, two children under 14 plus dependent adult    76,269.71
Single adult                                          29,537.98
Lone parent, one child under 14                       38,355.29
Lone parent, two children under 14                    47,172.60
Lone parent, two children aged 15-18                  58,635.10
ANNOTATIONS                                                 NaN
1979 to 1996-97                                           2.50%
1996-97 to 2009-10                                        1.70%
1996-97 to 2010-11                                        1.20%
Name: 90, dtype: object
\end{verbatim}

We can also select a range of values with the ``colon'' notation. This
will select the 90-95th percentiles, for example:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{income.loc[}\DecValTok{90}\NormalTok{:}\DecValTok{95}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llllllllllllllll@{}}
\toprule\noalign{}
& Net equivalised household income in 2010-11, week & Childless couple,
annual income & Couple, two children under 14 & Couple, three children
under 14 & Couple with one child under 14 & Couple with two children
aged 15 to 18 & Couple, two children under 14 plus dependent adult &
Single adult & Lone parent, one child under 14 & Lone parent, two
children under 14 & Lone parent, two children aged 15-18 & ANNOTATIONS &
1979 to 1996-97 & 1996-97 to 2009-10 & 1996-97 to 2010-11 \\
Percentile Point & & & & & & & & & & & & & & & \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
90 & 845.54 & 44,086.54 & 61,721.15 & 70,538.46 & 52,903.85 & 73,183.65
& 76,269.71 & 29,537.98 & 38,355.29 & 47,172.60 & 58,635.10 & NaN &
2.50\% & 1.70\% & 1.20\% \\
91 & 876.63 & 45,707.74 & 63,990.84 & 73,132.39 & 54,849.29 & 75,874.85
& 79,074.40 & 30,624.19 & 39,765.74 & 48,907.29 & 60,791.30 & NaN &
2.60\% & 1.70\% & 1.20\% \\
92 & 911.29 & 47,514.54 & 66,520.35 & 76,023.26 & 57,017.44 & 78,874.13
& 82,200.15 & 31,834.74 & 41,337.65 & 50,840.55 & 63,194.33 & NaN &
2.60\% & 1.80\% & 1.20\% \\
93 & 957.14 & 49,905.23 & 69,867.32 & 79,848.36 & 59,886.27 & 82,842.68
& 86,336.04 & 33,436.50 & 43,417.55 & 53,398.59 & 66,373.95 & NaN &
2.70\% & 1.80\% & 1.30\% \\
94 & 1016.37 & 52,993.38 & 74,190.73 & 84,789.40 & 63,592.05 & 87,969.00
& 91,678.54 & 35,505.56 & 46,104.24 & 56,702.91 & 70,481.19 & NaN &
2.90\% & 1.90\% & 1.30\% \\
95 & 1075.73 & 56,088.56 & 78,523.99 & 89,741.70 & 67,306.27 & 93,107.01
& 97,033.21 & 37,579.34 & 48,797.05 & 60,014.76 & 74,597.79 & NaN &
2.90\% & 2.00\% & 1.30\% \\
\end{longtable}

\hypertarget{accessing-parts-of-a-dataframe}{%
\section{Accessing parts of a
dataframe}\label{accessing-parts-of-a-dataframe}}

If we want to extract the actual value instead of just the whole row, we
need to reference the \emph{column} as well as the row. In pandas,
columns are referenced by \textbf{column name}:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{income[}\StringTok{\textquotesingle{}Net equivalised household income in 2010{-}11, week\textquotesingle{}}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Percentile Point
1       33.50
2       98.60
3      128.56
4      151.05
5      166.32
       ...   
95    1075.73
96    1174.48
97    1302.74
98    1523.31
99    2090.35
Name: Net equivalised household income in 2010-11, week, Length: 99, dtype: float64
\end{verbatim}

So, to access a particular cell, we tell Python the row and the column
(this is pretty simple - the same way we tell Excel to access cell
``A34'' meaning Column A, Row 34). One way we do that in pandas is to
select the column, and then use .loc{[}{]} on the index.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{income[}\StringTok{\textquotesingle{}Net equivalised household income in 2010{-}11, week\textquotesingle{}}\NormalTok{].loc[}\DecValTok{90}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
845.54
\end{verbatim}

We've accessed row 90 of the column called `Net equivalised household
income in 2010-11, week'; can we access the data the other way around -
can we first take the row and then specify a column? Let's try:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{income.loc[}\DecValTok{90}\NormalTok{][}\StringTok{\textquotesingle{}Net equivalised household income in 2010{-}11, week\textquotesingle{}}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
845.54
\end{verbatim}

Yes, this seems to be working fine.

\hypertarget{extension}{%
\subsection{Extension}\label{extension}}

The reason for this is that selecting the column spits out a smaller
dataframe, and all dataframes use ``loc'', so we can use that. Another
way to do this would be to use an explicit variable for the dataframe,
along the lines of:

\texttt{smallDataFrame\ =\ income{[}\textquotesingle{}Net\ equivalised\ household\ income\ in\ 2010-11,\ week\textquotesingle{}{]}}\strut \\
\texttt{smallDataFrame.loc{[}90{]}}

By doing

\texttt{income{[}\textquotesingle{}Net\ equivalised\ household\ income\ in\ 2010-11,\ week\textquotesingle{}{]}.loc{[}90{]}}

we're taking the ``smallDataFrame'' object as an implicit (or hidden)
output.

If we want to look at a few rows of data, we can use a range:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{income[}\StringTok{\textquotesingle{}Net equivalised household income in 2010{-}11, week\textquotesingle{}}\NormalTok{].loc[}\DecValTok{90}\NormalTok{:}\DecValTok{95}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Percentile Point
90     845.54
91     876.63
92     911.29
93     957.14
94    1016.37
95    1075.73
Name: Net equivalised household income in 2010-11, week, dtype: float64
\end{verbatim}

So, to recap, we can now access a particular \textbf{row} using
\emph{loc{[}index number{]}}, a particular \textbf{column} with the
square brackets formalism \emph{dataframename{[}`column name'{]}}, or
both \emph{dataframename{[}`column name'{]}.loc{[}index number{]}}.
We've made a start at being able to get to the bits of data we need.

\hypertarget{exercise}{%
\section{Exercise:}\label{exercise}}

How do the equivalised incomes of single adults and childless couples
compare? Look at the 1st, 99th and 50th percentiles and summarise what
this tells you about the value or price of coupling.

\hypertarget{examining-the-distribution}{%
\section{Examining the Distribution}\label{examining-the-distribution}}

Returning to the overall statistics, the 90th percentile earns less than
half the top percentile (``the 1\%''); if you're taking home over £800 a
week as a household, you're in the top 10\% of earners.

Consider the following questions:

\begin{enumerate}
\tightlist
\item
  How does the income of ``the 1\%'' compare with the mean and median
  across the population, as a proportion?
\item
  How does the 1\% compare with the 90th percentile (the 10\%)?
\item
  How does the 10\% compare with the median and mean?
\end{enumerate}

The 1\% earn about 60 times the poorest groups in society - and we've
made other comparisons. But that's not the whole story. Let's look at
the income graph.

In pandas, we can plot this fairly easily\ldots{}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{income[}\StringTok{\textquotesingle{}Net equivalised household income in 2010{-}11, week\textquotesingle{}}\NormalTok{].plot()}
\NormalTok{plt.title(}\StringTok{\textquotesingle{}UK Net Equivalised Income by Percentile per week, 2010{-}11\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.xlabel(}\StringTok{\textquotesingle{}Income Percentile\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.ylabel(}\StringTok{\textquotesingle{}Income (Net, Equivalised) [GBP]\textquotesingle{}}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Text(0, 0.5, 'Income (Net, Equivalised) [GBP]')
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W02. Pandas_files/figure-pdf/cell-13-output-2.png}

}

\end{figure}

We see a curve that is pretty linear in the middle region, but curves
rapidly upwards in the higher percentiles and looks more like a power
law.

\hypertarget{exercise-means}{%
\subsection{Exercise: Means}\label{exercise-means}}

Where does the mean appear here? Draw in a horizontal line to show the
mean using \textbf{axhline}. Show the median on the same graph. What is
the meaning of the median in this context?

Hint: Recall that last time we used \emph{axvline} to highlight the mean
and standard deviation by drawing vertical lines on the axis. Here, we
use \emph{axhline} to draw horizontal lines.

\hypertarget{extension-accessing-cells}{%
\subsection{Extension: Accessing
cells}\label{extension-accessing-cells}}

There are a number of ways to access elements of the dataframe: we've
shown how to access columns by the {[}\emph{`name of column'}{]} method,
and rows via the .loc{[}\emph{index}{]} method; and how we can select a
range. There are also .iloc methods to select by number rather than
name; you should become familiar with these on the documentation page
for pandas.

\hypertarget{comparing-segments}{%
\section{Comparing segments}\label{comparing-segments}}

Earlier, we compared some summary statistics of single people and
couples. Let's look at the wider curve for more than one group, now:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#This is going to throw a load of errors}
\NormalTok{income[[}\StringTok{\textquotesingle{}Single adult\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}Lone parent, one child under 14\textquotesingle{}}\NormalTok{]].plot()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
TypeError: no numeric data to plot
\end{verbatim}

\hypertarget{warning}{%
\section{Warning}\label{warning}}

This isn't looking good. There's a load of text and no graph. If you've
not seen this before, it's an error - something has gone wrong.
Generally, if we look at the \textbf{final} line, it should tell us
what's wrong, in this case there's ``no numeric data to plot'', which is
weird, because we've seen the data and have even plotted some of it.

\hypertarget{messy-data}{%
\section{Messy Data}\label{messy-data}}

DataFrames, as we are starting to see, give us the chance to plot, chop
and slice data to help us make sense of it. Here, we will create a
\textbf{new} DataFrame to take only two columns of data, and get rid of
any blank cells and any cells which are not being read as numbers -
normally a sign of a missing value or a non-numerical character. Why
could this be happening? It could be

\begin{itemize}
\item
  due to blank spaces in the text file
\item
  due to letters where there should be numbers
\item
  due to characters (``,'', ``-'', etc) that shouldn't really be there
\end{itemize}

In general, there will be some detective work required to figure out
what's wrong in our text file. Your best bet is sometimes to open up the
data in a text editor, like I've done here:

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{from}\NormalTok{ IPython.display }\ImportTok{import}\NormalTok{ Image}

\NormalTok{data\_path }\OperatorTok{=} \StringTok{"https://s3.eu{-}west{-}2.amazonaws.com/qm2/wk2/data.png"}
\NormalTok{Image(data\_path)}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W02. Pandas_files/figure-pdf/cell-15-output-1.png}

}

\end{figure}

That's a screenshot of our datafile, opened up in a text editor. As we
can see, these numbers are separated by commas and surrounded by
quotation marks - this is normal, and what .csv files are supposed to
look like. However, there are a lot of commas within the numbers - which
makes it easier for people to read, but confuses software. Luckily,
Python has a method for dealing with this - the ``replace'' method.

Unfortunately, this dataframe is quite messy, so I'm going to have to
extract just the columns of data I'm interested in to make it work. I'll
do that by creating a new dataframe:

\hypertarget{example-cleaning-data}{%
\section{Example: Cleaning data}\label{example-cleaning-data}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{clean }\OperatorTok{=}\NormalTok{ income[[}\StringTok{\textquotesingle{}Childless couple, annual income\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}Couple, two children under 14\textquotesingle{}}\NormalTok{]]}
\NormalTok{clean.head()}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}lll@{}}
\toprule\noalign{}
& Childless couple, annual income & Couple, two children under 14 \\
Percentile Point & & \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
1 & 1,746.92 & 2,445.69 \\
2 & 5,141.01 & 7,197.41 \\
3 & 6,703.11 & 9,384.36 \\
4 & 7,875.75 & 11,026.05 \\
5 & 8,671.91 & 12,140.68 \\
\end{longtable}

We see those pesky commas. Now we can get on with cleaning up the data:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{clean}\OperatorTok{=}\NormalTok{clean.replace(}\StringTok{\textquotesingle{},\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{, regex}\OperatorTok{=}\VariableTok{True}\NormalTok{)}

\CommentTok{\# In addition, missing values are sometimes written as \textquotesingle{}{-}\textquotesingle{}, in order for Python to understand that it is just a missing numerical }
\CommentTok{\# value, all \textquotesingle{}{-}\textquotesingle{} need to be replaced with \textquotesingle{}NaN\textquotesingle{}.}
\NormalTok{clean }\OperatorTok{=}\NormalTok{ clean.replace(}\StringTok{\textquotesingle{}{-}\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}NaN\textquotesingle{}}\NormalTok{, regex}\OperatorTok{=}\VariableTok{True}\NormalTok{).astype(}\StringTok{\textquotesingle{}float\textquotesingle{}}\NormalTok{)}
\NormalTok{clean.head()}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}lll@{}}
\toprule\noalign{}
& Childless couple, annual income & Couple, two children under 14 \\
Percentile Point & & \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
1 & 1746.92 & 2445.69 \\
2 & 5141.01 & 7197.41 \\
3 & 6703.11 & 9384.36 \\
4 & 7875.75 & 11026.05 \\
5 & 8671.91 & 12140.68 \\
\end{longtable}

\textbf{Extension}: ``\textbf{Regex}'' refers to ``\textbf{Reg}ular
\textbf{Ex}pression'', which is a way of replacing and cleaning text.
It's a bit beyond the scope of this class, but worth looking into if
you're interested in programming more widely.

This seems to have done the job. We've also put a line in the code to
get rid of dashes - a way that data collectors will sometimes represent
missing data. Now let's plot this.

\hypertarget{asking-more-questions-of-the-data}{%
\section{Asking more questions of the
data}\label{asking-more-questions-of-the-data}}

For me, this data starts to beg further questions. How would we answer
these?

\begin{itemize}
\item
  If the top 20\% of income shows such a sharp increase, how do we know
  that there isn't a similar uptick \emph{within} the 1\%? We've already
  seen that the mean of the dataset as a whole is much less than half
  the maximum category (it's 25\% of the maximum). What if that's
  true within the 1\%, and £2,000/week is only a fraction of what the
  0.1\%, or the 0.01\%, earn?
\item
  How does this break down for gender, or educational background, or
  other factors like ethnicity or country of origin?
\item
  Which parts of the income curve show greater gaps between these
  subgroups and what might it say about the underlying causal
  mechanisms?
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{clean.plot()}
\NormalTok{plt.title(}\StringTok{\textquotesingle{}A Modest Proposal: The fiscal benefits of childbirth\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.xlabel(}\StringTok{\textquotesingle{}Percentile\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.ylabel(}\StringTok{\textquotesingle{}Income Per Week [GBP]\textquotesingle{}}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Text(0, 0.5, 'Income Per Week [GBP]')
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W02. Pandas_files/figure-pdf/cell-18-output-2.png}

}

\end{figure}

\hypertarget{exercise-1}{%
\section{Exercise:}\label{exercise-1}}

Previously, we'd examined income gaps between single people and couples
(how very romantic). Repeat the above exercise (cleaning and plotting
income data) for the columns we used above for single people and
childless couples. Reflect and comment on the differences.

\begin{Shaded}
\begin{Highlighting}[]
\BuiltInTok{print}\NormalTok{(}\StringTok{"Enter your code here"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{Add your reflection here.}
\end{Highlighting}
\end{Shaded}

So far, we've dealt with selecting data in a particular row or column by
index or label. What if we now want to filter the data by \emph{value}?
For example, let's say I want to see the data for all Childless couples
who earn more than 50,000 (net equivalised) pounds every year. This
looks like:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{clean }\OperatorTok{=}\NormalTok{ income[[}\StringTok{\textquotesingle{}Childless couple, annual income\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}Couple, two children under 14\textquotesingle{}}\NormalTok{]]}
\NormalTok{clean }\OperatorTok{=}\NormalTok{ clean.replace(}\StringTok{\textquotesingle{},\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{, regex}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
\NormalTok{clean }\OperatorTok{=}\NormalTok{ clean.replace(}\StringTok{\textquotesingle{}{-}\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}NaN\textquotesingle{}}\NormalTok{, regex}\OperatorTok{=}\VariableTok{True}\NormalTok{).astype(}\StringTok{\textquotesingle{}float\textquotesingle{}}\NormalTok{)}
\NormalTok{clean[clean[}\StringTok{\textquotesingle{}Childless couple, annual income\textquotesingle{}}\NormalTok{]}\OperatorTok{\textgreater{}}\DecValTok{50000}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

The key line of code for selection is:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{clean[clean[}\StringTok{\textquotesingle{}Childless couple, annual income\textquotesingle{}}\NormalTok{]}\OperatorTok{\textgreater{}}\DecValTok{50000}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

Let's break this down: we're used to using \emph{dataframe}{[}\emph{some
selection}{]} from earlier. Here ``some selection'' is

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{clean[}\StringTok{\textquotesingle{}Childless couple, annual income\textquotesingle{}}\NormalTok{]}\OperatorTok{\textgreater{}}\DecValTok{50000}
\end{Highlighting}
\end{Shaded}

In other words, this command is returning a set of indices where that
statement is true. We can see this explicitly:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{clean[}\StringTok{\textquotesingle{}Childless couple, annual income\textquotesingle{}}\NormalTok{]}\OperatorTok{\textgreater{}}\DecValTok{50000}
\end{Highlighting}
\end{Shaded}

So python is picking the values where this statement is true -
i.e.~where the `Childless couple\ldots{}' column has values greater than
50000. Then this selection is passed to the dataframe, and the dataframe
shows the correct rows.

We won't dwell on comparison operators; here we've used
``\textgreater{}'' to mean ``is greater than''; you can also use:

\begin{itemize}
\tightlist
\item
  == to mean `is equal to' {[}why the double equals?{]}
\item
  != to mean `is not equal to' (the older \textless\textgreater{} form is not valid in Python 3)
\item
  \textless{} to mean `is less than'
\item
  the symbol \textgreater= to mean `is greater than or equal to'
\item
  \textless= to mean `is less than or equal to'
\end{itemize}

\hypertarget{exercise-2}{%
\section{Exercise}\label{exercise-2}}

On an appropriately labelled graph, plot the incomes of all single
adults whose net equivalised income is less than or equal to £10,000.
What proportion of the population is this?

\bookmarksetup{startatroot}

\hypertarget{extension-web-scraping}{%
\chapter{Extension: Web Scraping}\label{extension-web-scraping}}

In this example, we've been working with a .csv file that contains all
the data we want. That's not always the case. Let's say we're interested
in getting the data from a table on a website. Websites are built using
HTML code, so what we need is to figure out how to look inside the
website's code and pull out the data we want. Luckily, pandas has a
built in function that can automatically recognize HTML tables in
websites and turn them into dataframes.

Let's start with the \href{https://top10.netflix.com/}{Netflix Top 10}
website. Click on the link and have a look around. You'll notice two
tables: the first showing the top 10 films this week, and the second
(farther down) showing the most popular films based on their first 28
days on Netflix.

We can download both of these tables into python using one pandas
function: read\_html

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{url}\OperatorTok{=}\StringTok{\textquotesingle{}https://top10.netflix.com/\textquotesingle{}}

\NormalTok{tables}\OperatorTok{=}\NormalTok{pandas.read\_html(url)}

\BuiltInTok{print}\NormalTok{(tables)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
[    #  \
0   1   
1   2   
2   3   
3   4   
4   5   
5   6   
6   7   
7   8   
8   9   
9  10   

  .css-ld8rqy-container{position:relative;box-sizing:border-box;min-width:0;}.css-7pg0cj-a11yText{z-index:9999;border:0;clip:rect(1px, 1px, 1px, 1px);height:1px;width:1px;position:absolute;overflow:hidden;padding:0;white-space:nowrap;}.css-3zcu7z-control{-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;background-color:hsl(0, 0%, 100%);border-color:hsl(0, 0%, 80%);border-radius:0;border-style:solid;border-width:1px;box-shadow:none;cursor:pointer;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-flex-wrap:wrap;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;-webkit-box-pack:justify;-webkit-justify-content:space-between;justify-content:space-between;min-height:0rem;outline:0!important;position:relative;-webkit-transition:all 100ms;transition:all 100ms;box-sizing:border-box;background:transparent;border:none;padding:0px 3px;margin-left:-5px;}.css-3zcu7z-control:hover{border-color:rgba(255,255,255,0.9);}.css-zl2g27{-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;display:grid;-webkit-flex:1;-ms-flex:1;flex:1;-webkit-box-flex-wrap:wrap;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;padding:0;-webkit-overflow-scrolling:touch;position:relative;overflow:hidden;box-sizing:border-box;}.css-hlu0h4-singleValue{color:white;grid-area:1/1/2/3;margin-left:2px;margin-right:2px;max-width:100%;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;box-sizing:border-box;}Films (English).css-1a9ai41{margin:0;padding-bottom:2px;padding-top:2px;visibility:visible;color:hsl(0, 0%, 20%);-webkit-flex:1 1 auto;-ms-flex:1 1 auto;flex:1 1 auto;display:inline-grid;grid-area:1/1/2/3;grid-template-columns:0 min-content;box-sizing:border-box;padding:0;}.css-1a9ai41:after{content:attr(data-value) " 
";visibility:hidden;white-space:pre;grid-area:1/2;font:inherit;min-width:2px;border:0;margin:0;outline:0;padding:0;}.css-1wy0on6{-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;-webkit-align-self:stretch;-ms-flex-item-align:stretch;align-self:stretch;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-shrink:0;-ms-flex-negative:0;flex-shrink:0;box-sizing:border-box;}.css-1hyfx7x{display:none;}.css-xhbtlw-indicatorContainer{color:hsl(0, 0%, 80%);display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;padding:8px;-webkit-transition:color 150ms;transition:color 150ms;box-sizing:border-box;-webkit-transform:scale(0.8);-moz-transform:scale(0.8);-ms-transform:scale(0.8);transform:scale(0.8);}.css-xhbtlw-indicatorContainer:hover{color:hsl(0, 0%, 60%);}.css-xhbtlw-indicatorContainer:hover{-webkit-transform:scale(1);-moz-transform:scale(1);-ms-transform:scale(1);transform:scale(1);}  \
0                                Luckiest Girl Alive                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
1                               Mr. Harrigan's Phone                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
2                                    Last Seen Alive                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
3                                             Blonde                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
4                                                Lou                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
5                                      The Boss Baby                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
6                                               Sing                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
7                                          Marauders                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
8                                    The Redeem Team                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
9                            Minions & More Volume 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

   Weeks in Top 10  Hours viewed  
0                1      43080000  
1                1      35420000  
2                2      18810000  
3                2      17410000  
4                3      12600000  
5                1       8510000  
6                1       8420000  
7                2       8350000  
8                1       7850000  
9                3       7090000  ,     #  \
0   1   
1   2   
2   3   
3   4   
4   5   
5   6   
6   7   
7   8   
8   9   
9  10   

  .css-ld8rqy-container{position:relative;box-sizing:border-box;min-width:0;}.css-7pg0cj-a11yText{z-index:9999;border:0;clip:rect(1px, 1px, 1px, 1px);height:1px;width:1px;position:absolute;overflow:hidden;padding:0;white-space:nowrap;}.css-3zcu7z-control{-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;background-color:hsl(0, 0%, 100%);border-color:hsl(0, 0%, 80%);border-radius:0;border-style:solid;border-width:1px;box-shadow:none;cursor:pointer;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-flex-wrap:wrap;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;-webkit-box-pack:justify;-webkit-justify-content:space-between;justify-content:space-between;min-height:0rem;outline:0!important;position:relative;-webkit-transition:all 100ms;transition:all 100ms;box-sizing:border-box;background:transparent;border:none;padding:0px 3px;margin-left:-5px;}.css-3zcu7z-control:hover{border-color:rgba(255,255,255,0.9);}.css-zl2g27{-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;display:grid;-webkit-flex:1;-ms-flex:1;flex:1;-webkit-box-flex-wrap:wrap;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;padding:0;-webkit-overflow-scrolling:touch;position:relative;overflow:hidden;box-sizing:border-box;}.css-hlu0h4-singleValue{color:white;grid-area:1/1/2/3;margin-left:2px;margin-right:2px;max-width:100%;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;box-sizing:border-box;}Films (English).css-1a9ai41{margin:0;padding-bottom:2px;padding-top:2px;visibility:visible;color:hsl(0, 0%, 20%);-webkit-flex:1 1 auto;-ms-flex:1 1 auto;flex:1 1 auto;display:inline-grid;grid-area:1/1/2/3;grid-template-columns:0 min-content;box-sizing:border-box;padding:0;}.css-1a9ai41:after{content:attr(data-value) " 
";visibility:hidden;white-space:pre;grid-area:1/2;font:inherit;min-width:2px;border:0;margin:0;outline:0;padding:0;}.css-1wy0on6{-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;-webkit-align-self:stretch;-ms-flex-item-align:stretch;align-self:stretch;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-shrink:0;-ms-flex-negative:0;flex-shrink:0;box-sizing:border-box;}.css-1hyfx7x{display:none;}.css-xhbtlw-indicatorContainer{color:hsl(0, 0%, 80%);display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;padding:8px;-webkit-transition:color 150ms;transition:color 150ms;box-sizing:border-box;-webkit-transform:scale(0.8);-moz-transform:scale(0.8);-ms-transform:scale(0.8);transform:scale(0.8);}.css-xhbtlw-indicatorContainer:hover{color:hsl(0, 0%, 60%);}.css-xhbtlw-indicatorContainer:hover{-webkit-transform:scale(1);-moz-transform:scale(1);-ms-transform:scale(1);transform:scale(1);}  \
0                                         Red Notice                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
1                                      Don't Look Up                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
2                                           Bird Box                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
3                                       The Gray Man                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
4                                   The Adam Project                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
5                                         Extraction                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
6                                      Purple Hearts                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
7                                   The Unforgivable                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
8                                       The Irishman                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
9                                The Kissing Booth 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

   Hours viewed in first 28 days  
0                      364020000  
1                      359790000  
2                      282020000  
3                      253870000  
4                      233160000  
5                      231340000  
6                      228690000  
7                      214700000  
8                      214570000  
9                      209250000  ]
\end{verbatim}

When we print the results of what was scraped, it's pretty ugly. One of
the reasons is that the \texttt{tables} variable is actually a
\emph{list} of dataframes. Because there were two tables on our website,
\texttt{read\_html} has returned both of those tables and put them in a
list. Let's save the first table as a new dataframe called
\texttt{top10} and have a closer look.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{top10}\OperatorTok{=}\NormalTok{tables[}\DecValTok{0}\NormalTok{]}
\NormalTok{top10}
\end{Highlighting}
\end{Shaded}

This looks more like the dataframes we were looking at earlier. There's
a big chunk of text (this is HTML code, the language websites are built
with) where the name of the second column should be. \texttt{read\_html}
is usually pretty smart, and can actually read the column names from the
tables on the website. It seems to have gotten confused for this one
column. We can rename that column using
the \texttt{rename} function. Since we know it's the second column, we
can select it with \texttt{top10.columns{[}1{]}}:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{top10.rename(columns}\OperatorTok{=}\NormalTok{\{top10.columns[}\DecValTok{1}\NormalTok{]: }\StringTok{"Title"}\NormalTok{ \}, inplace }\OperatorTok{=} \VariableTok{True}\NormalTok{)}
\NormalTok{top10}
\end{Highlighting}
\end{Shaded}

And there we have it; a nicely formatted dataframe ready for analysis,
straight from a website.

\bookmarksetup{startatroot}

\hypertarget{assessed-question-1}{%
\chapter{Assessed Question}\label{assessed-question-1}}

Using the following URL,
\url{https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Chemistry},
create a plot of the top 10 countries in terms of Nobel laureates.
First, follow the steps below:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# scrape the table of Nobel Laureates in Chemistry using read\_html. remember, this gives us a LIST of dataframes! lets call this list chem\_tables}

\CommentTok{\# select the first dataframe from this list and call it chem}
\end{Highlighting}
\end{Shaded}

I'll help you out with this next bit. We'll be using the
\texttt{groupby} function in pandas to group our dataframe such that
each row is a country (rather than a person, as it currently is). We do
this by using
\texttt{\textless{}dataframe\textgreater{}.groupby(\textquotesingle{}\textless{}column\ name\textgreater{}\textquotesingle{})}.
Since we're aggregating, we need to tell Python how we want it to
aggregate our values. In this case, we just want to count the number of
rows for each country; we can do this using \texttt{.size()}. You can
use many different aggregation functions, e.g.~\texttt{.mean()} if you
wanted to calculate the average of a specific column.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# create a new dataframe called \textquotesingle{}country\textquotesingle{} in which each row is a country, and the values represent the number of nobel laureates. }

\CommentTok{\# now sort it in descending order}

\CommentTok{\# finally, plot the top 10 countries }
\end{Highlighting}
\end{Shaded}

\emph{QUESTION: Which country has the sixth most Nobel Prizes in
chemistry?}

\bookmarksetup{startatroot}

\hypertarget{spatiotemporal-data}{%
\chapter{Spatiotemporal Data}\label{spatiotemporal-data}}

\hypertarget{workshop-3-open-in-colab}{%
\section[\emph{Workshop 3} ]{\texorpdfstring{\emph{Workshop 3}
\href{https://colab.research.google.com/github/oballinger/QM2/blob/main/notebooks/W03.\%20Spatial\%20Data.ipynb}{\protect\includegraphics{index_files/mediabag/colab-badge.png}}}{Workshop 3 Open In Colab}}\label{workshop-3-open-in-colab}}

Sometimes the data we work with references points on the earth's
surface, unlocking a rich set of analytical possibilities. In today's
workshop, we're going to be exploring the effect of the 2020 California
Wildfires on air quality across the state. We'll be using real air
quality data collected by sensors and combining it with satellite
imagery to show how toxic smoke from wildfires swept over America's
largest state.

\hypertarget{aims-1}{%
\subsection{Aims}\label{aims-1}}

\begin{itemize}
\tightlist
\item
  Understanding spatiotemporal data
\item
  Grouping data in pandas
\item
  Manipulating and plotting geographic data
\end{itemize}

\hypertarget{background}{%
\section{Background}\label{background}}

\includegraphics{index_files/mediabag/106695701-1599664926.jpg}

The \href{https://en.wikipedia.org/wiki/2020_California_wildfires}{2020
California wildfire season} was record-setting. By the end of the year,
9,917 fires had burned more than 4\% of the state's area, making 2020
the largest wildfire season recorded in California's modern history.
California's August Complex fire has been described as the first
``gigafire'', burning over 1 million acres across seven counties, an
area larger than the state of Rhode Island. The fires destroyed over
10,000 structures and cost over \$12.079 billion (2020 USD) in damages,
including over \$10 billion in property damage and \$2.079 billion in
fire suppression costs. The intensity of the fire season has been
attributed to a combination of more than a century of poor forest
management and higher temperatures resulting from climate change.

The fires also had a
\href{https://epic.uchicago.edu/news/pollution-from-californias-2020-wildfires-likely-offset-decades-of-air-quality-gains/}{profound
effect on air quality}: ``Places that are experiencing frequent or more
frequent wildfires are going to experience higher air pollution levels,
not just for a couple of days or weeks, but it could impact the annual
level of exposure,'' said Christa Hasenkopf, director of air quality
programs at the University of Chicago institute. ``It can bump up that
average to unsafe and unhealthy levels that really do have an impact on
people's health. When we think of wildfires, we think of short-term
events --- and hopefully they are --- but they can have long-term
consequences considering your overall air pollution exposure.''

\hypertarget{getting-started}{%
\section{Getting Started}\label{getting-started}}

Let's begin by installing some libraries that we'll be working with
today.

\begin{Shaded}
\begin{Highlighting}[]
\OperatorTok{\%\%}\NormalTok{capture}
\OperatorTok{!}\NormalTok{pip install Basemap}
\OperatorTok{!}\NormalTok{pip install ipyleaflet}
\end{Highlighting}
\end{Shaded}

\hypertarget{importing-libraries}{%
\section{Importing Libraries}\label{importing-libraries}}

The first step in any python script is to import the necessary
libraries:

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd}
\ImportTok{import}\NormalTok{ matplotlib}
\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
\ImportTok{import}\NormalTok{ pylab}
\ImportTok{from}\NormalTok{ datetime }\ImportTok{import}\NormalTok{ datetime}

\OperatorTok{\%}\NormalTok{matplotlib inline}
\NormalTok{pylab.rcParams[}\StringTok{\textquotesingle{}figure.figsize\textquotesingle{}}\NormalTok{] }\OperatorTok{=}\NormalTok{ (}\DecValTok{10}\NormalTok{, }\DecValTok{8}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{downloading-data-1}{%
\section{Downloading Data}\label{downloading-data-1}}

The next step is to import the data that we need for our analysis. This
week we'll be using real data collected in 2020 by the
\href{https://www.epa.gov/outdoor-air-quality-data/download-daily-data}{Environmental
Protection Agency (EPA)}. I've generated a .csv file containing the data
that I want using the dropdown menus. The EPA also has an
\href{https://aqs.epa.gov/aqsweb/documents/data_api.html}{Application
Programming Interface} for air quality data, which you could use to pull
in data directly into python without having to download a .csv!

Let's open the .csv file and have a look at it:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df}\OperatorTok{=}\NormalTok{pd.read\_csv(}\StringTok{\textquotesingle{}https://qm2.s3.eu{-}west{-}2.amazonaws.com/wk3/california\_aqi.csv\textquotesingle{}}\NormalTok{)}
\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}lllllllllll@{}}
\toprule\noalign{}
& Date & Site ID & POC & PM & AQI & Site Name & CBSA\_NAME & COUNTY &
latitude & longitude \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 1/1/20 & 60010007 & 3 & 8.6 & 36 & Livermore & San
Francisco-Oakland-Hayward, CA & Alameda & 37.687526 & -121.784217 \\
1 & 1/2/20 & 60010007 & 3 & 4.5 & 19 & Livermore & San
Francisco-Oakland-Hayward, CA & Alameda & 37.687526 & -121.784217 \\
2 & 1/3/20 & 60010007 & 3 & 14.2 & 55 & Livermore & San
Francisco-Oakland-Hayward, CA & Alameda & 37.687526 & -121.784217 \\
3 & 1/4/20 & 60010007 & 3 & 10.9 & 45 & Livermore & San
Francisco-Oakland-Hayward, CA & Alameda & 37.687526 & -121.784217 \\
4 & 1/5/20 & 60010007 & 3 & 7.8 & 33 & Livermore & San
Francisco-Oakland-Hayward, CA & Alameda & 37.687526 & -121.784217 \\
... & ... & ... & ... & ... & ... & ... & ... & ... & ... & ... \\
55686 & 11/29/20 & 61131003 & 1 & 20.3 & 68 & Woodland-Gibson Road &
Sacramento-\/-Roseville-\/-Arden-Arcade, CA & Yolo & 38.661210 &
-121.732690 \\
55687 & 12/18/20 & 61131003 & 1 & 2.8 & 12 & Woodland-Gibson Road &
Sacramento-\/-Roseville-\/-Arden-Arcade, CA & Yolo & 38.661210 &
-121.732690 \\
55688 & 12/20/20 & 61131003 & 1 & 22.4 & 73 & Woodland-Gibson Road &
Sacramento-\/-Roseville-\/-Arden-Arcade, CA & Yolo & 38.661210 &
-121.732690 \\
55689 & 12/23/20 & 61131003 & 1 & 11.8 & 49 & Woodland-Gibson Road &
Sacramento-\/-Roseville-\/-Arden-Arcade, CA & Yolo & 38.661210 &
-121.732690 \\
55690 & 12/29/20 & 61131003 & 1 & 5.6 & 23 & Woodland-Gibson Road &
Sacramento-\/-Roseville-\/-Arden-Arcade, CA & Yolo & 38.661210 &
-121.732690 \\
\end{longtable}

Each row in this dataset is an individual reading from an air quality
sensor. The first row is a reading from sensor number 60010007 on
January 1st 2020. It is located in Alameda County, and recorded an Air
Quality Index (AQI) reading of 36. So for each sensor (uniquely
identified by the Site ID column) we will have up to 366 readings (2020
was a leap year, and not every sensor reports every day). We also
have the latitude and longitude of each one of these air quality
sensors. The presence of these fields makes this
\textbf{spatio-temporal} data. We'll first analyze the temporal
dimension of our data, before adding in the spatial dimension.

\hypertarget{temporal-data}{%
\section{Temporal Data}\label{temporal-data}}

Before we go any further, we need to focus on a very special column in
our dataset: the ``Date'' column. We'll be relying heavily on this
dimension of our dataset. Whenever we have temporal data, the first
thing we want to do is check whether pandas is storing it as datetime
information or as a string (text). We can do this using the
\texttt{dtype} attribute.

\begin{Shaded}
\begin{Highlighting}[]
\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}Prior to cleaning, the data type of the "Date" column is:\textquotesingle{}}\NormalTok{, df[}\StringTok{\textquotesingle{}Date\textquotesingle{}}\NormalTok{].dtype)}

\NormalTok{df[}\StringTok{\textquotesingle{}Date\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{pd.to\_datetime(df[}\StringTok{\textquotesingle{}Date\textquotesingle{}}\NormalTok{])}

\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}Now, it is stored as: \textquotesingle{}}\NormalTok{, df[}\StringTok{\textquotesingle{}Date\textquotesingle{}}\NormalTok{].dtype)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Prior to cleaning, the data type of the "Date" column is: object
Now, it is stored as:  datetime64[ns]
\end{verbatim}

Once we've stored the Date column as datetime information, we can do all
sorts of useful things with it. For example, we can quickly extract the
month from the date, or even the ``day of year'' (i.e., how many days
since January 1st of that year have passed). Try doing that in one line
of code if your ``Date'' column is stored as text!

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# we can extract the month from the Date column and save it as a new column }
\NormalTok{df[}\StringTok{\textquotesingle{}Month\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{df[}\StringTok{\textquotesingle{}Date\textquotesingle{}}\NormalTok{].dt.month}
\CommentTok{\# we can do the same for the day of year. }
\NormalTok{df[}\StringTok{\textquotesingle{}Day\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{df[}\StringTok{\textquotesingle{}Date\textquotesingle{}}\NormalTok{].dt.dayofyear}

\BuiltInTok{print}\NormalTok{(df[[}\StringTok{\textquotesingle{}Date\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}Month\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}Day\textquotesingle{}}\NormalTok{]])}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
            Date  Month  Day
0     2020-01-01      1    1
1     2020-01-02      1    2
2     2020-01-03      1    3
3     2020-01-04      1    4
4     2020-01-05      1    5
...          ...    ...  ...
55686 2020-11-29     11  334
55687 2020-12-18     12  353
55688 2020-12-20     12  355
55689 2020-12-23     12  358
55690 2020-12-29     12  364

[55691 rows x 3 columns]
\end{verbatim}

When I print the new columns we've made (``Month'' and ``Day'') next to
the original ``Date'' column, we can see that everything is working as
it should. The first date (January 1st, 2020) has a value of 1 in the
month column, and a 1 in the day column. The last row in the dataset was a
sensor reading taken on December 29th, 2020. It has a month of 12, and
day-of-year value of 364. Great.

\hypertarget{exercise-3}{%
\subsection{Exercise}\label{exercise-3}}

\href{https://pandas.pydata.org/docs/reference/api/pandas.Series.dt.dayofyear.html}{Here's}
the documentation for the pandas function that allowed us to extract the
day of year from the datetime column. Using the documentation on this
page, create a new column in the dataframe that contains the week of
year.

\hypertarget{grouping-data}{%
\subsection{Grouping Data}\label{grouping-data}}

We can now use the new temporal columns we've created to analyze our
data further. The broadest possible question we're interested in today
is ``What was the effect of the 2020 wildfires on air quality in
California?'' This involves looking at air quality over time, and
comparing pre/post wildfire air quality readings.

To translate that into python, we effectively want to calculate the
average AQI value for all of the sensors in California each day. We can
accomplish this using the \texttt{.groupby()} function in pandas.
\href{https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html}{Here}
is the documentation page for the function; give it a quick read.

Remember, each row in our dataframe \texttt{df} is an individual sensor
reading on a given day. We now want a dataframe in which each row is
\emph{one day}, representing the average of \emph{all AQI sensors}. We
can accomplish that using the following line of code, which has four
parts:

\texttt{df.groupby(\textquotesingle{}Day\textquotesingle{}){[}\textquotesingle{}AQI\textquotesingle{}{]}.mean()}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \texttt{df}: the dataframe we want to use
\item
  \texttt{.groupby(\textquotesingle{}Day\textquotesingle{})}: the
  groupby function, and the name of the column that we want to group our
  data by. In this case, we want each row in our new dataset to be one
  day, so we're using the ``Day'' column.
\item
  \texttt{{[}\textquotesingle{}AQI\textquotesingle{}{]}}: the data that
  we want to aggregate. Remember, our dataframe has many columns, but we
  want to calculate the average daily value of AQI.
\item
  \texttt{.mean()}: the method of aggregation. We're calculating the
  average in this case, but we could also want to take the maximum value
  (\texttt{.max()}), minimum value (\texttt{.min()}), median
  (\texttt{.median()}), etc.
\end{enumerate}

Let's look at the output from the line of code above. Remember, whenever
we make something new, we must store it somewhere or it disappears! I'm
storing this as a new dataframe called ``daily''.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{daily}\OperatorTok{=}\NormalTok{df.groupby(}\StringTok{\textquotesingle{}Day\textquotesingle{}}\NormalTok{)[}\StringTok{\textquotesingle{}AQI\textquotesingle{}}\NormalTok{].mean()}
\NormalTok{daily}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Day
1      50.255682
2      43.300000
3      50.437500
4      47.224299
5      39.240602
         ...    
362    33.500000
363    23.358209
364    30.610256
365    39.492754
366    42.532374
Name: AQI, Length: 366, dtype: float64
\end{verbatim}

Now we can see that our dataframe has 366 rows, one for each day of the
year (2020 was actually a leap year!). Let's plot the daily average of
the AQI sensors, along with a dashed vertical line indicating the day a
State of Emergency was declared (August 18th).

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# plot the daily data}
\NormalTok{daily.plot(color}\OperatorTok{=}\StringTok{\textquotesingle{}red\textquotesingle{}}\NormalTok{)}

\CommentTok{\#add title and axis labels}
\NormalTok{plt.title(}\StringTok{\textquotesingle{}Daily Air Quality Index readings in California, 2020\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.ylabel(}\StringTok{\textquotesingle{}AQI\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.xlabel(}\StringTok{\textquotesingle{}Day of Year\textquotesingle{}}\NormalTok{)}

\CommentTok{\# add a dashed black line on August 18th (the 231st day of the year)}
\NormalTok{plt.axvline(}\DecValTok{231}\NormalTok{, color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}{-}{-}\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}State of Emergency\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.legend()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
<matplotlib.legend.Legend at 0x142151040>
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W03. Spatial Data_files/figure-pdf/cell-9-output-2.png}

}

\end{figure}

Pretty cool! We can clearly see some spikes in AQI that correspond
directly to when the state of emergency was declared. Our data is
matching expectations about reality: even though there's no information
about the state of emergency or the wildfires in our dataframe
(remember, it's just a bunch of air quality readings from sensors), we
observe a relationship between our variables (presence of wildfires and
air quality) that conforms to our expectations.

\hypertarget{exercise-4}{%
\subsection{Exercise}\label{exercise-4}}

Now, repeat the above plot but aggregate the dataframe by month rather
than by day. Store the monthly data as a new dataframe called
``monthly''.

\hypertarget{geographic-disparities}{%
\subsection{Geographic Disparities}\label{geographic-disparities}}

OK. We've got a good sense of how the wildfires affected air quality
readings across the whole state. But California is huge; there are
probably geographic disparities in how bad air quality was as a result
of the fires. Let's see which counties were worst affected by the
wildfires.

In our original dataframe, each row was a reading from a given sensor on
a given day. We grouped this data by day to create a dataframe that took
the average of \emph{all} sensors in California for each day as follows:

\texttt{df.groupby(\textquotesingle{}Day\textquotesingle{}){[}\textquotesingle{}AQI\textquotesingle{}{]}.mean()}

Now, we want to plot the average daily air quality by county; this will
involve aggregating both by day \emph{and by county}. Intuitively, we
can accomplish this by changing
\texttt{\textquotesingle{}Day\textquotesingle{}} to
\texttt{{[}\textquotesingle{}Day\textquotesingle{},\textquotesingle{}COUNTY\textquotesingle{}{]}},
like so:

\texttt{df.groupby({[}\textquotesingle{}Day\textquotesingle{},\textquotesingle{}COUNTY\textquotesingle{}{]}){[}\textquotesingle{}AQI\textquotesingle{}{]}.mean()}

Let's store this new dataframe and call it ``county\_daily'':

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{county\_daily}\OperatorTok{=}\NormalTok{df.groupby([}\StringTok{\textquotesingle{}Day\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}COUNTY\textquotesingle{}}\NormalTok{,])[}\StringTok{\textquotesingle{}AQI\textquotesingle{}}\NormalTok{].mean().reset\_index()}
\NormalTok{county\_daily}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llll@{}}
\toprule\noalign{}
& Day & COUNTY & AQI \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 1 & Alameda & 44.500000 \\
1 & 1 & Butte & 66.666667 \\
2 & 1 & Calaveras & 63.000000 \\
3 & 1 & Colusa & 78.000000 \\
4 & 1 & Contra Costa & 46.000000 \\
... & ... & ... & ... \\
17314 & 366 & Tehama & 52.000000 \\
17315 & 366 & Trinity & 36.000000 \\
17316 & 366 & Tulare & 62.666667 \\
17317 & 366 & Ventura & 23.666667 \\
17318 & 366 & Yolo & 35.000000 \\
\end{longtable}

\hypertarget{exercise-5}{%
\section{Exercise}\label{exercise-5}}

Using the \texttt{groupby} function, create a new dataframe called
``counties'' in which each row is a county, and each value is the
\textbf{maximum} AQI value in that county during the entire year. Then,
sort this dataframe in descending order using
\texttt{.sort\_values(ascending=False)}

Which county had the highest maximum AQI value? Which county had the
lowest? Store the names of these counties as variables called ``highest''
and ``lowest'', as shown below:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{highest}\OperatorTok{=}\StringTok{\textquotesingle{}\textquotesingle{}}
\NormalTok{lowest}\OperatorTok{=}\StringTok{\textquotesingle{}\textquotesingle{}}

\CommentTok{\# Filter the county{-}level daily AQI readings for the worst{-}affected county}
\NormalTok{worst\_county}\OperatorTok{=}\NormalTok{county\_daily[county\_daily[}\StringTok{\textquotesingle{}COUNTY\textquotesingle{}}\NormalTok{]}\OperatorTok{==}\NormalTok{highest]}

\CommentTok{\# Filter the county{-}level daily AQI readings for the least{-}affected county}
\NormalTok{best\_county}\OperatorTok{=}\NormalTok{county\_daily[county\_daily[}\StringTok{\textquotesingle{}COUNTY\textquotesingle{}}\NormalTok{]}\OperatorTok{==}\NormalTok{lowest]}
\end{Highlighting}
\end{Shaded}

Using those two variables, let's plot the AQI values for each of these
counties individually:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# plot the data from the worst affected county}
\NormalTok{plt.plot(worst\_county[}\StringTok{\textquotesingle{}Day\textquotesingle{}}\NormalTok{], worst\_county[}\StringTok{\textquotesingle{}AQI\textquotesingle{}}\NormalTok{], label}\OperatorTok{=}\NormalTok{highest)}

\CommentTok{\# plot the data from the least affected county}
\NormalTok{plt.plot(best\_county[}\StringTok{\textquotesingle{}Day\textquotesingle{}}\NormalTok{], best\_county[}\StringTok{\textquotesingle{}AQI\textquotesingle{}}\NormalTok{], label}\OperatorTok{=}\NormalTok{lowest)}

\CommentTok{\#add title and axis labels}
\NormalTok{plt.title(}\StringTok{\textquotesingle{}Daily Air Quality Index readings in California, 2020\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.ylabel(}\StringTok{\textquotesingle{}AQI\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.xlabel(}\StringTok{\textquotesingle{}Day of Year\textquotesingle{}}\NormalTok{)}

\CommentTok{\# add a dashed black line on August 18th (the 231st day of the year)}
\NormalTok{plt.axvline(}\DecValTok{231}\NormalTok{, color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}{-}{-}\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}State of Emergency\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.legend()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
<matplotlib.legend.Legend at 0x142444c40>
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W03. Spatial Data_files/figure-pdf/cell-14-output-2.png}

}

\end{figure}

We can see that the worst affected county suffered a massive spike in
AQI following the wildfires, while the least affected county experienced
a much smaller increase in AQI.

\hypertarget{bringing-in-geography}{%
\section{Bringing in Geography}\label{bringing-in-geography}}

We can explore some limited geographic variation using the ``COUNTY''
column in our dataframe. But we actually have the latitude and longitude
of each individual sensor. We can visualize latitude and longitude data
quite simply as a scatterplot.

Remember, in our original dataframe each row is a reading from a given
sensor on a given day. The sensor's location does not vary over time, so
if we simply plot our original dataframe, we'll have loads of points on
top of each other. Let's pick a specific date, take a slice of our
dataframe on that one date, and plot it. I've picked September 9th based
on the plots above (looks like air quality was really bad).

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# create a variable with the date of interest, September 9th 2020. }
\NormalTok{date}\OperatorTok{=}\StringTok{\textquotesingle{}09{-}09{-}2020\textquotesingle{}}

\CommentTok{\# filter the original dataframe using this date}
\NormalTok{one\_day}\OperatorTok{=}\NormalTok{df[df[}\StringTok{\textquotesingle{}Date\textquotesingle{}}\NormalTok{]}\OperatorTok{==}\NormalTok{date]}

\CommentTok{\# create a scatterplot of sensor locations using latitude and longitude }
\NormalTok{plt.scatter(}
\NormalTok{    x}\OperatorTok{=}\NormalTok{one\_day[}\StringTok{\textquotesingle{}longitude\textquotesingle{}}\NormalTok{],}
\NormalTok{    y}\OperatorTok{=}\NormalTok{one\_day[}\StringTok{\textquotesingle{}latitude\textquotesingle{}}\NormalTok{])}

\CommentTok{\# as always, label our axes and the plot!}
\NormalTok{plt.xlabel(}\StringTok{"Longitude"}\NormalTok{)}
\NormalTok{plt.ylabel(}\StringTok{"Latitude"}\NormalTok{)}
\NormalTok{plt.title(}\StringTok{"Geographic Distribution of AQI sensors in California"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Text(0.5, 1.0, 'Geographic Distribution of AQI sensors in California')
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W03. Spatial Data_files/figure-pdf/cell-15-output-2.png}

}

\end{figure}

If you close your eyes and imagine the shape of California, you can
probably see its outline roughly traced in the points above. This plot
leaves a number of things to be desired.

\hypertarget{basemaps}{%
\subsection{Basemaps}\label{basemaps}}

First, we may want to add in a base map of some kind so we can have a
better sense of where each sensor is. For this, we have to import an
extra library called ``Basemap'':

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# import Basemap library}
\ImportTok{from}\NormalTok{ mpl\_toolkits.basemap }\ImportTok{import}\NormalTok{ Basemap}

\CommentTok{\# create a basemap, call it \textquotesingle{}map\textquotesingle{}}
\BuiltInTok{map} \OperatorTok{=}\NormalTok{ Basemap(projection}\OperatorTok{=}\StringTok{\textquotesingle{}lcc\textquotesingle{}}\NormalTok{, resolution}\OperatorTok{=}\StringTok{\textquotesingle{}l\textquotesingle{}}\NormalTok{, }\CommentTok{\# this selects the projection of the map.}
\NormalTok{            lat\_0}\OperatorTok{=}\FloatTok{37.5}\NormalTok{, lon\_0}\OperatorTok{={-}}\DecValTok{119}\NormalTok{, }\CommentTok{\# this sets the center of the map }
\NormalTok{            width}\OperatorTok{=}\FloatTok{1E6}\NormalTok{, height}\OperatorTok{=}\FloatTok{1.2E6}\NormalTok{) }\CommentTok{\# this sets the window that we\textquotesingle{}re looking at, in meters.}

\CommentTok{\# We can add features to our blank basemap, including coastlines, as well as state and country boundaries. }
\BuiltInTok{map}\NormalTok{.drawcoastlines(color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{)}
\BuiltInTok{map}\NormalTok{.drawcountries(color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{)}
\BuiltInTok{map}\NormalTok{.drawstates(color}\OperatorTok{=}\StringTok{\textquotesingle{}gray\textquotesingle{}}\NormalTok{)}

\CommentTok{\# Finally, we add in our AQI sensor data on top of the basemap.}
\BuiltInTok{map}\NormalTok{.scatter(}
\NormalTok{    one\_day[}\StringTok{\textquotesingle{}longitude\textquotesingle{}}\NormalTok{], }
\NormalTok{    one\_day[}\StringTok{\textquotesingle{}latitude\textquotesingle{}}\NormalTok{], }
\NormalTok{    latlon}\OperatorTok{=}\VariableTok{True}\NormalTok{)}

\CommentTok{\# as always, title your figure}
\NormalTok{plt.title(}\StringTok{"Geographic Distribution of AQI sensors in California"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
ImportError: dlopen(/Users/ollieballinger/miniconda3/envs/geo/lib/python3.9/site-packages/_geoslib.cpython-39-darwin.so, 0x0002): tried: '/Users/ollieballinger/miniconda3/envs/geo/lib/python3.9/site-packages/_geoslib.cpython-39-darwin.so' (mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64')), '/System/Volumes/Preboot/Cryptexes/OS/Users/ollieballinger/miniconda3/envs/geo/lib/python3.9/site-packages/_geoslib.cpython-39-darwin.so' (no such file), '/Users/ollieballinger/miniconda3/envs/geo/lib/python3.9/site-packages/_geoslib.cpython-39-darwin.so' (mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64'))
\end{verbatim}

That's looking a bit better! We now have a much better sense of the
actual distribution of these sensors within California. People who know
the area will recognize clusters of sensors around San Francisco and Los
Angeles; this makes sense, given that these areas have a higher
population density. However, our plot is still missing some pretty
important information: the actual AQI readings!

\hypertarget{colormaps}{%
\subsection{Colormaps}\label{colormaps}}

The whole point of plotting these sensors is to understand the spatial
distribution of air pollution from the 2020 wildfires.

The EPA published the following
\href{https://www.airnow.gov/aqi/aqi-basics/}{table} on their website,
which creates a color-coded scale of AQI values that corresponds to the
impact thereof on human health.

\begin{itemize}
\tightlist
\item
  AQI under 50 is colored green, and indicates ``Good'' air quality.
\item
  AQI between 100 and 200 is generally unhealthy
\item
  AQI over 300 is deemed hazardous.
\end{itemize}

With this in mind, quickly scroll back up to the AQI plots over time. If
you did everything correctly, you should notice that the \emph{average}
AQI value across all sensors in the worst affected county was over 600!

We'll be using the table from the EPA website to build our own color
map. In the code below, I scrape the table and turn it into a
``colormap'' (basically, a dictionary that associates numbers with
colors) that we'll use to color the AQI sensors later.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# scrape the table of AQI values and corresponding colors }
\CommentTok{\# save it as a dataframe called colors}
\NormalTok{colors}\OperatorTok{=}\NormalTok{pd.read\_html(}\StringTok{\textquotesingle{}https://www.airnow.gov/aqi/aqi{-}basics/\textquotesingle{}}\NormalTok{)[}\DecValTok{0}\NormalTok{]}

\CommentTok{\# create a numerical column for AQI values by splitting the text in the "values of index" column. }
\CommentTok{\# pull out the first string, and convert it to integer}
\NormalTok{colors[}\StringTok{\textquotesingle{}aqi\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{colors[}\StringTok{\textquotesingle{}Values of Index\textquotesingle{}}\NormalTok{].}\BuiltInTok{str}\NormalTok{.split(}\StringTok{\textquotesingle{} \textquotesingle{}}\NormalTok{).}\BuiltInTok{str}\NormalTok{[}\DecValTok{0}\NormalTok{].astype(}\BuiltInTok{int}\NormalTok{)}

\CommentTok{\# print three columns from the dataframe }
\BuiltInTok{print}\NormalTok{(colors[[}\StringTok{\textquotesingle{}aqi\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}Daily AQI Color\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}Levels of Concern\textquotesingle{}}\NormalTok{]])}

\CommentTok{\# create a "colormap" from this dataframe using the "Daily AQI Color" column, and the "aqi" column }
\NormalTok{aqi\_colors}\OperatorTok{=}\NormalTok{matplotlib.colors.LinearSegmentedColormap.from\_list(colors[}\StringTok{\textquotesingle{}aqi\textquotesingle{}}\NormalTok{],colors[}\StringTok{\textquotesingle{}Daily AQI Color\textquotesingle{}}\NormalTok{])}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
   aqi Daily AQI Color               Levels of Concern
0    0           Green                            Good
1   51          Yellow                        Moderate
2  101          Orange  Unhealthy for Sensitive Groups
3  151             Red                       Unhealthy
4  201          Purple                  Very Unhealthy
5  301          Maroon                       Hazardous
\end{verbatim}

Now, we can use this ``aqi\_colors'' object as a color palette later
when we plot the AQI sensors. This way, we will know that green and
yellow points are OK, while red and purple points represent hazardous
levels of air pollution. I've annotated the code above, but it's ok if
you don't get all of it. You could simply load a different colormap in
one line of code; check out the documentation
\href{https://matplotlib.org/stable/tutorials/colors/colormaps.html}{here}.

\begin{Shaded}
\begin{Highlighting}[]
\BuiltInTok{map} \OperatorTok{=}\NormalTok{ Basemap(projection}\OperatorTok{=}\StringTok{\textquotesingle{}lcc\textquotesingle{}}\NormalTok{, resolution}\OperatorTok{=}\StringTok{\textquotesingle{}l\textquotesingle{}}\NormalTok{, }
\NormalTok{            lat\_0}\OperatorTok{=}\FloatTok{37.5}\NormalTok{, lon\_0}\OperatorTok{={-}}\DecValTok{119}\NormalTok{,}
\NormalTok{            width}\OperatorTok{=}\FloatTok{1E6}\NormalTok{, height}\OperatorTok{=}\FloatTok{1.2E6}\NormalTok{)}

\BuiltInTok{map}\NormalTok{.drawcoastlines(color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{)}
\BuiltInTok{map}\NormalTok{.drawcountries(color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{)}
\BuiltInTok{map}\NormalTok{.drawstates(color}\OperatorTok{=}\StringTok{\textquotesingle{}gray\textquotesingle{}}\NormalTok{)}

\BuiltInTok{map}\NormalTok{.scatter(}
\NormalTok{      one\_day[}\StringTok{\textquotesingle{}longitude\textquotesingle{}}\NormalTok{], }
\NormalTok{      one\_day[}\StringTok{\textquotesingle{}latitude\textquotesingle{}}\NormalTok{], }
\NormalTok{      latlon}\OperatorTok{=}\VariableTok{True}\NormalTok{, }
\NormalTok{      c}\OperatorTok{=}\NormalTok{one\_day[}\StringTok{\textquotesingle{}AQI\textquotesingle{}}\NormalTok{], }\CommentTok{\# We\textquotesingle{}re adding that }
\NormalTok{      cmap}\OperatorTok{=}\NormalTok{aqi\_colors, }
\NormalTok{      vmin}\OperatorTok{=}\DecValTok{0}\NormalTok{, }
\NormalTok{      vmax}\OperatorTok{=}\DecValTok{300}\NormalTok{)}


\NormalTok{plt.title(}\StringTok{\textquotesingle{}Air Quality on September 9th, 2020\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.colorbar(label}\OperatorTok{=}\StringTok{\textquotesingle{}Air Quality Index\textquotesingle{}}\NormalTok{)}\OperatorTok{;}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W03. Spatial Data_files/figure-pdf/cell-18-output-1.png}

}

\end{figure}

This plot gives us a good sense of which areas were worst affected by
the wildfires on September 9th, 2020. Areas in the central valley
suffered particularly bad air quality, with AQI reaching hazardous
levels in some areas.

\hypertarget{exercise-6}{%
\subsection{Exercise}\label{exercise-6}}

So far, we've been plotting data from one day, using a dataframe we
generated by filtering the date column like so:
\texttt{one\_day=df{[}df{[}\textquotesingle{}Date\textquotesingle{}{]}==\textquotesingle{}09-09-2020\textquotesingle{}{]}}
(date format is day-month-year).

Using the code from the previous cell, generate a plot of AQI on March
2nd, 2020. After that, use the groupby function to calculate the maximum
AQI reading for each sensor and plot it.

If you've followed along this far, well done! We've come a long way from
a spreadsheet full of sensor readings. But we can go even further!

\hypertarget{advanced-satellite-imagery-and-interactivity}{%
\section{Advanced: Satellite Imagery and
Interactivity}\label{advanced-satellite-imagery-and-interactivity}}

The AQI plots we've generated above give us a good sense of where the
worst air pollution was on a given day; but we're still basically
\emph{inferring} the presence of fires. Luckily, we don't have to do
that. The plumes of smoke generated by the fires were so vast that they
were visible from space. There are a variety of satellites that image
the earth each day (some, like GOES-17, take a picture every few
minutes!).

NASA's Moderate Resolution Imaging Spectroradiometer (MODIS) satellites
take a picture of the same spot on earth nearly every day. So far, we've
been looking at September 9th as a particularly bad day for air quality
in California. Let's have a look at a satellite image from that day. A
Python library called ipyleaflet contains some useful functions that let
us pull up an interactive map of satellite imagery.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# import the map making modules from ipyleaflet}
\ImportTok{from}\NormalTok{ ipyleaflet }\ImportTok{import}\NormalTok{ Map, Marker, basemaps, basemap\_to\_tiles,Circle}
\ImportTok{from}\NormalTok{ ipywidgets }\ImportTok{import}\NormalTok{ HTML}

\CommentTok{\# let create an interactive Map object called "satellite\_map"}
\NormalTok{satellite\_map }\OperatorTok{=}\NormalTok{ Map(}
\NormalTok{  basemap}\OperatorTok{=}\NormalTok{basemap\_to\_tiles( }\CommentTok{\#this function lets us pick from a list of basemaps for our interactive map}
\NormalTok{    basemaps.NASAGIBS.ModisTerraTrueColorCR, }\StringTok{"2020{-}09{-}09"} \CommentTok{\# here we\textquotesingle{}re specifying that we want MODIS imagery, and the date that we want it from  }
\NormalTok{  ),}
\NormalTok{  center}\OperatorTok{=}\NormalTok{(}\FloatTok{36.77}\NormalTok{, }\OperatorTok{{-}}\FloatTok{119.41}\NormalTok{), }\CommentTok{\# then, we want to center the map on california. these coordinates do that}
\NormalTok{  zoom}\OperatorTok{=}\DecValTok{5}\NormalTok{, }\CommentTok{\#finally, we want to set the zoom level of the map. }
\NormalTok{)}

\CommentTok{\# once we\textquotesingle{}ve created the map object we can make it bigger or smaller. let\textquotesingle{}s make it 700 pixels tall. }
\NormalTok{satellite\_map.layout.height }\OperatorTok{=} \StringTok{\textquotesingle{}700px\textquotesingle{}}

\CommentTok{\# now, we visualize it.}
\NormalTok{satellite\_map}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Map(center=[36.77, -119.41], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom…
\end{verbatim}

This is a pretty striking image of the West Coast of the U.S. We can see
fluffy white clouds to the East and West, but in the center of the map
plumes of brown smoke emanate from wildfires in California and Oregon.
Use the + and - buttons in the top left to zoom in, and see if you can
spot some wildfires.

\hypertarget{exercise-7}{%
\subsection{Exercise}\label{exercise-7}}

Try changing the code in the cell above to display an image from
September 15th. You could even try importing a different basemap (like
nighttime lights) using this
\href{https://ipyleaflet.readthedocs.io/en/latest/map_and_basemaps/basemaps.html}{list
of basemaps}.

\hypertarget{combining-sensors-and-satellite-images}{%
\section{Combining sensors and satellite
images}\label{combining-sensors-and-satellite-images}}

A cool part of working with spatial data is that we can combine two
completely different datasets using spatial information. We can add the
AQI sensor data as points to this map.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# grab the first row from our September 9th dataframe}
\NormalTok{row}\OperatorTok{=}\NormalTok{one\_day.iloc[}\DecValTok{0}\NormalTok{]}
\BuiltInTok{print}\NormalTok{(row)}

\CommentTok{\# This part uses the AQI value in this row (72), and looks up the corresponding color in the colormap we created earlier }
\NormalTok{color}\OperatorTok{=}\NormalTok{matplotlib.colors.rgb2hex(aqi\_colors(row[}\StringTok{\textquotesingle{}AQI\textquotesingle{}}\NormalTok{]))}

\CommentTok{\# Now we create a Circle object using the latitude and longitude from the row, and color it using the color we just selected}
\NormalTok{point}\OperatorTok{=}\NormalTok{Circle(location}\OperatorTok{=}\NormalTok{(row[}\StringTok{\textquotesingle{}latitude\textquotesingle{}}\NormalTok{],row[}\StringTok{\textquotesingle{}longitude\textquotesingle{}}\NormalTok{]), color}\OperatorTok{=}\NormalTok{color)}

\CommentTok{\# Add this as a layer to the satellite\_map object}
\NormalTok{satellite\_map.add\_layer(point)}

\CommentTok{\# Display the updated map}
\NormalTok{satellite\_map}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Date                       2020-09-09 00:00:00
Site ID                               60010007
POC                                          3
PM                                        22.3
AQI                                         72
Site Name                            Livermore
CBSA_NAME    San Francisco-Oakland-Hayward, CA
COUNTY                                 Alameda
latitude                             37.687526
longitude                          -121.784217
Month                                        9
Day                                        253
Name: 249, dtype: object
\end{verbatim}

\begin{verbatim}
Map(center=[36.77, -119.41], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom…
\end{verbatim}

It's a bit hard to see, but we've plotted an AQI sensor! It's under the
cloud of smoke in the center of the map. You can zoom in to get a closer
look. It looks like AQI was pretty bad at this location.

Having plotted one point, we can now plot all the points on September
9th! To do so, we can use the \texttt{iterrows} function in Pandas
which, unsurprisingly, lets you iterate over rows in a dataframe. The first
line of code below allows us to iterate over the rows in the
\texttt{one\_day} dataframe. It will then run everything in the indented
block for each row; in other words, for each row, it will:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  use the row's value in the AQI column to select a color for the point
\item
  create a point object using the latitude and longitude columns
\item
  add that point to the satellite map.
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\ControlFlowTok{for}\NormalTok{ index, row }\KeywordTok{in}\NormalTok{ one\_day.iterrows():}
\NormalTok{  color}\OperatorTok{=}\NormalTok{matplotlib.colors.rgb2hex(aqi\_colors(row[}\StringTok{\textquotesingle{}AQI\textquotesingle{}}\NormalTok{]))}
\NormalTok{  point}\OperatorTok{=}\NormalTok{Circle(location}\OperatorTok{=}\NormalTok{(row[}\StringTok{\textquotesingle{}latitude\textquotesingle{}}\NormalTok{],row[}\StringTok{\textquotesingle{}longitude\textquotesingle{}}\NormalTok{]), color}\OperatorTok{=}\NormalTok{color)}
\NormalTok{  satellite\_map.add\_layer(point)}

\CommentTok{\# display the map}
\NormalTok{satellite\_map}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Map(center=[36.77, -119.41], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom…
\end{verbatim}

There's a pretty striking trend in this data. If you zoom in, you'll see
that the AQI sensors to the East are all green since they are up-wind
from the fires. A few kilometers downwind of the fires, the AQI sensors
display very high readings. Remember, our AQI data and the satellite
imagery are derived from totally different sources, and are totally
different types of data, but they seem to be telling us the same story.
They actually complement each other in important ways. In our original
plot of the AQI sensors without satellite imagery, we could tell that
there was bad air quality on September 9th, but some sensors were green
and others were red. The satellite image shows us that the variation in
AQI across California on September 9th was due to the direction of the
wind, blowing the smoke from the wildfires westward.

\hypertarget{extension-1}{%
\section{Extension}\label{extension-1}}

Now, to save some hassle we can package all the code we used to generate
this map into one clean function. Because we're effectively just
changing the date, we can configure this function so that we can feed it
a different date, and it will grab a satellite image and filter our
dataframe for values occurring on that day. Then, we can draw a new map
in one line of code.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{def}\NormalTok{ satellite\_plot(date):}
  
\NormalTok{  ymd}\OperatorTok{=}\NormalTok{datetime.strptime(date, }\StringTok{\textquotesingle{}}\SpecialCharTok{\%d}\StringTok{{-}\%m{-}\%Y\textquotesingle{}}\NormalTok{).strftime(}\StringTok{\textquotesingle{}\%Y{-}\%m{-}}\SpecialCharTok{\%d}\StringTok{\textquotesingle{}}\NormalTok{)}

\NormalTok{  satellite\_map }\OperatorTok{=}\NormalTok{ Map(}
\NormalTok{    basemap}\OperatorTok{=}\NormalTok{basemap\_to\_tiles(}
\NormalTok{      basemaps.NASAGIBS.ModisTerraTrueColorCR, ymd}
\NormalTok{    ),}
\NormalTok{    center}\OperatorTok{=}\NormalTok{(}\FloatTok{36.77}\NormalTok{, }\OperatorTok{{-}}\FloatTok{119.41}\NormalTok{),}
\NormalTok{    zoom}\OperatorTok{=}\DecValTok{6}\NormalTok{,}
\NormalTok{  )}

\NormalTok{  satellite\_map.layout.height }\OperatorTok{=} \StringTok{\textquotesingle{}700px\textquotesingle{}}

\NormalTok{  one\_day}\OperatorTok{=}\NormalTok{df[df[}\StringTok{\textquotesingle{}Date\textquotesingle{}}\NormalTok{]}\OperatorTok{==}\NormalTok{date]}

  \ControlFlowTok{for}\NormalTok{ index, row }\KeywordTok{in}\NormalTok{ one\_day.iterrows():}
\NormalTok{    color}\OperatorTok{=}\NormalTok{matplotlib.colors.rgb2hex(aqi\_colors(row[}\StringTok{\textquotesingle{}AQI\textquotesingle{}}\NormalTok{]))}
\NormalTok{    point}\OperatorTok{=}\NormalTok{Circle(location}\OperatorTok{=}\NormalTok{(row[}\StringTok{\textquotesingle{}latitude\textquotesingle{}}\NormalTok{],row[}\StringTok{\textquotesingle{}longitude\textquotesingle{}}\NormalTok{]), color}\OperatorTok{=}\NormalTok{color)}
\NormalTok{    point.popup }\OperatorTok{=}\NormalTok{ HTML(}\BuiltInTok{str}\NormalTok{(row[}\StringTok{\textquotesingle{}Site Name\textquotesingle{}}\NormalTok{]))    }
\NormalTok{    satellite\_map.add\_layer(point)}
  \ControlFlowTok{return}\NormalTok{ satellite\_map}
\end{Highlighting}
\end{Shaded}

Now, we can simply change the date in the function and view both
satellite imagery and AQI sensor data from a given day. Look at this
clear day from March 2nd (remember, the date format is day-month-year).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{satellite\_plot(}\StringTok{\textquotesingle{}02{-}03{-}2020\textquotesingle{}}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Map(center=[36.77, -119.41], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom…
\end{verbatim}

All the AQI sensors are showing green values, indicating generally good
air quality. The satellite image shows a few wispy clouds, but no thick
yellow smoke. Now change the date to September 15th, and see what
happens!

\bookmarksetup{startatroot}

\hypertarget{assessed-question-2}{%
\chapter{Assessed Question}\label{assessed-question-2}}

Earlier, we created a dataframe called \texttt{daily} in which we
calculated the average daily AQI across the state for every day of the
year. (hint: try re-generating this dataframe using the \texttt{Date}
column rather than the \texttt{Day} column in the \texttt{.groupby()}
function)

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Sort that dataframe to figure out which day had the \textbf{second
  worst} AQI.
\item
  Plug that date into the \texttt{satellite\_plot()} function to
  visualize the corresponding satellite image. If you've done things
  correctly so far, you should see a big plume of smoke emanating from
  the mountains of California spreading northwards.
\item
  Clicking on a sensor will reveal its name---which sensor is closest to
  the plume's origin?
\end{enumerate}

\bookmarksetup{startatroot}

\hypertarget{natural-language-processing}{%
\chapter{Natural Language
Processing}\label{natural-language-processing}}

\hypertarget{workshop-4-open-in-colab}{%
\section[\emph{Workshop 4} ]{\texorpdfstring{\emph{Workshop 4}
\href{https://colab.research.google.com/github/oballinger/QM2/blob/main/notebooks/W04.\%20Natural\%20Language\%20Processing.ipynb}{\protect\includegraphics{index_files/mediabag/colab-badge.png}}}{Workshop 4 Open In Colab}}\label{workshop-4-open-in-colab}}

\hypertarget{background-1}{%
\section{Background}\label{background-1}}

Exxon Mobil is the 4th largest oil company in the world. In 1978, an
Exxon scientist named James Black wrote an
\href{https://insideclimatenews.org/documents/james-black-1977-presentation/}{internal
briefing} called ``The Greenhouse Effect'' in which he warned: ``Present
thinking holds that man has a time window of five to ten years before
the need for hard decisions regarding changes in energy strategies might
become critical.''

Rather than acting on this information, Exxon spent the next
\href{https://news.harvard.edu/gazette/story/2021/09/oil-companies-discourage-climate-action-study-says/}{forty
years aggressively funding climate denial}. Recently,
\href{https://www.theguardian.com/environment/2022/may/24/exxon-trial-climate-crimes-fossil-fuels-global-heating}{a
U.S. court ruled} that ExxonMobil must face trial over accusations that
it lied about the climate crisis and covered up the fossil fuel
industry's role in worsening environmental devastation.

\hypertarget{earnings-calls}{%
\subsection{Earnings Calls}\label{earnings-calls}}

Every three months, Exxon conducts an
\href{https://www.investopedia.com/terms/e/earnings-call.asp}{``earnings
call''}, a conference call between the management of a public company,
analysts, investors, and the media to discuss the company's financial
results during a given reporting period, such as a quarter or a fiscal
year.

You can
\href{https://globalmeet.webcasts.com/starthere.jsp?ei=1488251\&tp_key=440e363aaf}{register}
to attend their next one if you want! No worries if you miss it, they
provide
\href{https://corporate.exxonmobil.com/Investors/Investor-relations/Investor-materials-archive\#Quarterlyearningsmaterials}{transcripts}
on their website.

These transcripts provide an intimate window into the company's
dealings. We can see how much pressure investors are putting on the
company to tackle climate change, and how the company responds.

We'll be working with transcripts spanning nearly 20 years and over 10
million words; that's like reading the Harry Potter series 10 times.
Then, we'll look at a sample of 100,000 tweets that use the \#ExxonKnew
hashtag, and analyze public pressure on the company.

I gathered the data used in this lab for two papers:

\begin{itemize}
\item
  \href{https://discovery.ucl.ac.uk/id/eprint/10196075/1/ssrn-3694447.pdf}{Transition,
  Hedge, or Resist? Understanding Political and Economic Behavior toward
  Decarbonization in the Oil and Gas Industry}
\item
  \href{https://muse.jhu.edu/article/847625}{Using Earnings Calls to
  Understand the Political Behavior of Major Polluters}
\end{itemize}

\includegraphics{index_files/mediabag/business.jpg}

\hypertarget{downloading-the-data-1}{%
\section{Downloading the Data}\label{downloading-the-data-1}}

Let's grab the data we will need this week from our course website and
save it into our data folder. If you've not already created a data
folder then do so using the following command.

Don't worry if it generates an error, that means you've already got a
data folder.

\begin{Shaded}
\begin{Highlighting}[]
\OperatorTok{\%\%}\NormalTok{capture}
\OperatorTok{!}\NormalTok{pip install spacy}
\OperatorTok{!}\NormalTok{pip install scattertext}
\OperatorTok{!}\NormalTok{pip install tika}
\OperatorTok{!}\NormalTok{pip install spacytextblob}
\OperatorTok{!}\NormalTok{python }\OperatorTok{{-}}\NormalTok{m spacy download en\_core\_web\_sm}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ spacy}
\ImportTok{import}\NormalTok{ json}
\ImportTok{import}\NormalTok{ pylab}
\ImportTok{from}\NormalTok{ IPython.core.display }\ImportTok{import}\NormalTok{ display, HTML}
\ImportTok{import}\NormalTok{ nltk}
\ImportTok{from}\NormalTok{ tika }\ImportTok{import}\NormalTok{ parser}
\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd}
\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
\ImportTok{from}\NormalTok{ spacytextblob.spacytextblob }\ImportTok{import}\NormalTok{ SpacyTextBlob}

\OperatorTok{\%}\NormalTok{matplotlib inline}
\NormalTok{pylab.rcParams[}\StringTok{\textquotesingle{}figure.figsize\textquotesingle{}}\NormalTok{] }\OperatorTok{=}\NormalTok{ (}\FloatTok{10.}\NormalTok{, }\FloatTok{8.}\NormalTok{)}
\NormalTok{nlp }\OperatorTok{=}\NormalTok{ spacy.load(}\StringTok{"en\_core\_web\_sm"}\NormalTok{)}
\NormalTok{nlp.add\_pipe(}\StringTok{\textquotesingle{}spacytextblob\textquotesingle{}}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
/var/folders/v8/1pmfp2n52yg0xbnknpm7_7xh0000gn/T/ipykernel_83777/1724223035.py:4: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display
  from IPython.core.display import display, HTML
\end{verbatim}

\begin{verbatim}
<spacytextblob.spacytextblob.SpacyTextBlob at 0x295bf4290>
\end{verbatim}

\hypertarget{downloading-and-reading-one-earnings-call}{%
\section{Downloading and reading one earnings
call}\label{downloading-and-reading-one-earnings-call}}

Exxon hosts earnings calls on their website in PDF form. Usually, working
with PDFs is a real pain as they are not machine-readable. Using a
python package called
\href{https://www.geeksforgeeks.org/parsing-pdfs-in-python-with-tika/}{tika},
we can ``parse'' a pdf, turning it into machine-readable text:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# define the URL where your PDF lives. You could also upload your own pdf.}
\CommentTok{\#url=\textquotesingle{}https://corporate.exxonmobil.com/{-}/media/Global/Files/investor{-}relations/quarterly{-}earnings/earnings{-}transcripts/2022{-}earnings{-}transcripts/1Q22{-}XOM{-}Earnings{-}Call{-}Transcript{-}4{-}29{-}22.pdf\textquotesingle{}}
\NormalTok{url}\OperatorTok{=}\StringTok{\textquotesingle{}https://d1io3yog0oux5.cloudfront.net/\_74d009918ead0ec6acdd6bbaf27a8316/exxonmobil/db/2288/22123/earnings\_release/XOM+2Q23+Earnings+Press+Release+Website.pdf\textquotesingle{}}
\CommentTok{\# parse the pdf by feeding tika the URL and store the text in an object called "raw" }
\NormalTok{raw }\OperatorTok{=}\NormalTok{ parser.from\_file(url)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
2024-10-25 10:27:34,041 [MainThread  ] [INFO ]  Retrieving https://d1io3yog0oux5.cloudfront.net/_74d009918ead0ec6acdd6bbaf27a8316/exxonmobil/db/2288/22123/earnings_release/XOM+2Q23+Earnings+Press+Release+Website.pdf to /var/folders/v8/1pmfp2n52yg0xbnknpm7_7xh0000gn/T/_74d009918ead0ec6acdd6bbaf27a8316-exxonmobil-db-2288-22123-earnings_release-xom-2q23-earnings-press-release-website.pdf.
2024-10-25 10:27:34,668 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar to /var/folders/v8/1pmfp2n52yg0xbnknpm7_7xh0000gn/T/tika-server.jar.
2024-10-25 10:27:36,009 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar.md5 to /var/folders/v8/1pmfp2n52yg0xbnknpm7_7xh0000gn/T/tika-server.jar.md5.
2024-10-25 10:27:36,403 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2024-10-25 10:27:41,409 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
\end{verbatim}

Now, we have an object called ``raw'' that contains some useful
information. Notice the squiggly brackets; this is a dictionary. It
contains several fields, including some useful metadata such as the
author:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{raw[}\StringTok{\textquotesingle{}metadata\textquotesingle{}}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
{'pdf:unmappedUnicodeCharsPerPage': ['0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0'],
 'pdf:PDFVersion': '1.4',
 'pdf:docinfo:title': 'f8k2Q23991',
 'xmp:CreatorTool': 'Workiva',
 'pdf:hasXFA': 'false',
 'access_permission:modify_annotations': 'true',
 'access_permission:can_print_degraded': 'true',
 'X-TIKA:Parsed-By-Full-Set': ['org.apache.tika.parser.DefaultParser',
  'org.apache.tika.parser.pdf.PDFParser'],
 'X-TIKA:content_handler': 'ToTextContentHandler',
 'dc:creator': 'anonymous',
 'pdf:num3DAnnotations': '0',
 'dcterms:created': '2023-07-27T20:47:01Z',
 'dcterms:modified': '2023-07-27T20:47:01Z',
 'dc:format': 'application/pdf; version=1.4',
 'pdf:docinfo:creator_tool': 'Workiva',
 'pdf:overallPercentageUnmappedUnicodeChars': '0.0',
 'access_permission:fill_in_form': 'true',
 'pdf:docinfo:modified': '2023-07-27T20:47:01Z',
 'pdf:hasCollection': 'false',
 'pdf:encrypted': 'false',
 'dc:title': 'f8k2Q23991',
 'pdf:containsNonEmbeddedFont': 'false',
 'Content-Length': '390379',
 'pdf:hasMarkedContent': 'false',
 'Content-Type': 'application/pdf',
 'pdf:docinfo:creator': 'anonymous',
 'pdf:producer': 'Wdesk Fidelity Content Translations Version 008.001.041',
 'pdf:totalUnmappedUnicodeChars': '0',
 'access_permission:extract_for_accessibility': 'true',
 'access_permission:assemble_document': 'true',
 'xmpTPg:NPages': '21',
 'resourceName': "b'_74d009918ead0ec6acdd6bbaf27a8316-exxonmobil-db-2288-22123-earnings_release-xom-2q23-earnings-press-release-website.pdf'",
 'pdf:hasXMP': 'false',
 'pdf:charsPerPage': ['3253',
  '1836',
  '2622',
  '2094',
  '2793',
  '2139',
  '1705',
  '2838',
  '6498',
  '6705',
  '3175',
  '2213',
  '1628',
  '1616',
  '997',
  '2786',
  '995',
  '1292',
  '994',
  '1189',
  '719'],
 'access_permission:extract_content': 'true',
 'access_permission:can_print': 'true',
 'X-TIKA:Parsed-By': ['org.apache.tika.parser.DefaultParser',
  'org.apache.tika.parser.pdf.PDFParser'],
 'X-TIKA:parse_time_millis': '1968',
 'X-TIKA:embedded_depth': '0',
 'access_permission:can_modify': 'true',
 'pdf:docinfo:producer': 'Wdesk Fidelity Content Translations Version 008.001.041',
 'pdf:docinfo:created': '2023-07-27T20:47:01Z',
 'pdf:containsDamagedFont': 'false'}
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{date}\OperatorTok{=}\NormalTok{raw[}\StringTok{\textquotesingle{}metadata\textquotesingle{}}\NormalTok{][}\StringTok{\textquotesingle{}dcterms:created\textquotesingle{}}\NormalTok{]}
\NormalTok{title}\OperatorTok{=}\NormalTok{raw[}\StringTok{\textquotesingle{}metadata\textquotesingle{}}\NormalTok{][}\StringTok{\textquotesingle{}dc:title\textquotesingle{}}\NormalTok{]}
\NormalTok{raw\_text}\OperatorTok{=}\NormalTok{raw[}\StringTok{\textquotesingle{}content\textquotesingle{}}\NormalTok{]}

\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}Date: \textquotesingle{}}\NormalTok{, date)}
\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}Title: \textquotesingle{}}\NormalTok{, title)}
\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}Word Count: \textquotesingle{}}\NormalTok{, }\BuiltInTok{len}\NormalTok{(raw\_text))}
\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}Text:\textquotesingle{}}\NormalTok{)}
\NormalTok{raw\_text}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Date:  2023-07-27T20:47:01Z
Title:  f8k2Q23991
Word Count:  52501
Text:
\end{verbatim}

\begin{verbatim}
"\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nf8k2Q23991\n\n\n2Q 2023 Earnings Release\nFOR IMMEDIATE RELEASE July 28, 2023\n\nExxonMobil Announces Second-Quarter 2023 Results\n• Structural earnings improvements contributed to strong second-quarter earnings of $7.9 billion\n\n• Achieved record quarterly production in the Permian and Guyana, demonstrating excellent operational \nperformance\n\n• Highest second-quarter global refinery throughput in the last 15 years1\n\n• Expanded leadership in carbon capture and storage by agreeing to acquire Denbury and reaching 5 million \nmetric tons per year of CO2 offtake contracts with industrial customers2 \n\nResults Summary\n\n2Q23 1Q23\nChange\n\nvs \n1Q23\n\n2Q22\nChange \n\nvs \n2Q22\n\nDollars in millions (except per share data) YTD \n2023\n\nYTD \n2022\n\nChange \nvs YTD\n2022\n\n 7,880  11,430  -3,550  17,850  -9,970 Earnings (U.S. GAAP)  19,310  23,330  -4,020 \n\n 7,874  11,618  -3,744  17,551  -9,677 Earnings Excluding Identified Items (non-GAAP)  19,492  26,384  -6,892 \n\n 1.94  2.79  -0.85  4.21  -2.27 Earnings Per Common Share 3  4.73  5.49  -0.76 \n\n 1.94  2.83  -0.89  4.14  -2.20 Earnings Excl. Identified Items Per Common Share 3  4.77  6.21  -1.44 \n\n 6,166  6,380  -214  4,609  +1,557 Capital and Exploration Expenditures  12,546  9,513  +3,033 \n\nSPRING, Texas – July 28, 2023 – Exxon Mobil Corporation today announced second-quarter 2023 earnings of $7.9 \nbillion, or $1.94 per share assuming dilution. 
Capital and exploration expenditures were $6.2 billion in the second \nquarter and $12.5 billion for the first half of 2023, in line with the company's full-year guidance of $23 billion to $25 \nbillion.\n\n“The work we've been doing to improve our underlying profitability is reflected in our second-quarter results, which \ndoubled from what we earned in a comparable industry commodity price environment4 just five years ago,” said \nDarren Woods, chairman and chief executive officer.\n\n“Earnings totaled more than $19 billion during the first half of the year, and we are on track to structurally reduce \ncosts by $9 billion at year end compared to 2019. Production is up 20% year-over-year in Guyana and the Permian, \nand we are playing a leading role in the industry's energy transition with an agreement to acquire Denbury and with \nthree world-scale CO2 offtake agreements. This reflects the significant opportunity to profitably grow our Low Carbon \nSolutions business by creating a compelling customer decarbonization proposition with the potential to reduce Gulf \nCoast industrial emissions by 100 million metric tons per year5.”\n\n1 Highest second-quarter global refinery throughput in the last 15 years (2009-2023) based on current refinery circuit.\n2 Based on contracts to move 5 MTA starting in 2025 subject to additional investment by ExxonMobil and permitting for carbon capture and storage projects.\n3 Assuming dilution.\n4 Based on ExxonMobil's assessment of historical industry commodity prices and margins referencing Intercontinental Exchange (ICE), S&P Global Platts, IHS \n\nMarkit as well as company estimates and analysis, the second-quarter 2023 industry commodity price environment is comparable to the second-quarter of \n2018. 
General industry commodity price environment comparisons may not be a complete match for individual segments.\n\n5 Subject to additional investment by ExxonMobil and permitting for carbon capture and storage projects.\n\n1\n\n\n\n1Q23 to 2Q23 Factor Analysis\n\nEarnings/(Loss) ($M)\n\n11,430 +188 11,618\n\n-3,958\n\n+501\n\n-141 -38 -108\n\n7,874 +6 7,880\n\n1Q23 \nEarnings\n\n(U.S. \nGAAP)\n\nIdentified \nItems\n\n1Q23 \nEarnings \n\nExcl. \nIdentified \n\nItems \n(non-\n\nGAAP)\n\nPrice /\nMargin\n\nUnsettled\nDerivatives\n\n(MTM)\n\nVolume \n/ Mix\n\nExpenses Other 2Q23 \nEarnings \n\nExcl. \nIdentified \n\nItems \n(non-\n\nGAAP)\n\nIdentified \nItems\n\n2Q23 \nEarnings \n\n(U.S. \nGAAP)\n\nCash Flow ($G)\n\n32.7\n+9.4\n\n-5.6\n\n+1.3 +0.1\n\n-8.0 -0.2\n\n29.6\n\n1Q23 Cash\n(U.S. GAAP)\n\nCFO Cash Capex Asset Sales Debt Shareholder \nDistributions\n\nOther 2Q23 Cash\n(U.S. GAAP)\n\nSecond-Quarter 2023 Financial Highlights\n• Earnings were $7.9 billion compared with first-quarter earnings of $11.4 billion. Excluding the identified item \n\nassociated with additional European taxes on the energy sector, earnings were $7.9 billion compared with $11.6 \nbillion in the prior quarter. \n\n• Lower natural gas realizations and industry refining margins adversely impacted earnings. Results benefited \nfrom the absence of prior quarter unfavorable derivative mark-to-market impacts.\n\n• The company remains on track to deliver $9 billion of structural cost savings by the end of 2023 relative to 2019, \nhaving achieved cumulative structural cost savings of $8.3 billion to date.\n\n• Cash flow from operations totaled $9.4 billion and free cash flow was $5.0 billion, which includes a net working \ncapital impact of $3.6 billion primarily driven by higher seasonal cash tax payments. Cash flow from operations \nexcluding working capital was $13.0 billion. 
The company's debt-to-capital ratio remained at 17% and net-debt-\nto-capital ratio was 5%, reflecting a period-end cash balance of $29.6 billion.\n\n• The three new central organizations formed this past quarter, Global Business Solutions, ExxonMobil Supply \nChain, and Global Trading, are off to a good start, further leveraging the company's scale and integrated \nbusiness model to lower cost and improve performance.\n\n2\n\n\n\nShareholder Distributions\n• Second-quarter shareholder distributions of $8.0 billion included $4.3 billion of share repurchases and $3.7 \n\nbillion of dividends.\n\n• The Corporation declared a third-quarter dividend of $0.91 per share, payable on Sept. 11, 2023, to shareholders \nof record of Common Stock at the close of business on Aug. 16, 2023.\n\nADVANCING CLIMATE SOLUTIONS\n\nCarbon Capture and Storage1\n\n• Already a global leader in carbon capture and storage (CCS), ExxonMobil expanded its position further by \nentering into a definitive agreement to acquire Denbury Inc. The planned acquisition provides ExxonMobil with \none of the largest owned and operated carbon dioxide (CO2) pipeline networks in the United States at 1,300 \nmiles, most of which is located along the U.S. Gulf Coast, one of the largest U.S. markets for CO2 emissions. \nThe planned acquisition includes 10 strategically located onshore sequestration sites as well as Denbury's 20-\nplus years of expertise in transporting and storing CO2. 
An established, cost-efficient transportation and storage \nsystem accelerates CCS deployment for ExxonMobil and third-party customers and underpins multiple low-\ncarbon value chains including CCS, hydrogen, ammonia, biofuels, and direct air capture.\n\n• ExxonMobil and Nucor Corporation, one of North America’s largest steel producers, have entered into a long-\nterm commercial agreement in which ExxonMobil, subject to government permitting, will capture, transport, and \nstore up to 800,000 metric tons of CO2 per year from Nucor’s steel manufacturing site in Convent, Louisiana. \nThe project, expected to start up in 2026, will tie into the same CO2 infrastructure that will be used by the \ncompany’s project with CF Industries.\n\nThe agreement with Nucor is the third CCS agreement announced in the past twelve months and brings the total \ncontracted CO2 to transport and store for third-party customers to 5 million metric tons per year. That is \nequivalent to replacing approximately 2 million gasoline-powered cars with electric vehicles2, which is roughly \nequal to the number of electric vehicles on U.S. roads today.\n\n1 The emission reduction outcome of these projects is subject to the timing and regulatory approval of necessary permits, acquisition of rights of way, changes in \nregulatory policy, supply chain disruptions, and other market conditions.\n\n2 ExxonMobil analysis based on assumptions for U.S. in 2022, including average distance traveled, fuel efficiency, average power grid carbon intensity, electric \nvehicle charging efficiency and other factors. Gas-powered cars include light-duty vehicles (cars, light trucks and SUVs).\n\n3\n\n\n\n.\n\nEARNINGS AND VOLUME SUMMARY BY SEGMENT\n\nUpstream\n2Q23 1Q23 2Q22 Dollars in millions (unless otherwise noted) YTD \n\n2023\nYTD \n2022\n\nEarnings/(Loss) (U.S. GAAP)\n 920  1,632  3,749 United States  2,552  6,125 \n 3,657  4,825  7,622 Non-U.S.  
8,482  9,734 \n 4,577  6,457  11,371 Worldwide  11,034  15,859 \n\nEarnings/(Loss) Excluding Identified Items (non-GAAP)\n 920  1,632  3,450 United States  2,552  5,826 \n 3,669  4,983  7,622 Non-U.S.  8,652  12,989 \n 4,589  6,615  11,072 Worldwide  11,204  18,815 \n\n 3,608  3,831  3,732 Production (koebd)  3,719  3,704 \n\n• Upstream second-quarter earnings were $4.6 billion, a decrease of $1.9 billion from the first quarter. The main \nfactors were lower natural gas prices, which declined 40%, and seasonally higher scheduled maintenance. \nIdentified items unfavorably impacted earnings by $12 million this quarter, down from $158 million in the \nprevious quarter. Earnings excluding identified items decreased from $6.6 billion in the first quarter to $4.6 \nbillion in the second quarter.\n\n• Compared to the same quarter last year, earnings decreased $6.8 billion. Excluding identified items, earnings \ndeclined $6.5 billion, driven by lower crude and natural gas realizations. Production in Guyana and the Permian \ngrew by a combined 20% compared to the prior-year quarter. The increase was offset by impacts from \ndivestments, the Sakhalin-1 expropriation, and government-mandated curtailments.\n\n• Year-to-date earnings were $11.0 billion, a decrease of $4.8 billion versus the first half of 2022. The prior-year \nperiod was negatively impacted by an identified item associated with the Sakhalin-1 expropriation. Excluding \nidentified items, earnings declined $7.6 billion year-over-year. Higher production from advantaged projects in \nGuyana and the Permian provided a partial offset to lower crude and natural gas realizations. Year-to-date \nproduction was 3.7 million oil-equivalent barrels per day. Excluding divestments, entitlements, government \nmandates, and the Sakhalin-1 expropriation, net production grew by more than 160,000 oil-equivalent barrels \nper day driven by Guyana and the Permian.   
\n\n4\n\n\n\nEnergy Products\n2Q23 1Q23 2Q22 Dollars in millions (unless otherwise noted) YTD \n\n2023\nYTD \n2022\n\nEarnings/(Loss) (U.S. GAAP)\n 1,528  1,910  2,655 United States  3,438  3,144 \n 782  2,273  2,617 Non-U.S.  3,055  1,933 \n 2,310  4,183  5,273 Worldwide  6,493  5,077 \n\nEarnings/(Loss) Excluding Identified Items (non-GAAP)\n 1,528  1,910  2,655 United States  3,438  3,144 \n 764  2,303  2,617 Non-U.S.  3,067  1,933 \n 2,292  4,213  5,273 Worldwide  6,505  5,077 \n\n 5,658  5,277  5,310 Energy Products Sales (kbd)  5,469  5,211 \n\n• Energy Products second-quarter earnings totaled $2.3 billion, down $1.9 billion from the first quarter. Industry \nmargins declined sequentially from a strong first quarter on weaker diesel margins as Russian supply concerns \neased. Lower margins were partially offset by higher volumes from the first full quarter of the Beaumont refinery \nexpansion, lower scheduled maintenance, and continued strong reliability.\n\n• Compared to the same quarter last year, earnings decreased $3.0 billion from lower industry refining margins, \npartly offset by increased marketing and trading contributions. \n\n• Year-to-date earnings were $6.5 billion, an increase of $1.4 billion versus the first half of 2022. Margins \nimproved as higher marketing and trading contributions more than offset declining industry refining margins. In \naddition, the impact from higher volumes, mainly from the start-up of the Beaumont refinery expansion and  \nimproved reliability, was partly offset by higher planned maintenance expense.\n\nChemical Products\n2Q23 1Q23 2Q22 Dollars in millions (unless otherwise noted) YTD \n\n2023\nYTD \n2022\n\nEarnings/(Loss) (U.S. GAAP)\n 486  324  625 United States  810  1,395 \n 342  47  450 Non-U.S.  389  1,086 \n 828  371  1,076 Worldwide  1,199  2,481 \n\nEarnings/(Loss) Excluding Identified Items (non-GAAP)\n 486  324  625 United States  810  1,395 \n 342  47  450 Non-U.S.  
389  1,086 \n 828  371  1,076 Worldwide  1,199  2,481 \n\n 4,849  4,649  4,811 Chemical Products Sales (kt)  9,498  9,829 \n\n• Chemical Products second-quarter earnings were $828 million, up from $371 million in the first quarter, mainly \non improved margins from lower feed costs. Earnings also benefited from lower planned maintenance expense \nand increased sales volumes.\n\n• Compared to the same quarter last year, earnings decreased $248 million on weaker industry margins and \nunfavorable volume/mix effects.\n\n• Year-to-date earnings were $1.2 billion, a decrease of $1.3 billion versus the first half of 2022, driven by weaker \nindustry margins, lower sales volumes reflecting softer market fundamentals in the first quarter, and higher \nplanned maintenance.\n\n• The Baytown chemical expansion project, which will add 750 kta of performance chemicals production, \nachieved mechanical completion in the second quarter, with a phased start-up expected in the third quarter this \nyear.\n\n5\n\n\n\nSpecialty Products\n2Q23 1Q23 2Q22 Dollars in millions (unless otherwise noted) YTD \n\n2023\nYTD \n2022\n\nEarnings/(Loss) (U.S. GAAP)\n 373  451  232 United States  824  478 \n 298  323  185 Non-U.S.  621  415 \n 671  774  417 Worldwide  1,445  893 \n\nEarnings/(Loss) Excluding Identified Items (non-GAAP)\n 373  451  232 United States  824  478 \n 298  323  185 Non-U.S.  621  415 \n 671  774  417 Worldwide  1,445  893 \n\n 1,905  1,940  2,100 Specialty Products Sales (kt)  3,845  4,107 \n\n• Specialty Products earnings were $671 million, down $103 million from the first quarter. Lower basestock \nmargins and higher scheduled maintenance expense were partly offset by favorable tax items.\n\n• Compared to the same quarter last year, earnings increased by $254 million. Stronger finished lubes and \nbasestock margins were partially offset by lower sales volumes. \n\n• Year-to-date earnings were $1.4 billion, an increase of $552 million versus the first half of 2022. 
Both basestock \nand finished lubes margins improved from lower feed costs, partially offset by lower sales volumes.\n\n• During the second quarter, ExxonMobil announced it is planning to build a lubricants manufacturing plant in \nRaigad, India. The new plant is expected to produce 159,000 kiloliters of finished lubricants per year to help \nmeet demand growth in India, with start-up expected by year-end 2025.\n\nCorporate and Financing\n2Q23 1Q23 2Q22 Dollars in millions (unless otherwise noted) YTD \n\n2023\nYTD \n2022\n\n (506)  (355)  (286) Earnings/(Loss) (U.S. GAAP)  (861)  (980) \n (506)  (355)  (286) Earnings/(Loss) Excluding Identified Items (non-GAAP)  (861)  (882) \n\n• Corporate and Financing reported net charges of $506 million. This was an increase of $151 million versus the \nfirst quarter driven by unfavorable foreign exchange impacts and tax items.  \n\n• Compared to the same quarter last year, net charges increased $220 million. Unfavorable tax items and foreign \nexchange impacts were partly offset by lower financing costs.\n\n• Year-to-date charges were $861 million, a decrease of $119 million compared to the first half of 2022. Excluding \nthe identified item associated with the Sakhalin-1 expropriation, net charges decreased $21 million. \n\n6\n\n\n\n.\n\nCASH FLOW FROM OPERATIONS AND ASSET SALES EXCLUDING WORKING \nCAPITAL\n\n2Q23 1Q23 2Q22 Dollars in millions (unless otherwise noted) YTD \n2023\n\nYTD \n2022\n\n 8,153  11,843  18,574 Net income/(loss) including noncontrolling interests  19,996  24,324 \n 4,242  4,244  4,451 Depreciation and depletion (includes impairments)  8,486  13,334 \n (3,583)  (302)  (2,747) Changes in operational working capital, excluding cash and debt  (3,885)  (1,661) \n 571  556  (315) Other  1,127  (1,246) \n 9,383  16,341  19,963 Cash Flow from Operating Activities (U.S. 
GAAP)  25,724  34,751 \n\n 1,287  854  939 Proceeds from asset sales and returns of investments  2,141  1,232 \n 10,670  17,195  20,902 Cash Flow from Operations and Asset Sales (non-GAAP)  27,865  35,983 \n\n 3,583  302  2,747 Exclude changes in operational working capital, excluding cash and debt  3,885  1,661 \n\n 14,253  17,497  23,649 Cash Flow from Operations and Asset Sales excluding Working Capital\n(non-GAAP)  31,750  37,644 \n\n (1,287)  (854)  (939) Exclude proceeds from asset sales and returns of investments  (2,141)  (1,232) \n 12,966  16,643  22,710 Cash Flow from Operations excluding Working Capital (non-GAAP)  29,609  36,412 \n\nFREE CASH FLOW\n\n2Q23 1Q23 2Q22 Dollars in millions (unless otherwise noted) YTD \n2023\n\nYTD \n2022\n\n 9,383  16,341  19,963 Cash Flow from Operating Activities (U.S. GAAP)  25,724  34,751 \n (5,359)  (5,412)  (3,837) Additions to property, plant and equipment  (10,771)  (7,748) \n (389)  (445)  (226) Additional investments and advances  (834)  (643) \n 105  78  60 Other investing activities including collection of advances  183  150 \n 1,287  854  939 Proceeds from asset sales and returns of investments  2,141  1,232 \n 5,027  11,416  16,899 Free Cash Flow (non-GAAP)  16,443  27,742 \n\n7\n\n\n\nCALCULATION OF STRUCTURAL COST SAVINGS\n\nDollars in billions (unless otherwise noted) Twelve Months\nEnded December 31,\n\nSix Months \nEnded June 30,\n\n2019 2022 2022 2023\nComponents of Operating Costs\nFrom ExxonMobil’s Consolidated Statement of Income\n(U.S. 
GAAP)\nProduction and manufacturing expenses  36.8  42.6  20.9  18.3 \nSelling, general and administrative expenses  11.4  10.1  4.9  4.8 \nDepreciation and depletion (includes impairments)  19.0  24.0  13.3  8.5 \nExploration expenses, including dry holes  1.3  1.0  0.5  0.3 \nNon-service pension and postretirement benefit expense  1.2  0.5  0.2  0.3 \nSubtotal  69.7  78.2  39.9  32.2 \nExxonMobil’s share of equity company expenses (non-GAAP)  9.1  13.0  5.8  5.0 \nTotal Adjusted Operating Costs (non-GAAP)  78.8  91.2  45.7  37.2 \n\nTotal Adjusted Operating Costs (non-GAAP)  78.8  91.2  45.7  37.2 \nLess:\nDepreciation and depletion (includes impairments)  19.0  24.0  13.3  8.5 \nNon-service pension and postretirement benefit expense  1.2  0.5  0.2  0.3 \nOther adjustments (includes equity company depreciation \nand depletion)  3.6  3.5  1.8  1.5 \n\nTotal Cash Operating Expenses (Cash Opex) (non-GAAP)  55.0  63.2  30.4  26.9 \n\nEnergy and production taxes (non-GAAP)  11.0  23.8  11.0  7.5 \nTotal Cash Operating Expenses (Cash Opex) excluding \nEnergy and Production Taxes (non-GAAP)  44.0  39.4  19.4  19.4 \n\nChange\n vs \n\n2019\n\nChange \nvs \n\n2022\n\nEstimated  \nCumulative \n\nvs \n2019\n\nTotal Cash Operating Expenses (Cash Opex) excluding \nEnergy and Production Taxes (non-GAAP) -4.6 0.0\n\nMarket +2.7 +0.4\nActivity/Other +0.1 +0.5\nStructural Savings -7.4 -0.9 -8.3\n\nThis press release also references structural cost savings. Structural cost savings describe decreases in cash opex excluding \nenergy and production taxes as a result of operational efficiencies, workforce reductions, and other cost-saving measures that \nare expected to be sustainable compared to 2019 levels. Relative to 2019, estimated cumulative structural cost savings totaled \n$8.3 billion, which included an additional $0.9 billion in the first six months of 2023. 
The total change between periods in \nexpenses above will reflect both structural cost savings and other changes in spend, including market factors, such as inflation \nand foreign exchange impacts, as well as changes in activity levels and costs associated with new operations. Estimates of \ncumulative annual structural savings may be revised depending on whether cost reductions realized in prior periods are \ndetermined to be sustainable compared to 2019 levels. For example, in 2Q23 we recognized an additional $0.5 billion of prior \nperiod reductions that we now view as structurally sustainable. Structural cost savings are stewarded internally to support \nmanagement's oversight of spending over time. This measure is useful for investors to understand the Corporation's efforts to \noptimize spending through disciplined expense management.\n\n8\n\n\n\nExxonMobil will discuss financial and operating results and other matters during a webcast at 7:30 a.m. Central Time \non July 28, 2023. To listen to the event or access an archived replay, please visit www.exxonmobil.com.\n\nImportant Information about the Transaction and Where to Find It \nIn connection with the proposed transaction between Exxon Mobil Corporation (“ExxonMobil”) and Denbury Inc. (“Denbury”), \nExxonMobil and Denbury will file relevant materials with the Securities and Exchange Commission (the “SEC”), including a \nregistration statement on Form S-4 filed by ExxonMobil that will include a proxy statement of Denbury that also constitutes a \nprospectus of ExxonMobil. A definitive proxy statement/prospectus will be mailed to stockholders of Denbury. This \ncommunication is not a substitute for the registration statement, proxy statement or prospectus or any other document that \nExxonMobil or Denbury (as applicable) may file with the SEC in connection with the proposed transaction. 
BEFORE MAKING \nANY VOTING OR INVESTMENT DECISION, INVESTORS AND SECURITY HOLDERS OF EXXONMOBIL AND DENBURY \nARE URGED TO READ THE REGISTRATION STATEMENT, THE PROXY STATEMENT/PROSPECTUS AND ANY OTHER \nRELEVANT DOCUMENTS THAT ARE FILED OR WILL BE FILED WITH THE SEC, AS WELL AS ANY AMENDMENTS OR \nSUPPLEMENTS TO THESE DOCUMENTS, CAREFULLY AND IN THEIR ENTIRETY WHEN THEY BECOME AVAILABLE \nBECAUSE THEY CONTAIN OR WILL CONTAIN IMPORTANT INFORMATION ABOUT THE PROPOSED TRANSACTION \nAND RELATED MATTERS.  Investors and security holders may obtain free copies of the registration statement and the proxy \nstatement/prospectus (when they become available), as well as other filings containing important information about ExxonMobil \nor Denbury, without charge at the SEC’s Internet website (http://www.sec.gov). Copies of the documents filed with the SEC by \nExxonMobil will be available free of charge on ExxonMobil’s internet website at www.exxonmobil.com under the tab “investors” \nand then under the tab “SEC Filings” or by contacting ExxonMobil’s Investor Relations Department at \ninvestor.relations@exxonmobil.com. Copies of the documents filed with the SEC by Denbury will be available free of charge on \nDenbury’s internet website at https://investors.denbury.com/investors/financial-information/sec-filings/ or by directing a request \nto Denbury Inc., ATTN: Investor Relations, 5851 Legacy Circle, Suite 1200, Plano, TX 75024, Tel. No. (972) 673-2000. The \ninformation included on, or accessible through, ExxonMobil’s or Denbury’s website is not incorporated by reference into this \ncommunication.\n\nParticipants in the Solicitation\nExxonMobil, Denbury, their respective directors and certain of their respective executive officers may be deemed to be \nparticipants in the solicitation of proxies in respect of the proposed transaction. 
Information about the directors and executive \nofficers of Denbury is set forth in its proxy statement for its 2023 annual meeting of stockholders, which was filed with the SEC \non April 18, 2023, and in its Form 10-K for the year ended December 31, 2022, which was filed with the SEC on February 23, \n2023.  Information about the directors and executive officers of ExxonMobil is set forth in its proxy statement for its 2023 annual \nmeeting of stockholders, which was filed with the SEC on April 13, 2023, and in its Form 10-K for the year ended December 31, \n2022, which was filed with the SEC on February 22, 2023.  Additional information regarding the participants in the proxy \nsolicitations and a description of their direct or indirect interests, by security holdings or otherwise, will be contained in the proxy \nstatement/prospectus and other relevant materials filed with the SEC when they become available.  \nNo Offer or Solicitation\nThis communication is for informational purposes and is not intended to, and shall not, constitute an offer to sell or the \nsolicitation of an offer to buy any securities or a solicitation of any vote or approval, nor shall there be any offer, solicitation or \nsale of securities in any jurisdiction in which such offer, solicitation or sale would be unlawful prior to registration or qualification \nunder the securities laws of any such jurisdiction. No offering of securities shall be made except by means of a prospectus \nmeeting the requirements of Section 10 of the U.S. Securities Act of 1933, as amended.\n\nCautionary Statement\n\nStatements related to outlooks; projections; descriptions of strategic, operating, and financial plans and objectives; statements \nof future ambitions and plans; and other statements of future events or conditions in this release, are forward-looking \nstatements. 
Similarly, discussion of future carbon capture, transportation and storage, as well as biofuel, hydrogen and other \nplans to reduce emissions are dependent on future market factors, such as continued technological progress, policy support and \ntimely rule-making and permitting, and represent forward-looking statements. Actual future results, including financial and \noperating performance; total capital expenditures and mix, including allocations of capital to low carbon solutions; structural \nearnings improvement and structural cost reductions and efficiency gains, including the ability to offset inflationary pressure; \nplans to reduce future emissions and emissions intensity; ambitions to reach Scope 1 and Scope 2 net zero from operated \nassets by 2050, plans to reach net zero Scope 1 and 2 emissions in Upstream Permian Basin unconventional operated assets \nby 2030, eliminating routine flaring in-line with World Bank Zero Routine Flaring, reaching near-zero methane emissions from its \noperations, meeting ExxonMobil’s emission reduction goals and plans, divestment and start-up plans, and associated project \nplans as well as technology efforts; timing and outcome of projects related to the capture, transportation and storage of CO2, \nand produced biofuels, including completion of the Denbury acquisition; changes in law, taxes, or regulation including \nenvironmental and tax regulations, trade sanctions, and timely granting of governmental permits and certifications; timing and \noutcome of hydrogen projects; cash flow, dividends and shareholder returns, including the timing and amounts of share \nrepurchases; future debt levels and credit ratings; business and project plans, timing, costs, capacities and returns; and \nresource recoveries and production rates, could differ materially due to a number of factors. 
These include global or regional \n\n9\n\n\n\nchanges in the supply and demand for oil, natural gas, petrochemicals, and feedstocks and other market factors, economic \nconditions and seasonal fluctuations that impact prices and differentials for our products; government policies supporting lower \ncarbon investment opportunities such as the U.S. Inflation Reduction Act or policies limiting the attractiveness of future \ninvestment such as the additional European taxes on the energy sector; variable impacts of trading activities on our margins and \nresults each quarter; actions of competitors and commercial counterparties; the outcome of commercial negotiations, including \nfinal agreed terms and conditions; the ability to access debt markets; the ultimate impacts of COVID-19 or other public health \ncrises, including the effects of government responses on people and economies; reservoir performance, including variability and \ntiming factors applicable to unconventional resources; the level and outcome of exploration projects and decisions to invest in \nfuture reserves; timely completion of development and other construction projects; final management approval of future projects \nand any changes in the scope, terms, or costs of such projects as approved; government policies and support and market \ndemand for low carbon technologies; war, civil unrest, attacks against the company or industry and other political or security \ndisturbances; expropriations, seizure, or capacity, insurance or shipping limitations by foreign governments or laws; \nopportunities for potential acquisitions, investments or divestments and satisfaction of applicable conditions to closing, including \ntimely regulatory approvals; the capture of efficiencies within and between business lines and the ability to maintain near-term \ncost reductions as ongoing efficiencies; unforeseen technical or operating difficulties and unplanned maintenance; the \ndevelopment and competitiveness of 
alternative energy and emission reduction technologies; the results of research programs \nand the ability to bring new technologies to commercial scale on a cost-competitive basis; and other factors discussed under \nItem 1A. Risk Factors of ExxonMobil’s 2022 Form 10-K.  \n\nForward-looking and other statements regarding our environmental, social and other sustainability efforts and aspirations are \nnot an indication that these statements are necessarily material to investors or requiring disclosure in our filing with the SEC. In \naddition, historical, current, and forward-looking environmental, social and sustainability-related statements may be based on \nstandards for measuring progress that are still developing, internal controls and processes that continue to evolve, and \nassumptions that are subject to change in the future, including future rule-making.\n\nFrequently Used Terms and Non-GAAP Measures\n\nThis press release includes cash flow from operations and asset sales (non-GAAP). Because of the regular nature of our asset \nmanagement and divestment program, the company believes it is useful for investors to consider proceeds associated with the \nsales of subsidiaries, property, plant and equipment, and sales and returns of investments together with cash provided by \noperating activities when evaluating cash available for investment in the business and financing activities. A reconciliation to net \ncash provided by operating activities for the 2022 and 2023 periods is shown on page 7.\n\nThis press release also includes cash flow from operations excluding working capital (non-GAAP), and cash flow from \noperations and asset sales excluding working capital (non-GAAP). The company believes it is useful for investors to consider \nthese numbers in comparing the underlying performance of the company's business across periods when there are significant \nperiod-to-period differences in the amount of changes in working capital. 
A reconciliation to net cash provided by operating \nactivities for the 2022 and 2023 periods is shown on page 7. \n\nThis press release also includes earnings/(loss) excluding identified items (non-GAAP), which are earnings/(loss) excluding \nindividually significant non-operational events with, typically, an absolute corporate total earnings impact of at least $250 million \nin a given quarter. The earnings/(loss) impact of an identified item for an individual segment may be less than $250 million when \nthe item impacts several periods or several segments. Earnings/(loss) excluding identified items does include non-operational \nearnings events or impacts that are generally below the $250 million threshold utilized for identified items. When the effect of \nthese events is significant in aggregate, it is indicated in analysis of period results as part of quarterly earnings press release \nand teleconference materials. Management uses these figures to improve comparability of the underlying business across \nmultiple periods by isolating and removing significant non-operational events from business results. The Corporation believes \nthis view provides investors increased transparency into business results and trends and provides investors with a view of the \nbusiness as seen through the eyes of management. Earnings excluding identified items is not meant to be viewed in isolation or \nas a substitute for net income/(loss) attributable to ExxonMobil as prepared in accordance with U.S. GAAP. A reconciliation to \nearnings is shown for 2023 and 2022 periods in Attachments II-a and II-b. Corresponding per share amounts are shown on \npage 1 and in Attachment II-a, including a reconciliation to earnings/(loss) per common share – assuming dilution (U.S. GAAP).\n\nThis press release also includes total taxes including sales-based taxes. 
This is a broader indicator of the total tax burden on the \nCorporation’s products and earnings, including certain sales and value-added taxes imposed on and concurrent with revenue-\nproducing transactions with customers and collected on behalf of governmental authorities (“sales-based taxes”). It combines \n“Income taxes” and “Total other taxes and duties” with sales-based taxes, which are reported net in the income statement. The \ncompany believes it is useful for the Corporation and its investors to understand the total tax burden imposed on the \nCorporation’s products and earnings. A reconciliation to total taxes is shown in Attachment I-a.\n\nThis press release also references free cash flow (non-GAAP). Free cash flow is the sum of net cash provided by operating \nactivities and net cash flow used in investing activities. This measure is useful when evaluating cash available for financing \nactivities, including shareholder distributions, after investment in the business. Free cash flow is not meant to be viewed in \n\n10\n\n\n\nisolation or as a substitute for net cash provided by operating activities. A reconciliation to net cash provided by operating \nactivities for the 2022 and 2023 periods is shown on page 7. \n\nReferences to resources or resource base may include quantities of oil and natural gas classified as proved reserves, as well as \nquantities that are not yet classified as proved reserves, but that are expected to be ultimately recoverable. The term “resource \nbase” or similar terms are not intended to correspond to SEC definitions such as “probable” or “possible” reserves. 
A \nreconciliation of production excluding divestments, entitlements, and government mandates to actual production is contained in \nthe Supplement to this release included as Exhibit 99.2 to the Form 8-K filed the same day as this news release.\n\nThe term “project” as used in this news release can refer to a variety of different activities and does not necessarily have the \nsame meaning as in any government payment transparency reports. Projects or plans may not reflect investment decisions \nmade by the company. Individual opportunities may advance based on a number of factors, including availability of supportive \npolicy, technology for cost-effective abatement, and alignment with our partners and other stakeholders. The company may refer \nto these opportunities as projects in external disclosures at various stages throughout their progression.\n\nGovernment mandates are changes to ExxonMobil’s sustainable production levels as a result of production limits or sanctions \nimposed by governments.\n\nThis press release also references structural cost savings, for more details see page 8.\n\nReference to Earnings\n\nReferences to corporate earnings mean net income attributable to ExxonMobil (U.S. GAAP) from the consolidated income \nstatement. Unless otherwise indicated, references to earnings, Upstream, Energy Products, Chemical Products, Specialty \nProducts and Corporate and Financing segment earnings, and earnings per share are ExxonMobil’s share after excluding \namounts attributable to noncontrolling interests. \n\nExxon Mobil Corporation has numerous affiliates, many with names that include ExxonMobil, Exxon, Mobil, Esso, and XTO. For \nconvenience and simplicity, those terms and terms such as Corporation, company, our, we, and its are sometimes used as \nabbreviated references to specific affiliates or affiliate groups. Similarly, ExxonMobil has business relationships with thousands \nof customers, suppliers, governments, and others. 
For convenience and simplicity, words such as venture, joint venture, \npartnership, co-venturer, and partner are used to indicate business and other relationships involving common activities and \ninterests, and those words may not indicate precise legal relationships. ExxonMobil's ambitions, plans and goals do not \nguarantee any action or future performance by its affiliates or Exxon Mobil Corporation's responsibility for those affiliates' actions \nand future performance, each affiliate of which manages its own affairs.  \n\nThroughout this press release, both Exhibit 99.1 as well as Exhibit 99.2, due to rounding, numbers presented may not add up \nprecisely to the totals indicated.\n\n11\n\n\n\n.\n\nATTACHMENT I-a\n\nCONDENSED CONSOLIDATED STATEMENT OF INCOME\n(Preliminary)\n\nDollars in millions (unless otherwise noted)\nThree Months Ended \n\nJune 30,\nSix Months Ended\n\nJune 30,\n2023 2022 2023 2022\n\nRevenues and other income\nSales and other operating revenue  80,795  111,265  164,439  198,999 \nIncome from equity affiliates  1,382  3,688  3,763  6,226 \nOther income  737  728  1,276  956 \nTotal revenues and other income  82,914  115,681  169,478  206,181 \nCosts and other deductions\nCrude oil and product purchases  47,598  65,613  93,601  118,001 \nProduction and manufacturing expenses  8,860  10,686  18,296  20,927 \nSelling, general and administrative expenses  2,449  2,530  4,839  4,939 \nDepreciation and depletion (includes impairments)  4,242  4,451  8,486  13,334 \nExploration expenses, including dry holes  133  286  274  459 \nNon-service pension and postretirement benefit expense  164  120  331  228 \nInterest expense  249  194  408  382 \nOther taxes and duties  7,563  6,868  14,784  14,422 \nTotal costs and other deductions  71,258  90,748  141,019  172,692 \nIncome/(Loss) before income taxes  11,656  24,933  28,459  33,489 \nIncome tax expense/(benefit)  3,503  6,359  8,463  9,165 \nNet income/(loss) including noncontrolling interests  
8,153  18,574  19,996  24,324 \nNet income/(loss) attributable to noncontrolling interests  273  724  686  994 \nNet income/(loss) attributable to ExxonMobil  7,880  17,850  19,310  23,330 \n\nOTHER FINANCIAL DATA\n\nDollars in millions (unless otherwise noted)\nThree Months Ended \n\nJune 30,\nSix Months Ended\n\nJune 30,\n2023 2022 2023 2022\n\nEarnings per common share (U.S. dollars)  1.94  4.21  4.73  5.49 \nEarnings per common share - assuming dilution (U.S. dollars)  1.94  4.21  4.73  5.49 \n\nDividends on common stock\nTotal  3,701  3,727  7,439  7,487 \nPer common share (U.S. dollars)  0.91  0.88  1.82  1.76 \n\nMillions of common shares outstanding\nAverage - assuming dilution  4,066  4,233  4,084  4,248 \n\nTaxes\nIncome taxes  3,503  6,359  8,463  9,165 \nTotal other taxes and duties  8,328  7,779  16,423  16,228 \nTotal taxes  11,831  14,138  24,886  25,393 \nSales-based taxes  6,281  6,857  12,313  12,957 \nTotal taxes including sales-based taxes  18,112  20,995  37,199  38,350 \n\nExxonMobil share of income taxes of equity companies  498  2,133  1,733  3,180 \n\n12\n\n\n\n.\n\nATTACHMENT I-b\n\nCONDENSED CONSOLIDATED BALANCE SHEET\n(Preliminary)\n\n \n\nDollars in millions (unless otherwise noted) June\n 30, 2023\n\nDecember\n31, 2022\n\nASSETS\nCurrent assets\nCash and cash equivalents  29,528  29,640 \nCash and cash equivalents – restricted  29  25 \nNotes and accounts receivable – net  35,915  41,749 \nInventories\n\nCrude oil, products and merchandise  20,006  20,434 \nMaterials and supplies  4,243  4,001 \n\nOther current assets  2,039  1,782 \nTotal current assets  91,760  97,631 \nInvestments, advances and long-term receivables  47,273  49,793 \nProperty, plant and equipment – net  206,736  204,692 \nOther assets, including intangibles – net  17,479  16,951 \nTotal Assets  363,248  369,067 \n\nLIABILITIES\nCurrent liabilities\nNotes and loans payable  3,929  634 \nAccounts payable and accrued liabilities  54,404  63,197 \nIncome taxes payable  
3,482  5,214 \nTotal current liabilities  61,815  69,045 \nLong-term debt  37,567  40,559 \nPostretirement benefits reserves  10,278  10,045 \nDeferred income tax liabilities  23,460  22,874 \nLong-term obligations to equity companies  2,036  2,338 \nOther long-term obligations  21,095  21,733 \nTotal Liabilities  156,251  166,594 \n\nEQUITY\nCommon stock without par value\n(9,000 million shares authorized, 8,019 million shares issued)  16,029  15,752 \nEarnings reinvested  444,731  432,860 \nAccumulated other comprehensive income  (12,657)  (13,270) \nCommon stock held in treasury \n(4,016 million shares at June 30, 2023, and 3,937 million shares at December 31, 2022)  (249,057)  (240,293) \nExxonMobil share of equity  199,046  195,049 \nNoncontrolling interests  7,951  7,424 \nTotal Equity  206,997  202,473 \nTotal Liabilities and Equity  363,248  369,067 \n\n13\n\n\n\n.\n\nATTACHMENT I-c\n\nCONDENSED CONSOLIDATED STATEMENT OF CASH FLOWS\n(Preliminary)\n\n \n\nDollars in millions (unless otherwise noted)\nSix Months Ended \n\nJune 30,\n2023 2022\n\nCASH FLOWS FROM OPERATING ACTIVITIES\nNet income/(loss) including noncontrolling interests  19,996  24,324 \nDepreciation and depletion (includes impairments)  8,486  13,334 \nChanges in operational working capital, excluding cash and debt  (3,885)  (1,661) \nAll other items – net  1,127  (1,246) \nNet cash provided by operating activities  25,724  34,751 \n\nCASH FLOWS FROM INVESTING ACTIVITIES\nAdditions to property, plant and equipment  (10,771)  (7,748) \nProceeds from asset sales and returns of investments  2,141  1,232 \nAdditional investments and advances  (834)  (643) \nOther investing activities including collection of advances  183  150 \nNet cash used in investing activities  (9,281)  (7,009) \n\nCASH FLOWS FROM FINANCING ACTIVITIES\nAdditions to long-term debt  136  — \nReductions in long-term debt  (6)  — \nReductions in short-term debt  (172)  (2,336) \nAdditions/(Reductions) in debt with three months or 
less maturity  (172)  1,303 \nContingent consideration payments  (68)  (58) \nCash dividends to ExxonMobil shareholders  (7,439)  (7,487) \nCash dividends to noncontrolling interests  (293)  (123) \nChanges in noncontrolling interests  11  (697) \nCommon stock acquired  (8,680)  (5,986) \nNet cash provided by (used in) financing activities  (16,683)  (15,384) \nEffects of exchange rate changes on cash  132  (299) \nIncrease/(Decrease) in cash and cash equivalents  (108)  12,059 \nCash and cash equivalents at beginning of period  29,665  6,802 \nCash and cash equivalents at end of period  29,557  18,861 \n\n14\n\n\n\n.\n\nATTACHMENT II-a\n\nKEY FIGURES: IDENTIFIED ITEMS\n\n2Q23 1Q23 2Q22 Dollars in Millions (unless otherwise noted) YTD \n2023\n\nYTD \n2022\n\n 7,880  11,430  17,850 Earnings/(Loss) (U.S. GAAP)  19,310  23,330 \n\nIdentified Items\n —  —  — Impairments  —  (2,975) \n —  —  299 Gain/(Loss) on sale of assets  —  299 \n 6  (188)  — Tax-related items  (182)  — \n —  —  — Other  —  (378) \n 6  (188)  299 Total Identified Items  (182)  (3,054) \n\n 7,874  11,618  17,551 Earnings/(Loss) Excluding Identified Items (non-GAAP)  19,492  26,384 \n\n2Q23 1Q23 2Q22 Dollars Per Common Share YTD \n2023\n\nYTD \n2022\n\n 1.94  2.79  4.21 Earnings/(Loss) Per Common Share ¹ (U.S. GAAP)  4.73  5.49 \n\nIdentified Items Per Common Share ¹\n —  —  — Impairments  —  (0.70) \n —  —  0.07 Gain/(Loss) on sale of assets  —  0.07 \n\n0.00  (0.04)  — Tax-related items  (0.04)  — \n —  —  — Other  —  (0.09) \n\n0.00  (0.04)  0.07 Total Identified Items Per Common Share ¹  (0.04)  (0.72) \n\n 1.94  2.83  4.14 Earnings/(Loss) Excl. Identified Items Per Common Share ¹ (non-GAAP)  4.77  6.21 \n¹ Assuming dilution.\n\n15\n\n\n\n.\nATTACHMENT II-b\n\nKEY FIGURES: IDENTIFIED ITEMS BY SEGMENT\nSecond Quarter 2023 Upstream Energy Products Chemical Products Specialty Products Corporate\n\n&\nFinancing\n\nTotal\nDollars in millions (unless otherwise noted) U.S. Non-U.S. U.S. Non-U.S. U.S. 
Non-U.S. U.S. Non-U.S.\nEarnings/(Loss) (U.S. GAAP)  920  3,657  1,528  782  486  342  373  298  (506)  7,880 \n\nIdentified Items\nTax-related items  —  (12)  —  18  —  —  —  —  —  6 \n\nTotal Identified Items  —  (12)  —  18  —  —  —  —  —  6 \n\nEarnings/(Loss) Excl. Identified Items \n(non-GAAP)  920  3,669  1,528  764  486  342  373  298  (506)  7,874 \n\nFirst Quarter 2023 Upstream Energy Products Chemical Products Specialty Products Corporate\n&\n\nFinancing\nTotal\n\nDollars in millions (unless otherwise noted) U.S. Non-U.S. U.S. Non-U.S. U.S. Non-U.S. U.S. Non-U.S.\nEarnings/(Loss) (U.S. GAAP)  1,632  4,825  1,910  2,273  324  47  451  323  (355)  11,430 \n\nIdentified Items\nTax-related items  —  (158)  —  (30)  —  —  —  —  —  (188) \n\nTotal Identified Items  —  (158)  —  (30)  —  —  —  —  —  (188) \n\nEarnings/(Loss) Excl. Identified Items \n(non-GAAP)  1,632  4,983  1,910  2,303  324  47  451  323  (355)  11,618 \n\nSecond Quarter 2022 Upstream Energy Products Chemical Products Specialty Products Corporate\n&\n\nFinancing\nTotal\n\nDollars in millions (unless otherwise noted) U.S. Non-U.S. U.S. Non-U.S. U.S. Non-U.S. U.S. Non-U.S.\nEarnings/(Loss) (U.S. GAAP)  3,749  7,622  2,655  2,617  625  450  232  185  (286)  17,850 \n\nIdentified Items\nGain/(Loss) on sale of assets  299  —  —  —  —  —  —  —  —  299 \n\nTotal Identified Items  299  —  —  —  —  —  —  —  —  299 \n\nEarnings/(Loss) Excl. Identified Items \n(non-GAAP)  3,450  7,622  2,655  2,617  625  450  232  185  (286)  17,551 \n\nYTD 2023 Upstream Energy Products Chemical Products Specialty Products Corporate\n&\n\nFinancing\nTotal\n\nDollars in millions (unless otherwise noted) U.S. Non-U.S. U.S. Non-U.S. U.S. Non-U.S. U.S. Non-U.S.\nEarnings/(Loss) (U.S. GAAP)  2,552  8,482  3,438  3,055  810  389  824  621  (861)  19,310 \n\nIdentified Items\nTax-related items  —  (170)  —  (12)  —  —  —  —  —  (182) \n\nTotal Identified Items  —  (170)  —  (12)  —  —  —  —  —  (182) \n\nEarnings/(Loss) Excl. 
Identified Items \n(non-GAAP)  2,552  8,652  3,438  3,067  810  389  824  621  (861)  19,492 \n\nYTD 2022 Upstream Energy Products Chemical Products Specialty Products Corporate\n&\n\nFinancing\nTotal\n\nDollars in millions (unless otherwise noted) U.S. Non-U.S. U.S. Non-U.S. U.S. Non-U.S. U.S. Non-U.S.\nEarnings/(Loss) (U.S. GAAP)  6,125  9,734  3,144  1,933  1,395  1,086  478  415  (980)  23,330 \n\nIdentified Items\nImpairments  —  (2,877)  —  —  —  —  —  —  (98)  (2,975) \nGain/(Loss) on sale of assets  299  —  —  —  —  —  —  —  —  299 \nOther  —  (378)  —  —  —  —  —  —  —  (378) \n\nTotal Identified Items  299  (3,255)  —  —  —  —  —  —  (98)  (3,054) \n\nEarnings/(Loss) Excl. Identified Items \n(non-GAAP)  5,826  12,989  3,144  1,933  1,395  1,086  478  415  (882)  26,384 \n\n16\n\n\n\n.\nATTACHMENT III\n\nKEY FIGURES: UPSTREAM VOLUMES\n\n2Q23 1Q23 2Q22 Net production of crude oil, natural gas liquids, bitumen and \nsynthetic oil, thousand barrels per day (kbd)\n\nYTD \n2023\n\nYTD \n2022\n\n 785  820  777 United States  802  765 \n 618  670  556 Canada/Other Americas  645  516 \n 4  4  4 Europe  4  4 \n 206  220  224 Africa  213  240 \n 702  749  691 Asia  725  714 \n 38  32  46 Australia/Oceania  35  43 \n 2,353  2,495  2,298 Worldwide  2,424  2,282 \n\n2Q23 1Q23 2Q22 Net natural gas production available for sale, million cubic feet per \nday (mcfd)\n\nYTD \n2023\n\nYTD \n2022\n\n 2,346  2,367  2,699 United States  2,357  2,738 \n 97  94  180 Canada/Other Americas  94  180 \n 375  548  825 Europe  461  798 \n 86  134  67 Africa  110  63 \n 3,350  3,597  3,320 Asia  3,473  3,330 \n 1,275  1,276  1,515 Australia/Oceania  1,276  1,421 \n 7,529  8,016  8,606 Worldwide  7,771  8,530 \n\n 3,608  3,831  3,732 Oil-equivalent production (koebd)¹  3,719  3,704 \n\n1 Natural gas is converted to an oil-equivalent basis at six million cubic feet per one thousand barrels.\n\n17\n\n\n\n.\nATTACHMENT IV\n\nKEY FIGURES: MANUFACTURING THROUGHPUT AND SALES\n\n2Q23 1Q23 2Q22 
Refinery throughput, thousand barrels per day (kbd) YTD \n2023\n\nYTD \n2022\n\n 1,944  1,643  1,686 United States  1,794  1,686 \n 388  417  413 Canada  403  406 \n 1,209  1,189  1,164 Europe  1,199  1,179 \n 463  565  532 Asia Pacific  514  534 \n 169  184  193 Other  176  180 \n 4,173  3,998  3,988 Worldwide  4,086  3,985 \n\n2Q23 1Q23 2Q22 Energy Products sales, thousand barrels per day (kbd) YTD \n2023\n\nYTD \n2022\n\n 2,743  2,459  2,452 United States  2,601  2,358 \n 2,916  2,818  2,858 Non-U.S.  2,867  2,853 \n 5,658  5,277  5,310 Worldwide  5,469  5,211 \n\n 2,401  2,177  2,208 Gasolines, naphthas  2,290  2,161 \n 1,842  1,770  1,755 Heating oils, kerosene, diesel  1,806  1,739 \n 344  312  350 Aviation fuels  328  319 \n 228  215  228 Heavy fuels  221  238 \n 844  803  769 Other energy products  823  753 \n 5,658  5,277  5,310 Worldwide  5,469  5,211 \n\n2Q23 1Q23 2Q22 Chemical Products sales, thousand metric tons (kt) YTD \n2023\n\nYTD \n2022\n\n 1,725  1,561  1,998 United States  3,286  4,030 \n 3,124  3,088  2,812 Non-U.S.  6,212  5,798 \n 4,849  4,649  4,811 Worldwide  9,498  9,829 \n\n2Q23 1Q23 2Q22 Specialty Products sales, thousand metric tons (kt) YTD \n2023\n\nYTD \n2022\n\n 514  476  590 United States  991  1,111 \n 1,391  1,464  1,511 Non-U.S.  2,855  2,995 \n 1,905  1,940  2,100 Worldwide  3,845  4,107 \n\n18\n\n\n\n.\n\nATTACHMENT V\n\nKEY FIGURES: CAPITAL AND EXPLORATION EXPENDITURES\n\n2Q23 1Q23 2Q22 Dollars in millions (unless otherwise noted) YTD \n2023\n\nYTD \n2022\n\nUpstream\n 2,206  2,108  1,644 United States  4,314  3,013 \n 2,403  2,473  1,983 Non-U.S.  4,876  4,493 \n 4,609  4,581  3,627 Total  9,190  7,506 \n\nEnergy Products\n 349  358  300 United States  707  692 \n 382  327  206 Non-U.S.  709  380 \n 731  685  506 Total  1,416  1,072 \n\nChemical Products\n 152  285  250 United States  437  481 \n 507  546  169 Non-U.S.  
1,053  374 \n 659  831  419 Total  1,490  855 \n\nSpecialty Products\n 14  11  14 United States  25  19 \n 89  80  42 Non-U.S.  169  60 \n 103  91  56 Total  194  79 \n\nOther\n 64  192  1 Other  256  1 \n\n 6,166  6,380  4,609 Worldwide  12,546  9,513 \n\nCASH CAPITAL EXPENDITURES\n\n2Q23 1Q23 2Q22 Dollars in millions (unless otherwise noted) YTD \n2023\n\nYTD \n2022\n\n 5,359  5,412  3,837 Additions to property, plant and equipment  10,771  7,748 \n 284  367  166 Net investments and advances  651  493 \n 5,643  5,779  4,003 Total Cash Capital Expenditures  11,422  8,241 \n\n19\n\n\n\n.\nATTACHMENT VI\n\nKEY FIGURES: YEAR-TO-DATE EARNINGS/(LOSS)\n\n \n\nResults Summary\n\n2Q23 1Q23\nChange \n\nvs \n1Q23\n\n2Q22\nChange\n\nvs \n2Q22\n\nDollars in millions (except per share data) YTD \n2023\n\nYTD \n2022\n\nChange\nvs YTD \n2022\n\n 7,880  11,430  -3,550  17,850  -9,970 Earnings (U.S. GAAP)  19,310  23,330  -4,020 \n\n 7,874  11,618  -3,744  17,551  -9,677 Earnings Excluding Identified Items (non-GAAP)  19,492  26,384  -6,892 \n\n 1.94  2.79  -0.85  4.21  -2.27 Earnings Per Common Share ¹  4.73  5.49  -0.76 \n\n 1.94  2.83  -0.89  4.14  -2.20 Earnings Excl. Identified Items Per Common Share ¹  4.77  6.21  -1.44 \n\n 6,166  6,380  -214  4,609  +1,557 Capital and Exploration Expenditures  12,546  9,513  +3,033 \n\n¹ Assuming dilution.\n\nYear-to-date Factor Analysis\n\nEarnings/(Loss) ($M)\n\n23,330\n+3,054 26,384\n\n-6,107 -682\n\n+380\n\n-760\n\n+277 19,492\n\n-182\n\n19,310\n\nYTD 2022 \nEarnings \n\n(U.S. \nGAAP)\n\nIdentified \nItems\n\nYTD 2022 \nEarnings \n\nExcl. \nIdentified \n\nItems \n(non-\n\nGAAP)\n\nPrice / \nMargin\n\nUnsettled\nDerivatives\n\n(MTM)\n\nVolume / \nMix\n\nExpenses Other YTD 2023 \nEarnings \n\nExcl. \nIdentified \n\nItems \n(non-\n\nGAAP)\n\nIdentified \nItems\n\nYTD 2023 \nEarnings \n\n(U.S. \nGAAP)\n\nCash Flow ($G)\n\n29.7\n\n+25.7\n\n-11.4\n\n+2.1\n\n-0.2\n\n-16.1 -0.2\n\n29.6\n\nYear-End\n 2022\nCash\n\n(U. S. 
GAAP)\n\nCFO Cash Capex Asset Sales Debt Shareholder \nDistributions\n\nOther 2Q23 Cash\n(U.S. GAAP)\n\n20\n\n\n\n.\nATTACHMENT VII\n\nKEY FIGURES: EARNINGS/(LOSS) BY QUARTER\n \n\nDollars in millions (unless otherwise noted) 2023 2022 2021 2020 2019\n\nFirst Quarter  11,430  5,480  2,730  (610)  2,350 \nSecond Quarter  7,880  17,850  4,690  (1,080)  3,130 \nThird Quarter  —  19,660  6,750  (680)  3,170 \nFourth Quarter  —  12,750  8,870  (20,070)  5,690 \nFull Year  —  55,740  23,040  (22,440)  14,340 \n\nDollars per common share ¹ 2023 2022 2021 2020 2019\n\nFirst Quarter  2.79  1.28  0.64  (0.14)  0.55 \nSecond Quarter  1.94  4.21  1.10  (0.26)  0.73 \nThird Quarter  —  4.68  1.57  (0.15)  0.75 \nFourth Quarter  —  3.09  2.08  (4.70)  1.33 \nFull Year  —  13.26  5.39  (5.25)  3.36 \n1 Computed using the average number of shares outstanding during each period; assuming dilution.\n\n21\n\n\n\t99.1 Header\n\t2Q-4Q Exhibit 99.1\n\tQ to Q Factor Analysis\n\tFinancial Highlights\n\t2Q-4Q Results and Volume Summary\n\t2Q-4Q Cash Flow from Operations and Asset Sales excluding Working Capital\n\t1Q-3Q  Cal of Structural Cost Savings\n\tFinancial Updates\n\t2Q-4Q Attachment I-a -  Income Statement\n\t1Q-3Q Attachment I-b - Balance Sheet\n\tAttachment I-c - Cash Flow Statement\n\t2Q-4Q Attachment II-a\n\t2Q-4Q Attachment II-b\n\t2Q-4Q Attachment III\n\t2Q-4Q Attachment IV\n\t2Q-4Q Attachment V\n\t2Q-4Q Attachment VI\n\tY to Y Factor Analysis\n\tAttachment VII  \n\n"
\end{verbatim}

Look at that! We're beginning to give some structure to our text data.
But suppose I wanted to analyze multiple earnings calls; I would need to
organize this data so that it can accommodate new entries. As always, we
want to \textbf{tabularize} our data. Let's create a dataframe with
three columns (Date, Title, and Text) in which each row is one earnings
call:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# create a dataframe using the above data }
\NormalTok{call}\OperatorTok{=}\NormalTok{pd.DataFrame(\{}\StringTok{\textquotesingle{}Date\textquotesingle{}}\NormalTok{:[date],}\StringTok{\textquotesingle{}Title\textquotesingle{}}\NormalTok{:[title],}\StringTok{\textquotesingle{}Text\textquotesingle{}}\NormalTok{:[raw\_text]\})}

\CommentTok{\# remember, datetime information almost always reaches us as text. }
\CommentTok{\# we need to explicitly convert it to the datetime data type. }
\NormalTok{call[}\StringTok{\textquotesingle{}Date\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{pd.to\_datetime(call[}\StringTok{\textquotesingle{}Date\textquotesingle{}}\NormalTok{])}

\CommentTok{\# Let\textquotesingle{}s see what we\textquotesingle{}ve got.}
\NormalTok{call}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llll@{}}
\toprule\noalign{}
& Date & Title & Text \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 2023-07-27 20:47:01+00:00 & f8k2Q23991 &
\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n... \\
\end{longtable}

Now, if we were so inclined, we could use a loop to repeat this process
for a large number of earnings calls, yielding a neatly organized
dataframe containing the date, title, and text of earnings calls over
time. I've done this so you don't have to, and stored it as a file
called ``Exxon.json''. It spans 2002--2019, and contains over 10 million
words' worth of earnings calls. Let's take a peek:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df}\OperatorTok{=}\NormalTok{pd.read\_json(}\StringTok{\textquotesingle{}https://storage.googleapis.com/qm2/wk4/Exxon.json\textquotesingle{}}\NormalTok{)}
\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llll@{}}
\toprule\noalign{}
& Title & Date & Text \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & Exxon Mobil Corp at Barclays CEO EnergyPower ... & 2019-09-04 & Mr.
Woods joined ExxonMobil International in 1... \\
1 & Q2 2019 Exxon Mobil Corp Earnings Call - Final & 2019-08-02 & NEIL
A. HANSEN, VP OF IR \& SECRETARY, EXXON MO... \\
2 & Event Brief of Q2 2019 Exxon Mobil Corp Earn... & 2019-08-02 & .
Neil A. Hansen - Exxon Mobil Corporation,VP ... \\
3 & Exxon Mobil Corp at JPMorgan Energy Conferenc... & 2019-06-18 & So
with that, I\textquotesingle ll turn it over to you. Thank ... \\
4 & Exxon Mobil Corp Annual Shareholders Meeting ... & 2019-05-29 &
DARREN W. WOODS, CHAIRMAN \& CEO, EXXON MOBIL C... \\
... & ... & ... & ... \\
177 & Event Brief of Q3 2002 Exxon Mobil Corporati... & 2002-10-31 &
OVERVIEW \textbackslash n\textbackslash n XOM reported normalized
earnings... \\
178 & Q3 2002 Exxon Mobil Corporation Earnings Con... & 2002-10-31 & In
particular, I refer you to factors affectin... \\
179 & Q2 2002 Exxon Mobil Corporation Earnings Con... & 2002-08-01 &
Welcome to Exxon Mobil\textquotesingle s teleconference and we... \\
180 & Abstract of Q2 2002 Exxon Mobil Corporation ... & 2002-08-01 &
OVERVIEW \textbackslash n\textbackslash n XOM: 2Q02 net income was
\$2.64b.... \\
181 & Exxon Mobil Corporation First Quarter 2002 Re... & 2002-04-23 & We
also signed a memorandum of understanding t... \\
\end{longtable}

Great---we've got a structured dataset of earnings calls. But even
though the data has \emph{structure}, the data in the ``Text'' column
still needs some cleaning and processing.

\hypertarget{dirty-words}{%
\section{Dirty Words}\label{dirty-words}}

Text often comes `unclean', either containing tags such as HTML (or XML)
or having other issues. We've already done a bit of tidying, but it's been
relatively straightforward. Be cautious when committing to a text
analysis project---you may spend a great deal of time tidying up your
text.

For example, you may have noticed
``\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\textbackslash n\ldots{}''
in the text of the first earnings call we downloaded. Each
``\textbackslash n'' is a character (just like ``a'' or ``\$'') except it
indicates that we want to create a new line. It's part of the formatting
of the PDF. That's not really useful information to us. Let's start by
selecting an earnings call; I've chosen the one at position 38 (counting
from zero) in this dataframe:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{call}\OperatorTok{=}\NormalTok{df.iloc[}\DecValTok{38}\NormalTok{]}

\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}Date: \textquotesingle{}}\NormalTok{, call[}\StringTok{\textquotesingle{}Date\textquotesingle{}}\NormalTok{])}
\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}Title: \textquotesingle{}}\NormalTok{, call[}\StringTok{\textquotesingle{}Title\textquotesingle{}}\NormalTok{])}
\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}Word Count: \textquotesingle{}}\NormalTok{, }\BuiltInTok{len}\NormalTok{(call[}\StringTok{\textquotesingle{}Text\textquotesingle{}}\NormalTok{]))}
\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}Text:\textquotesingle{}}\NormalTok{)}
\NormalTok{call[}\StringTok{\textquotesingle{}Text\textquotesingle{}}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Date:  2016-05-25 00:00:00
Title:  Exxon  Mobil Corp Annual Shareholders Meeting - Final
Word Count:  125746
Text:
\end{verbatim}

\begin{verbatim}
'I\'m Rex Tillerson, I\'m the Chairman and Chief Executive Officer of the Exxon Mobil Corporation. And I am pleased to welcome each of you that made the effort to join us today in person. I also, though, want to welcome our shareholders around the world who are joining us by way of the Internet.\n\nI do hope you had the opportunity to meet some of our employees in person while visiting the displays in the foyer this morning. These Exxon Mobil employees are among the over 73,000 people who are working 24 hours a day, seven days a week, 365 days a year on your behalf. And many of them are working in challenging locations to deliver the energy and products needed by consumers around the world.\n\nThe financial and operating results that I bring to you today are really their results, and I have the privilege of presenting them to you on their behalf.\n\nSeated next to me is Jeff Woodbury, Vice President of Investor Relations and our Corporate Secretary. He will assist me in running the meeting today. I\'ll introduce the other members of the board to you a little later in the meeting.\n\nAs mentioned on Page 2 of the proxy statement, it is the policy of the corporation to provide confidential voting to shareholders. For shareholders who returned their proxy cards without written comments, the voted proxies have not been seen by nor reported to the corporation except in aggregate numbers. Anyone turning in a proxy card at this meeting who wishes to keep his or her votes secret, may obtain an envelope from the ushers. Proxy cards will be collected later in the meeting. A list of shareholders entitled to vote at this meeting or at any adjournment thereof is available for inspection. If anyone wishes to examine this list, an usher will be pleased to direct you to the proper location.\n\nShirley Nessralla and Paula Buckley of Computershare Trust Company have been appointed Inspectors of the Election for this meeting. 
They have taken an oath of office that has been delivered to the Secretary for filing with the minutes of the meeting. Notice of this meeting has been properly given, and the Inspectors of Election have determined that a quorum is present. There are 3.5 billion shares represented at this meeting, equating to approximately 85% of the issued and outstanding shares of stock of the corporation that are entitled to vote. I directed the inspector\'s written determination as to the number of shares entitled to vote at the meeting be filed with the minutes. I declare a quorum present and the meeting ready for business.\n\nI\'d now like to explain our plan for conducting the meeting today. First, Secretary Woodbury will outline the rules of conduct and how to gain recognition. Then, I\'ll make some brief comments about our business results and the future we see for your company. After that, the items of business comprised of 14 proposals from the Board of Directors and shareholders will be presented. As described in the Annual Meeting program, discussion on the items of business will be deferred until all items have been presented. Time permitting, we may also have time to respond to some of the questions submitted ahead of time via proxy cards and the Internet. Upon completion of the discussion on the items of business and voting, the polls will be closed, the formal business of this year\'s Annual Meeting will be concluded, and the Inspectors of Election will prepare this preliminary voting report.\n\nWhile this is occurring, there may be time for additional comments or questions regarding our business. When the inspectors are ready, I\'ll ask them to give us their voting report. We will then conclude the meeting.\n\nShown in this slide is the list of board and shareholder proposals that will be presented and voted on this morning. 
The 14 items of business will be presented beginning with the election of directors, the ratification of independent auditors and the advisory vote to approve executive compensation as required by law. Then, we will continue with the 11 shareholder proposals shown in the proxy statement.\n\nAt this time, I would like to turn the meeting over to Jeff Woodbury to discuss the rules of conduct.\n\nJEFF WOODBURY, VP OF INVESTOR RELATIONS, CORPORATE SECRETARY, EXXONMOBIL CORPORATION: Thank you, Mr. Chairman. Ladies and gentlemen, good morning. I\'d like to take this opportunity to familiarize everyone with the safety features of the auditorium. In case of emergency, we will be notified through the public address system. Emergency exits for the ground level, as shown on the screen behind me, are situated at the rear of the auditorium, where you entered, and down in front on either side. If we do need to evacuate, please proceed to the nearest exit and the Meyerson personnel will guide you to the best way out. In addition, for safety reasons, please do not stand in aisles or in the back of the hall and do not block the exits.\n\nTo ensure that the meeting is conducted in the interest of all shareholders, there are certain rules of conduct governing this meeting. These rules are included in the program and shown on the screen behind me.\n\nLet me now just cover several of the rules. The distribution of pamphlets and other literature, banners, signs and other displays is strictly prohibited. Anyone who intentionally obstructs or interferes with this lawful meeting by physical action or verbal utterance is in violation of Texas law. Any persons engaging in such conduct will be asked to cease, and if they refuse, they will be escorted from the meeting.\n\nThe laws of New Jersey, where Exxon Mobil is incorporated, provide that no business can be brought up for a vote unless proper notice has been given to all shareholders. 
Therefore, in the fairness of other shareholders not in attendance and in keeping with the laws that govern our Annual Meeting, formal business at today\'s meeting is restricted to the items included in this year\'s proxy statement. As such, additional proposals may not be introduced on the floor.\n\nIn order to present a proposal, you must have checked in at the admission\'s desk in the lobby and verified that you are the proponent or a duly authorized proxy under New Jersey law. Presenters whose credentials have been verified will be given a blue presenter\'s pass. If neither the proponent nor an authorized proxy has checked in and obtained a presenter\'s pass, we will presume the proponent is not present, and I will move the proposal for the purposes of orderly conduct of the meeting and so that shareholder votes cast may be recorded. However, I will not be acting as a representative of the proponent. The authorized presenter of a shareholder proposal will have up to 3 minutes to present the proposal, and time may not be shared with another speaker. As the Chairman indicated, discussion on all items of business will be deferred until the discussion period later in the meeting.\n\nOnly shareholders as of record date or their properly appointed proxies are entitled to speak at this Annual Meeting of Exxon Mobil shareholders. Shareholders making comments during the meeting must speak or have the words translated into English so that the majority of shareholders present can understand what is being said. Comments that are offensive or otherwise inappropriate will not be permitted. Also, we ask that any issues of personal interest that\'s not relevant to all shareholders be raised directly with appropriate company representatives outside of the Annual Meeting. We request that individual shareholders respect the right of others to speak and keep their comments as brief as possible. 
As noted in the proxy statement, the Chairman has broad authority to conduct the meeting in an orderly and timely manner.\n\nIf you wish to make comments, you must first fill out a speaker identification card that is included in the program and was provided to you as you entered the lobby. This card confirms that you meet the requirements to speak at this meeting. Please give the completed card to the usher when you are recognized to address the meeting.\n\nTo ensure as many shareholders as possible who want to address the meeting today have the opportunity to do so, we ask you to follow these additional instructions. If you would like to address the meeting, move to a reserved aisle seat, remain seated, raise your hand holding your speaker identification card to indicate to the Chairman that you wish to speak. When recognized by the Chairman, give your completed speaker card to the usher, and a microphone will be provided. Please stand and begin by stating your name. You may speak for up to 2 minutes.\n\nNow due to the large number of items on today\'s agenda and a need to conclude the meeting within a reasonable period of time, we cannot assure that every shareholder who wishes to speak will be able to do so. First priority will be given to those who have not yet had an opportunity to speak.\n\nAs we\'ve done in the past, we\'ve provided a timing system with lights that will help speakers manage their time, and we\'ll demonstrate the system at this time.\n\nWhen the Chairman recognizes a speaker and a microphone has been provided by the usher, a green light will come on at the displays on both sides of the stage. The hall microphones will be activated only after the speaker has been recognized by the Chairman. When the speaker\'s time remaining reaches 30 seconds, a yellow light will turn on, and finally, a red light will indicate the speaker is at the end of their allowed time. 
And at that point, we ask you to conclude your comments.\n\nFinally, as we typically note at the outset of similar meetings, I would like to draw your attention to our cautionary statement that\'s shown on this slide. The statement contains information regarding today\'s presentation and discussion. You may also refer to the corporate website for additional information on factors affecting future results as well as supplementary information defining key terms that we use today throughout the meeting.\n\nAnd now I will turn the podium back to the Chairman to provide an overview on the business. Thank you.\n\nREX TILLERSON: Thank you, Jeff. We will address our items of business shortly. However, first, I will share with you some financial and operating highlights from 2015, provide an update on our annual outlook for energy and review key elements of our integrated businesses. The photo you see behind me is the Antwerp refinery, one of the most efficient in Europe, where we are progressing construction of a new delayed coker. Exxon Mobil\'s strategy, constancy of purpose and competitive advantages continue to create superior, long-term shareholder value. Regardless of the business environment, we maintain a relentless focus on the fundamentals, those factors that we control. Our integrated business model remains resilient through the commodity price cycle. We are continuing a disciplined and paced investment approach focused on creating value while maintaining our commitment to a reliable and growing dividend.\n\nExxon Mobil is demonstrating differentiated performance that has become all the more visible this past year. We lead the industry in return on capital employed and most importantly, long-term shareholder returns.\n\nOver the next few slides, I\'ll highlight how we have delivered on our 2015 commitments and discuss our industry-leading performance against several key metrics. 
This photograph of the Deepwater Champion drilling vessel, which successfully drilled the Liza-1 exploration discovery well offshore Guyana, highlights one example of our approach to long-term resource capture and value generation. We\'re pleased with the initial results, and I\'ll discuss our plans for Guyana later in the presentation.\n\nSince our shareholder meeting last year, the business environment continued to deteriorate with a sharp decrease in crude oil and natural gas prices. The graphic is a stark reminder of the volatile and cyclical nature of a commodity prices business. However, the corporation is uniquely suited to endure these conditions and outperform the competition.\n\nRegardless of prices, we remain focused on the following business fundamentals. Operational integrity is an organizational imperative. Operating safely, reliably and effectively is the primary objective. Maximizing the reliability of our facilities, lowering cost and increasing efficiency maximizes the value of our existing asset base.\n\nOur integrated businesses have a distinct competitive advantage which enables us to create additional value as market conditions change. Our investment decisions are based on a long-term view informed by our energy outlook and tested across various economic parameters, including a broad range of commodity prices.\n\nFinally, once we decide to invest, our world-class project execution delivers a competitive, reliable asset base. These fundamentals, applied daily throughout our portfolio, deliver leading results that grow shareholder value throughout the business cycle.\n\n2015 results demonstrate the strength of our integrated business in a weaker commodity price environment. Our approach to business, operations and corporate citizenship is grounded in a commitment to integrity in everything we do. Nowhere is that more evident than in our commitment to safe operations. 
As the chart shows, our workforce safety performance for both employees and our contractors remains strong relative to the industry.\n\nAs to financial results, total corporate earnings were $16.2 billion with a return on capital employed of 7.9%. We generated $32.7 billion of cash flow from operations and asset sales, and selectively invested $31 billion back into the business. Total shareholder distributions including dividends and share repurchases were $15 billion.\n\nRisk management is at the core of our business. Everything we do has an element of risk, whether technical, operational, financial, geopolitical or environmental. We have a systematic approach guided by our Operations Integrity Management System, also known as OIMS, which helps us manage safety, security, health and environmental risk.\n\nOIMS guides the activities of our employees and contractors to achieve excellence in our operational performance. It is a proven approach, rigorously applied to all work processes at all levels. Everyone is expected to operate in the same safe way every day, everywhere. OIMS provides appropriate emphasis on both personnel and process safety, helping us achieve superior reliability and operating performance, which ultimately leads to better business performance.\n\nNow let\'s take a look at our environmental performance. At Exxon Mobil, we recognize our dual responsibility to both expand energy supplies, but do so in a way that reflects our shared commitment to environmental stewardship. To minimize our environmental impact, we carefully identify, assess, manage and monitor potential environmental risk.\n\nOur comprehensive approach has continued to reduce emissions and releases, mitigating the environmental impact of our operations.\n\nOver the past five years, 8.8 million tons of additional greenhouse gas emissions were avoided through self-help initiatives. 
This is equivalent to eliminating the annual greenhouse gas emissions of nearly 2 million passenger vehicles. We\'re currently working at the operational level to minimize flaring and venting, and continue to implement proven reduction technologies such as selective deployment of cogeneration capacity in more than 100 installations at more than 30 locations around the world. Currently, our gross capacity for cogeneration is 5.5 gigawatts, enough to meet the annual electricity needs of 2.5 million U.S. homes. These investments have made our operations some of the most productive and energy efficient in the world.\n\nExxon Mobil is also a global leader in carbon capture and sequestration with working interest in more than one-third of the world\'s current capacity. In total, since the year 2000, we have invested nearly $7 billion in an array of technologies across the company to reduce our emissions.\n\nAs this chart illustrates, our approaches are broad based and targeted at delivering practical solutions in the near term and the mid-term while continuing to make long-term investments to advance emerging energy technologies on a host of frontiers.\n\nMost recently, we announced an expanded research program to pursue the novel application of carbonate fuel cells to capture emissions from power plants. We are in early days, but making carbon capture economic could lead to large-scale applications around the world, reducing emissions and mitigating the risk of climate change.\n\nExxon Mobil also continues to research potential game-changing technologies in the areas of advanced biofuels, including from algae, whole cellulosic biomass and cellulose-derived sugars, which avoid adversely affecting water use, land use or world food supplies. But to ensure investment and innovation can progress, society will need one more thing, sound public policy. 
Exxon Mobil has worked for decades to support and advance scientific understanding, to contribute to substantive and thoughtful policy discourse and to reach out to stakeholders and government officials at every level wherever we operate. Nowhere is this comprehensive cooperative approach more important than addressing the risk of climate change.\n\nFor many years now, Exxon Mobil has held the view that the risks of climate change are serious, and they do warrant thoughtful action. We have long worked collaboratively with academic and institutional efforts to support research, improve climate models and advance the scientific understanding of climate change. Our scientists have been participating with the UN\'s Intergovernmental Panel on Climate Change from its founding and have produced hundreds of publicly available papers including more than 50 peer-reviewed publications.\n\nOn the policy front, we believe that addressing the risk of climate change is a global issue, which will require the involvement of governments, companies and all energy consumers. To enable the most efficient and effective investments in lower carbon technologies, cost of carbon policies must invite and promote global participation. They must ensure a uniform and predictable cost of greenhouse gas emissions in every economy. They must allow market prices to drive the selection of solutions. They must minimize complexity and administrative cost, and maximize transparency and provide flexibility for policy adjustments in light of economic performance or new discoveries and breakthroughs in climate science. In years ahead, the world\'s ability to meet our shared need for energy and our shared aspirations for the environment will depend on sound policy, and that is why Exxon Mobil will continue to discuss the key principles supporting a revenue-neutral carbon tax as a possible broad-based approach.\n\nTurning to financial results. 
Exxon Mobil\'s return on capital employed consistently leads the competition. In 2015, our ROCE of 7.9% was significantly impacted by the drop in oil and gas prices, but was still nearly 4 percentage points higher than our nearest competitor. Over the past five years, our ROCE averaged 18% or about 5 percentage points higher than the next best competitor. Sustained leadership in capital efficiency reflects our commitment to a disciplined investment approach, strong project management and innovative technologies to grow a well-balanced portfolio across a broad range of business conditions.\n\nRegardless of the business environment, we strive to lead the cost curve and currently are benefiting from both ongoing efficiencies as well as cost deflation. As you can see by the graphic, we are achieving significant market savings in the current business climate. We achieved a total net reduction of $11.5 billion in both capital and cash operating cost in 2015. Our Upstream unit costs are down 9% and in refining, our unit cash costs are 15% lower than the industry average. This focus on cost leadership also reduces project cost, which improve long-term returns and free cash flow. Robust operating performance and relentless attention to cost control enable our ability to generate industry-leading free cash flow over the long term.\n\nExxon Mobil generated $6.5 billion of free cash flow in 2015 and almost $100 billion over the past five years, outpacing competitors. We continue to pay a reliable and growing dividend and pursue investments in attractive opportunities to create long-term shareholder value. Although the share buyback program was tapered in 2015, we maintain industry-leading distributions. On average, $0.48 of every dollar generated by the business over the last five years was distributed to you, our shareholders, while we continue to make significant investments in the business.\n\nWe continue to grow the dividend. 
The corporation\'s long-term dividend growth rate exceeds the S&P 500 and our industry competitors with only Exxon Mobil materially increasing the dividend in 2015, up 6.7% to $2.88 per share. And over the past 10 years from 2006 through 2015, annual dividends have increased 10% per year. We have increased our dividend 34 consecutive years including this year\'s second quarter increase to $0.75 per share. Our dividend payments are made with a view to building long-term shareholder value and providing a reliable dividend growth.\n\nShare purchases are an efficient and flexible way of returning cash to our shareholders. The corporation purchased $3 billion in 2015 and prudently tapered the program consistent with changes in the business environment and the corporation\'s cash requirements. Through the first half of 2016, Exxon Mobil has limited share purchases to amounts needed to offset dilution related to our benefits plans and programs. While the future pace of share purchases will reflect the market environment and cash flow at the time, we continue to regard them as a flexible method to return value to the shareholders and manage our long-term capital structure.\n\nSince the Exxon and Mobil merger, we have reduced total shares outstanding by 40%, including the impacts of shares we issued to purchase XTO in 2010. Over this period, total shareholder distributions including dividends were $357 billion. To put this number in perspective, these distributions to you are larger than the individual market capitalization values of 497 of the S&P 500 companies.\n\nNow every year, Exxon Mobil shares its long-term view of global energy demand and supply, which guides our company\'s business strategies and our investments, and we publish that as our outlook for energy. 
This document confirms the wisdom of these investments and help provide the world with reliable and affordable energy necessary to advance economic prosperity and improve living standards well into the future.\n\nBy the year 2040, the world\'s population is expected to increase from 7 billion people today to more than 9 billion people. This population growth will more than double global economic output sparking a dramatic expansion of the middle class. These fundamentals will drive energy demand growth. As a result, Exxon Mobil\'s 2016 outlook for energy forecasts an increase in global energy demand of about 25% during this period, even while taking into account the offsetting impact of significant energy efficiency gains.\n\nNon-OECD nations, shown in blue on the chart, are expected to drive growth in GDP and therefore, energy use, accounting for about 70% of global energy demand by the year 2040. The world\'s middle class is expected to increase by about 3 billion people, mostly from non-OECD nations. This means better living standards for billions, and it will be enabled in large part by improving access to modern technology and affordable, reliable energy. They\'re going to be able to buy microwaves. They\'re going to be able to buy refrigeration for their food and their medicines and one day, they\'re going to be able to buy a car. Even so, energy use per capita in developing countries will still remain below the OECD.\n\nThe demand outlook for OECD countries shown in red highlights the large-scale impact of energy efficiency. GDP in these countries is expected to grow about 70% between now and 2040. However, energy demand is likely to remain essentially flat as expanded use of energy-efficient technologies and practices lead to significant energy savings. 
Without these efficiency gains, we estimate global demand growth would be about four times our projected 25%.\n\nFinally, I\'ll note, our energy outlook assumptions are consistent with the aggregation of Paris climate agreement commitments.\n\nSo now we\'ll move to discussion of energy supplies needed to meet this growing demand. As the world\'s population grows and living standards improve, we expect to see an evolving energy landscape, driven by advances in technology, available resources and trade. Together, oil and natural gas will meet about 60% of global demand in the year 2040, about the same percentage as they do today, and will lead growth even though the energy mix evolves.\n\nWe expect oil will remain the leading energy source as it remains essential to meet growing needs for transportation and as a feedstock for petrochemicals. Natural gas is abundant and well suited to meet rising power generation and industrial needs, while also providing a cost-effective option to reduce carbon dioxide emissions. Natural gas demand will increase more than any other energy type, surpassing coal to become the second-largest source, in part driven by a tripling of global liquefied natural gas demand across this period.\n\nThe outlook anticipates that stringent government policies will increase the cost of CO2 emissions over time. These policies will incentivize the growth of energy types with low-carbon intensity, including natural gas, nuclear, solar and wind.\n\nOur view is grounded in the reality that abundant energy supplies are vital to modern life. Greater access to affordable and reliable energy will remain fundamental to reducing poverty and advancing standards of living for billions of people around the world.\n\nTo sustain progress and further expand prosperity, the world must increase the availability of affordable energy solutions. 
Therefore, meeting the growth in global energy demand will require diverse energy supplies from conventional and unconventional oil and gas, as well as liquefied natural gas and nuclear and renewables. We should pursue all economic opportunities.\n\nFortunately, technology advances continue to expand our energy options while helping to minimize our environmental footprint. Continued access to high-quality resources and substantial investments will remain fundamental in meeting these challenges. And free markets, supported by sound and reliable public policies, remain vital to the development of new energy supplies. This includes policies that promote free and open trade, and encourage private sector investments.\n\nSo now I\'ll provide a business and operational update. Shown here is the Upper Zakum development, offshore Abu Dhabi, which utilizes our innovative approach of creating four artificial islands to maximize economic recovery from this very large field. Exxon Mobil, along with our partners, developed the island drilling concept, which has significantly reduced total development cost. Exxon Mobil provides industry leadership to meet the world\'s energy needs. Key elements of our strategy are shown on this chart. Our approach has been consistent for decades, and through systematic implementation and continuous improvement, these elements have become resilient, competitive advantages.\n\nRisk management and operational excellence are at the very core of our business activities. Investment and cost discipline, along with world-class project execution result in the best asset mix in the industry. We continuously seek to high grade our portfolio beyond our exploration program through accretive acquisitions, restructurings and asset sales.\n\nAs I\'ll discuss in the coming slides, we capture significant value-added benefits from our vertical integration. Our long-standing commitment to technology leadership stimulates innovation and provides unique advantages. 
And our high-performing workforce drives premier results across our businesses. Our employees are the best of the best, and their dedication produces the results that I share with you today. In short, we are delivering on our commitments and achieving differentiated performance to meet our objective of growing shareholder value.\n\nThe level of Exxon Mobil\'s business integration is a sustainable, competitive advantage that is difficult to replicate, and we continue to increase that level of integration. Our businesses work together across the value chain to share knowledge, insights and best practices. This collaboration leads to better informed decisions, more efficient operations and higher quality investments, delivering unique value and resiliency. Our diverse asset base provides market optionality and operational flexibility to optimize value every step of the way from the wellhead all the way to the consumer. This structural advantage drives our differentiated financial strength and our superior results.\n\nAs I mentioned earlier, Exxon Mobil maintains a disciplined and paced investment approach, focused on creating long-term value. In 2016, we expect to spend around $23 billion or $8 billion less than in 2015. This plan reflects lower upstream project spending as we continue to bring major projects to completion and into production. Downstream and chemical plants reflect unique opportunities to grow and strengthen those businesses. We anticipate the 2017 budget will be less than $23 billion. We have a flexible, high-quality opportunity set and have the capability to increase the investment program based on market demand fundamentals or adjust future spending lower should conditions warrant. By selectively investing through the cycle, we remain positioned to capture market efficiencies and savings to deliver better financial returns.\n\nWe remain steadfast in our objective to create superior long-term value. 
Our planned deliverables are summarized on this chart. We manage the business to achieve industry-leading returns throughout the commodity price cycle. We work to maximize value chain benefits across the integrated portfolio, adjusting to changing product demand and improving the portfolio mix.\n\nCapital discipline remains paramount to our continued success. We\'ve paced the investment program selectively, investing in attractive opportunities while maintaining our flexibility to adjust.\n\nUpstream volumes are an outcome of the investment program, and we anticipate a range of 4.0 million to 4.2 million oil equivalent barrels per day of production through the year 2020.\n\nCash flow is expected to grow from new investments, continued operating excellence, reduced spending and self-help initiatives, and we continue to share the corporation\'s success with our shareholders. Distributions are made with a view to building long-term value by providing a reliable and growing dividend.\n\nNext, I\'ll review our Upstream business. Exxon Mobil has a high-quality, diverse portfolio of assets, with oil and natural gas producing activities in 24 countries. Currently, we\'re producing 2.3 million barrels per day of liquids and 10.5 billion cubic feet per day of natural gas. This base of operations provides long-term cash flow and the capability to progress attractive new opportunities through the cycle to meet long-term energy demand.\n\nWe have an extensive portfolio of approximately 100 capital projects to develop over 20 billion oil equivalent barrels, providing us multiple options that enable selective and paced investment. 
These projects are at varying stages of advancement, including those that are already under construction like the Odoptu Stage 2 project in Sakhalin, Russia; projects in the engineering design stage, such as the Tengiz Expansion project in Kazakhstan; and opportunities where we\'re evaluating concepts to expand development around existing infrastructure, such as deepwater tiebacks offshore West Africa.\n\nWe have an unmatched opportunity base across multiple resource types, including both short and longer cycle opportunities that provide investment flexibility. Let\'s talk about some of our recent startups. Projects that started up in 2015 added 300,000 oil equivalent barrels per day of working interest capacity. In West Africa -- in the deepwater, we added 70,000 oil equivalent barrels per day of working interest capacity with two capital-efficient projects that leveraged on existing infrastructure, the Erha North Phase 2 project in Nigeria and Kizomba Satellites Phase 2, offshore Angola. Both of these projects started up ahead of schedule and below budget. The Kearl expansion project started up five months ahead of schedule. The project incorporated learnings from the initial development phase and doubled Kearl\'s production capacity to 230,000 barrels per day. In Indonesia, the Banyu Urip project added 75,000 barrels per day of working interest capacity. Production has now successfully ramped up to full capacity at 185,000 barrels per day.\n\nMoving on to 2016, 2017 startups. 10 projects are scheduled to be completed this year and next, adding more than 450,000 oil equivalent barrels per day of working interest capacity. These projects highlight the geographic resource diversity of our portfolio. As you can see, the locations range from North America to Russia, from Australia to the Middle East. 
Also consistent with energy demand growth projections, we are investing across several resource types, including liquefied natural gas, deepwater developments, conventional and the Arctic. We\'re making great progress with these major construction projects with four of them starting up already this year.\n\nWe also have an attractive domestic onshore resource base. Our U.S. unconventional resource base is over 15 billion oil equivalent barrels. We continue to focus on liquids growth mainly through development in the Permian Basin and the Bakken plays with 2.1 million net acres and current net production of 230,000 barrels per day. In 2015, we increased our net Permian and Bakken production nearly 25%. The Permian unconventional production component nearly doubled as we increased horizontal drilling in the Wolfcamp formation in the Midland Basin.\n\nAdditionally, we continue to improve our acreage position through trades and farm-ins. We operate over 80% of our U.S. unconventional assets. Our ownership and operating position enable flexible development consistent with the business environment. As such, we\'ve reduced our rig count over 70% from the peak 2015 levels while remaining positioned to adjust those levels up or down further depending on the market conditions.\n\nTurning to our exploration portfolio. As shown on the map, we\'re pursuing a diverse set of high potential resource opportunities, which could add to our resource base. We hold more than 110 million net acres from under-explored regions with higher risk, but higher reward potential, to more established lower-risk basins close to existing infrastructure. We recently captured 10 new opportunities covering over 2 million net acres, gaining exposure to multiple plays in established areas as well as new emerging basins. The yellow stars mark the eight discoveries we made in 2015. 
We added 1.4 billion oil equivalent net barrels to our resource base, including the Liza exploration well, offshore Guyana, which was the largest oil discovery by industry in 2015. Additional resource was also added in Iraq, Nigeria, Romania, Australia and onshore North America.\n\nWe\'ve accumulated an attractive position offshore Guyana, a new area with very large resource potential. We recently acquired an operating interest in the Canje Block adjacent to our existing Stabroek Block. This brings our total position to 8.1 million gross acres. To put this in perspective, this is the equivalent of 1,400 Gulf of Mexico blocks.\n\nFollowing the Liza-1 discovery, we completed the largest proprietary 3-dimensional seismic survey in our company\'s history and have initiated a multi-well exploration and appraisal drilling campaign, with the spud of the Liza-2 well in February of this year. The data from this appraisal well and the 3D seismic will be used to further our understanding of the block\'s potential and possible development concepts.\n\nLet\'s now take a look at our Downstream and Chemical business. As the points on the map indicate, we have refining and petrochemicals manufacturing assets in all major regions of the world, supporting the marketing of our products in more than 130 countries. Our scale, integration and balanced portfolio of assets in fuels, lubricants and chemicals provide opportunities to capture the highest value for each molecule, while also capturing operating efficiencies.\n\nTo further grow the value of these businesses, we are progressing a diverse portfolio of attractive investments. 
Consistent with our strategy, these investments capitalize on several value chain opportunities, which include increasing feedstock and logistics flexibility.\n\nTo highlight a few projects underway, we are increasing our sour crude oil processing capability in Baton Rouge, Louisiana by expanding sulfur handling capacity by 40%, while also implementing steps to improve access to a broad range of North American crude oils.\n\nAt our refinery in Beaumont, Texas, we\'re expanding capacity to run attractive light crude oils by 20,000 barrels per day. In chemicals, we are constructing a multibillion-dollar ethane steam cracker and associated polyethylene facilities in Texas, to capitalize on low-cost North American feedstock. When completed, these facilities will be among the world\'s most competitive petrochemical projects through scale and integration with existing manufacturing facilities and production of premium metallocene polyethylene.\n\nAs mentioned earlier, at the Antwerp refinery in Belgium, we\'re constructing a 50,000-barrel per day delayed coker to upgrade low-value bunker fuel oil currently produced at our Northern European refineries into higher value low sulfur -- ultra-low sulfur diesel.\n\nAt the Rotterdam refinery in the Netherlands, we are expanding the hydrocracking unit to upgrade low-value hydrocarbons into premium lube base stocks and ultra-low sulfur diesel, positioning ExxonMobil as the first large-scale producer of Group 2 base stocks in Europe.\n\nAt our Fawley refinery in the United Kingdom and our refinery in Singapore, we are making investments to debottleneck existing units, allowing us to upgrade distillate streams into higher value chemical intermediate products.\n\nEach of these projects target our most competitive strategic assets. We are also making selective investments in our lubes and chemical value chains to increase production of specialty products. 
We\'re currently commissioning a specialty elastomers facility in Saudi Arabia to produce synthetic rubbers, polyolefin elastomers and carbon black to serve the auto industry.\n\nIn Singapore, we are expanding finished lubricant manufacturing capacity to support rapidly growing sales of our industry-leading synthetic lubricant products such as Mobil 1. Also in Singapore, construction is underway on a world scale facility to produce premium synthetic rubber for the growing tire market and premium resins for adhesive applications. These investments will strengthen ExxonMobil\'s leading global positions in these high-growth products.\n\nOur direct interface with consumers is largely through our brands such as Esso, Exxon, Mobil and Mobil 1. Our broad product offering is supported by quality and reliability along with technology development, which enables us to bring new high performance products to market and further grow our brands.\n\nOver the past decade, we have more than doubled sales of synthetic lubricants including Mobil 1, Mobil Delvac 1 and Mobil SHC. We\'ve increased branded retail fuel sales by expanding our network to more than 20,000 Exxon, Mobil and Esso branded stations along with providing innovative brand marketing and technology programs to deliver a superior customer offering.\n\nI\'ll conclude the business and operational update with a photo of construction under way at our world scale Baytown refining and petrochemical complex, one of the investments I described earlier. These major investment programs, whether they are new country entries or expansions of existing and legacy locations, can only succeed if we also invest in the communities where we live and work. The people of ExxonMobil understand and take seriously our responsibilities to shareholders, to communities, our neighbors and our customers to be responsible corporate citizens and to operate with the highest standards of ethical behavior. 
Safety and environmental protection are more than just a priority at ExxonMobil, they are core values, an integral part of our culture and fundamental to the success of our business.\n\nAs we develop oil and gas resources to meet the world\'s growing energy needs, we must do so in a manner that contributes to the economic and social development of the communities in which we operate. Our efforts are directed at employing and training local workforces, supporting local suppliers and service providers and improving the livelihoods of the community members.\n\nWe continue to make substantial progress in training and hiring host country workers, which advances local economic development and education. For example, in Chad, 94% of our personnel are citizens of that country, and 72% of supervisory roles are held by locals. We make strategic investments to support needs of local communities. In 2015, we contributed $268 million to community initiatives around the world. Our education initiatives include programs supporting both students and teachers that encourage pursuit of careers in science, technology, engineering and math.\n\nFor 15 years, we have been a leader in the fight to combat preventable and treatable illnesses like malaria. To date, the antimalarial programs we have funded have reached more than 125 million people. Our support has helped train more than 520,000 health workers as well as distribute 3.8 million doses of antimalarial treatments and almost 14 million bed nets.\n\nWe\'ve learned that empowering women economically is essential to local development. Through ExxonMobil\'s women\'s economic opportunity initiative, we invest in programs to develop the next generation of female entrepreneurs and business leaders. Through these programs, we provide the skills, resources and access to technology needed to increase their productivity and their income.\n\nAnd our people give not only of their personal treasure, but also their time through volunteering. 
Nearly 15,000 Exxon employees -- ExxonMobil employees, retirees and their families donated more than 629,000 hours to almost 5,000 charitable organizations in 34 countries around the world. It is not a slogan, but a fact. Where ExxonMobil people go, good things happen in those communities.\n\nIn highly capital-intensive industries such as ours, financial results and stock market returns are best viewed over a long time horizon. The energy industry requires sustainable risk management of the physical assets as well as the cash and the capital and long cycle times for investments to deliver results. ExxonMobil has generated greater shareholder returns than the average of our competitors over the last 5-, 10- and 20-year periods. These superior returns reflect our sustained financial and operating advantages that position us to maximize shareholder value.\n\nI want to leave you with a few personal thoughts about our company and its meaningful contributions to society. We are delivering on our business commitments in creating long-term shareholder value. Our talented, dedicated workforce drives premier results, providing reliable, affordable energy to advance human progress the world over.\n\nExxonMobil is more than an energy company. We\'re making a significant difference in our communities, creating economic opportunities and improving lives by investing in better education, health care and infrastructure. We are pioneering science and advancing technologies to improve energy efficiency, reduce emissions and expand the sources of energy. We\'ve taken thoughtful consistent action to protect the environment and to help understand and reduce the risk of climate change. I\'m certainly proud of our employees. 
Who we are, what we do every day to provide energy the world needs while supporting our communities and improving the lives of people the world over.\n\nNow before we continue with our planned agenda items, I want to share with you a brief video that illustrates some of these meaningful contributions.\n\n(video playing)\n\nREX TILLERSON: So now, turning to the formal business of the meeting and a few brief remarks on shareholder proposals and voting. Each year, the corporation receives a number of suggestions from shareholders. Some of these are in the form of proposals to be presented at the Annual Meeting and each is given careful consideration.\n\nWe seek dialogues with the sponsors prior to the meeting when there is more time to better understand each other\'s positions and we often find agreement. Let me be clear on the conduct of the meeting. Recognizing that the majority of our shareholders have voted by proxy and are not present, we have established procedures to facilitate an orderly meeting.\n\nWe\'ve set up a process for speakers to identify themselves and to express their views and I assure you, we welcome those views. In order that as many shareholders as possible can participate, we have set time limits and a system of reminders to help you manage your time.\n\nWe have 14 items to consider. As Secretary Woodberry said earlier, discussion on all items of business will be deferred to the discussion period. This may enable us to have some time for general comments and questions as well and conclude the meeting in a reasonable time frame.\n\nFor those of you who may wish to leave the meeting at any time, let me express my appreciation for your attendance. Since we have a number of items yet to discuss on the program and you\'ve been sitting for a while, I would invite you to stand and take a short stretch break and I would ask that you not leave the hall. We\'ll resume in just a moment.\n\n(Break)\n\nREX TILLERSON: If you\'d please take your seats. 
The first item of business is the election of 14 directors. I nominate the 14 persons identified on Pages 16 through 20 of the proxy statement. These 14 people are highly qualified to serve on the Board.\n\nAll of our nominees are currently serving as ExxonMobil directors, except for [Ms. Braly], who has been nominated by the Board for first election as a director today. All director nominees are in attendance today with the exception of Jay Fishman who was unable to attend, although I know Jay has dialed in on the Internet to the meeting.\n\nNow I\'d like to ask the nominees seated to my right in the orchestra terrace to be recognized as their names are called and then I\'ll close the nominations. Michael Boskin, Peter Brabeck-Letmathe, Angela Braly, Ursula Burns, Larry Faulkner, Henrietta Fore, Kenneth Frazier, Douglas Oberhelman, Sam Palmisano, Steve Reinemund, William Weldon and Darren Woods.\n\nI declare the polls open for all who want to vote in the election of directors and the 13 remaining items. If you wish to change your proxy instructions on the election of directors or any other of the 13 items or if you have not submitted a proxy and wish to vote by ballot, they are available from the ushers.\n\nPlease raise your hand if you would like a ballot at any time during the formal business. They will be collected after all items have been discussed.\n\nThe next item on the agenda is the ratification of PricewaterhouseCoopers as the independent auditors. The Audit Committee of the Board has appointed PricewaterhouseCoopers to audit ExxonMobil\'s financial statements for 2016, and we are asking shareholders to ratify that appointment.\n\nPricewaterhouseCoopers is represented today by Mr. Alan Page. Alan, would you please stand? Thank you.\n\nThe Audit Committee\'s reasons for recommending PricewaterhouseCoopers appear in the proxy statement. 
I move the adoption of the proposal shown on Pages 24 and 25 of the proxy statement.\n\nThe next order of business is consideration of the Board-sponsored proposal regarding Executive compensation. This board proposal calls for shareholder advisory vote to approve Executive compensation as required by law. The Board recommends a vote for this proposal, as outlined on Pages 26 and 27 of the proxy statement.\n\nThe next order of business is consideration of the 11 shareholder proposals in the proxy statement. The first shareholder proposal regarding an independent chairman is shown on Pages 56 and 57 of the proxy statement. I understand that Beth Richtman will present the proposal. Beth, are you here? Yes.\n\nBETH RICHTMAN, INVESTMENT MANAGER, CALPERS: Good morning. Mr. Secretary, Chair, members of the Board and fellow shareholders. My name is Beth Richtman. I am an investment manager at CalPERS, the largest U.S. pension fund, with approximately $300 billion under management.\n\nCalPERS is a long-term owner with 13 million shares of ExxonMobil worth over $1 billion. I\'m here to present Proposal Number four, which calls on Exxon to separate the positions of chair and CEO in the transition to the next CEO on behalf of the Ellen Higgins Trust.\n\nCalPERS firmly believes that companies with good corporate governance and sustainable business practices will outperform over the long term. In our view, a separate independent chair and CEO structure eliminates the conflicts of interest, which arise when the CEO is also responsible for board oversight. It\'s like grading your own exam papers when you have a chair who effectively oversees the CEO.\n\nWith the announced retirement of Mr. Tillerson, we think it\'s an excellent time to review Exxon\'s leadership structure and improve the governance by separating the chair and CEO roles.\n\nNow is a time of great change, great risk and great opportunity for Exxon and the energy sector. 
Many other governments -- many other companies, large shareowners and governments all agree that the COP21 Paris agreement sets us on a path towards a low-carbon economy.\n\nIt is widely recognized that chairing the Board is a very time-intensive job. Having a separate chair would provide more time for Exxon\'s CEO to focus on running the company and the Board to focus on oversight in this time of unprecedented change for the company and the energy sector.\n\nWith an improved governance structure and improved focus and time commitments for the key leaders at Exxon, we would also expect improved disclosure to shareowners on how Exxon is analyzing and preparing for the transition to the low-carbon economy.\n\nCalPERS appreciates this opportunity to address the Board directly. We think that it is unfortunate that Exxon\'s current policy prevents the Board from having a dialogue with shareowners.\n\nEngagement between shareowners and directors leads to the type of productive discussion of long-term strategic issues that can prevent the necessity of shareholder proposals and proxy battles. We\'re hopeful Exxon will end this policy of preventing these important conversations with your shareowners.\n\nIn closing, please join CalPERS and vote for Proposal Number four to separate the CEO and chair roles and bring our company into the realm of best practice. Because accountability and transparency are the key tenets of good governance, CalPERS has also been actively soliciting with The New York City Comptroller\'s office for Proposal Number seven concerning proxy access and Proposal Number 12 to improve the company\'s risk reporting concerning climate change risk. Combined, these proposals will set Exxon on a future course for success. As long-term owners and as fiduciaries, we ask for nothing less. Thank you.\n\nREX TILLERSON: Thank you. 
The Board recommends a vote against this proposal, as outlined on Page 57 of the proxy statement.\n\nThe next shareholder proposal calls for the addition of a climate expert to the Board and is shown on Page 58 of the proxy statement. I understand Fr. Michael Crosby will present the proposal. Fr. Crosby?\n\nMICHAEL CROSBY, CAPUCHIN FRANCISCAN FRIAR: Good morning, Mr. Tillerson and Board members and fellow shareholders. As Mr. Tillerson said, I am Michael Crosby. I\'m a Capuchin Franciscan friar priest from Milwaukee. So I move our resolution. We\'re asking for a person with climate change expertise to be on the Board for two main reasons.\n\nAs many of you know, ExxonMobil doesn\'t allow its shareholders to have access to the Board, as a whole, or any of its members to discuss these issues independent of management. But the other reason, if you would look at Pages 6 to 8, is according to the references for the Board members, which are substantial, not one person has any expertise on climate.\n\nAnd as a result, the Board is only able to address the issue given what management says. It notes the fact that 12 members have global business credentials, that five members have financial expertise, one has some credentials in chemistry, but management\'s own list has nobody identified with climate risk. And this is the biggest risk facing the company\'s business, not to mention everybody on the planet. 
Maybe if we would have had a climate change expert on the Board, she or he would not have allowed Exxon to act in the way they did when we first filed our shareholder resolution in 1997 on climate change or maybe they wouldn\'t have denied it for the first 10 years after we did file it.\n\nEven as I speak today, the company is still supporting efforts, it seems, by some to undermine state\'s efforts to ensure the health of their people by having some kind of mitigation around climate change.\n\nToday, at the meeting, we heard from CalPERS and CalPERS just issued a statement that from now on, they\'re going to be asking all companies to have somebody with climate change on their board. If that\'s going to be for all companies, how much more for a company where almost 100% of our product and our efforts and production is related to climate change.\n\nI\'d like to compliment you on the way you began your remarks. I think it\'s the first time, and you know I\'ve been here quite a bit, where you begin with the problem of climate change and acknowledged it. But at this stockholders meeting, with more press participating than ever, ExxonMobil has a chance to restore the public\'s trust. This calls for conversion. I like the kiosk out there that talked about the drone and we can create disruptive change, you said.\n\nWe don\'t want disruptive change. We want constructive change and this conversion invites us to change. As a Catholic priest, I join Pope Francis in believing that conversion is possible to more and more efforts around renewables. And so as a result of that, I urge support for our resolution. Thank you.\n\nREX TILLERSON: Thank you, Fr. Crosby. The Board recommends a vote against this proposal, as outlined on Pages 58 and 59 of the proxy statement.\n\nThe next shareholder proposal, to hire an investment bank as shown on Page 59 of the proxy statement. I understand that Tracey Rembert will present the proposal. Ms. 
Rembert?\n\nTRACEY REMBERT, SHAREHOLDER, CHRISTIAN BROTHERS INVESTMENT SERVICES: Thank you, fellow shareholders, Mr. Tillerson and the Board. My name is Tracey Rembert and I\'m here representing Christian Brothers Investment Services, which is a Catholic investment manager with over $6 billion in assets under management. We\'ve been a longtime shareholder of ExxonMobil with over 329,000 shares currently, and we want to formally move Item Number Six, Ken Steiner\'s proposal on hiring an investment bank to break up the company.\n\nWhile we don\'t agree with this proposal, we are moving it as a formal courtesy to a fellow shareholder. We do think it -- we\'re at an interesting time in discussion of the company and its current business model. But we haven\'t given up on ExxonMobil just yet. We think that you can be prepared for a diversified energy future, but we can\'t keep having these conversations year-over-year. I think in terms of action that\'s needed, solutions that need to come to the table, I really hope I\'m not here 10 years from now having the same conversations and the company has dramatically started to address some of the issues raised.\n\nI\'m here to talk about impact on the world\'s poor, and issues continue to circle around the company about the nimbleness of your current business model to be able to address issues. Exxon presents a false choice between providing reliable and affordable energy and addressing climate change. We don\'t think it\'s an either/or. We think right now, absolutely, both issues need to be addressed. Impacts the world\'s poorest communities, including those in the global South, are happening now. Climate change is not some abstract thing likely to happen 80 or 100 years from now. 
It is here, and it is visceral, and it\'s devastating the world\'s poor people, and Exxon has said for over a decade that it wants to provide affordable energy to the world\'s poor, but only through business as usual measures, but we expect more.\n\nWe question whether lives will actually be improved in low-income communities if they are regularly suffering from the impacts of drought, extreme weather, geopolitical upheaval due to climate change and other factors. We want to hear from our company how you will begin to address these impacts that are a consequence of your core products and current business model.\n\nThis is not about "abundant energy supply" as you just mentioned in some of your remarks earlier. This is about abundant ultra-low carbon energy supply for the masses that want energy. This is what is needed.\n\nWe currently question whether ExxonMobil long term will do more harm than good with its current business strategy. And, therefore, we really urge you, Mr. Tillerson, especially the Board, as new leadership comes in, it\'s now or never.\n\nThe impacts to the world\'s poor -- we\'ve far surpassed, I think, the point where people are suffering and we\'d really like to see Exxon bring some viable solution to the table, not just talking about small investments in an energy transition. We do want to see Exxon actually steer us to a low-carbon path. Thank you.\n\nREX TILLERSON: Thank you, Ms. Rembert. The Board recommends a vote against this proposal, as outlined on Page 59 of the proxy statement.\n\nThe next shareholder proposal for a proxy access bylaw is shown on Page 59 and 60 of the proxy statement. I understand that Michael Garland will present the proposal. Mr. Garland?\n\nMICHAEL GARLAND, ASSISTANT COMPTROLLER FOR CORPORATE GOVERNANCE AND RESPONSIBLE INVESTMENT, OFFICE OF NEW YORK CITY COMPTROLLER: Good morning Mr. Chairman, members of the board and fellow shareowners. 
I\'m with the office of New York City Comptroller, Scott Stringer, and I\'m presenting Proposal Seven on behalf of The New York City Pension Funds.\n\nThese funds have $160 billion in assets and are substantial, long-term ExxonMobil shareowners. We\'re voting 10.5 million shares at today\'s meeting valued at nearly $1 billion and this represents the retirement security for New York City\'s teachers, firefighters, police and other loyal city employees.\n\nProposal seven calls for a proxy access bylaw to enable shareowners that have collectively held at least 3% of the company for three years to include a limited number of director candidates on the company ballot.\n\nThe proposed bylaw is intended to give substantial long-term shareowners a meaningful voice in electing the directors responsible for overseeing the company\'s long-term strategy and risks, including risks related to climate change.\n\nProxy access will enable Exxon shareowners to ensure that the Board is made up of independent and accountable directors who have the diverse mix of relevant skills, experience and perspectives necessary to navigate these challenges and create long-term shareowner value.\n\nThe proposed bylaw includes appropriate safeguards to prevent abuse, a 3% interest in Exxon is currently valued at over $11 billion. By any measure, a significant stake in the company\'s future. There\'ve been extraordinary developments since last year\'s Annual Meeting when the proposal received 49% of votes cast and these highlight why this is a defining moment for the board and why proxy access is so important for Exxon shareowners.\n\nOxford University released a study that found that major oil companies face significant risks from groupthink because they make large lumpy investments with long time horizons. 
The study found that Exxon had the least diverse board, particularly with respect to age range, nationality and industry experience among the six largest oil and gas companies in the world, publicly traded, and was therefore most exposed to risks of groupthink.\n\n175 companies -- 175 countries signed the Paris agreement, demonstrating that there\'s global political will to limit carbon emissions, a scenario the Board must take more seriously.\n\nThe Saudi government announced dramatic steps to prepare its economy for the inevitability of a low-carbon future. Major oil companies like BP and Shell have supported shareowner requests for climate risk reporting similar to Proposal 12, which The New York City funds have joined with CalPERS to actively support along with proxy access.\n\nFinally, since last year\'s meeting, more than 200 U.S. companies have enacted proxy access bylaws on terms similar to Proposal Seven. They include the largest U.S. oil companies other than Exxon. They include seven companies at which Exxon directors also serve as directors.\n\nMany of these companies have voluntarily enacted proxy access without the need for a vote, including more than 70% of the 72 companies that received proposals from New York City this year.\n\nExxon\'s board is an outlier in its continuing opposition to proxy access, the climate risk reporting and to engaging directly with shareowners. It\'s time for the board to demonstrate its commitment to an accountable system of corporate governance that fosters long-term value creation.\n\nWe urge shareowners to vote for Proposal Seven. Thank you.\n\nREX TILLERSON: Thank you, Mr. Garland. The Board recommends a vote against this proposal, as outlined on Pages 60 and 61 of the proxy statement.\n\nThe next shareholder proposal calls for a report on compensation for women and is shown on Pages 61 and 62 of the proxy statement. 
I understand that Tom Sifferman will present this proposal.\n\nTOM SIFFERMAN, REPRESENTATIVE, MOBIL OIL: Thank you, sir. It\'s good to be back here again, Chairman. The Board of Directors, shareholders, this is my second time presenting this. My name is Tom Sifferman as indicated. I\'m a Ph. D. and Registered/Licensed Professional Engineer and speaking on behalf of Eve Sprunt, my former coworker at Mobil Oil. She\'s a Ph. D.\n\nI have eight bullet items, most of it it\'s in the proxy statement, but one thing I wanted to say is that there is no doubt that Exxon from ExxonMobil\'s statement in the proxy that they are attempting to help women advance.\n\nThe question is how effective those actions have been? Subtle and subconscious prejudice is difficult to combat as a major factor for women\'s underrepresentation in leadership roles. The value that is placed on someone\'s role within a company is reflected in their monetary compensation.\n\nWomen in management and executive ranks are often underrepresented and support functions that are less well compensated instead. The results of ExxonMobil\'s actions are -- if they are so good, please show us the money, that\'s the important thing, provide data on how women\'s compensation compares with men\'s at all levels. The Bureau of Statistics and Labor reports that 76% of human resource managers are female, but on the average, their pay is only 71% of their male counterparts. If human resource managers, who are the company\'s watchdogs can\'t take care of themselves, who can?\n\nThe proof of how successful the efforts are is in the money. Again, show us the money. We need to see the data. If, and that\'s a question if, if ExxonMobil is a leader in advancing women the compensation data should be a valuable recruiting tool to attract the best and brightest women, let\'s use it for our advantage. 
As shareholders, we want ExxonMobil to succeed in advancing women to attract and retain the most advanced, most talented and productive women.\n\nWomen also invest money in different funds and fund managers nowadays don\'t seem to be too concerned about sending female investors a message that there should be pay transparency. Basically, show us the money. Thank you very much.\n\nREX TILLERSON: Thank you, Mr. Sifferman. The Board recommends a vote against this proposal, as outlined on Pages 62 and 63 of the proxy statement.\n\nThe next shareholder proposal calls for a report on lobbying and is shown on Pages 63 and 64 of the proxy statement. I understand that Hughes Jenkins will present the proposal. Mr. Jenkins?\n\nHUGHES JENKINS, EMPLOYEE, UNITED STEELWORKERS: Fellow shareholders and members of the board, my name is Hughes Jenkins, and I\'m a nine-year employee at ExxonMobil in Baton Rouge, Louisiana. On behalf of the United Steelworkers and (inaudible) followers, I hereby move Item Nine. The proposal asking our company to provide a report on state and federal lobbying expenditures including indirect funding of lobbying through trade associations and support for the American Legislative Exchange Council known as ALEC.\n\nIllustrating the deep international concern about lobbying and climate change, AP7, a Swedish pension fund, which owns over 3.4 million shares joined as a co-filer this year. Transparency and accountability in corporate spending to influence public policy are the best -- are in the best interest of ExxonMobil shareholders, and a high profile investigation into whether our company misled investors on climate change only underscores the importance of full disclosure.\n\nCorporations contribute millions of dollars to trade associations, they\'re directly -- they lobby indirectly on their behalf without specific disclosure or accountability. 
ExxonMobil does not even disclose its trade association membership nor the portions of these payments used for lobbying, so shareholders currently have no way to know how much ExxonMobil\'s trade association contributions are being used to lobby on its behalf.\n\nExxonMobil also contributes to tax-exempt organizations [their] right and endorse model state legislation playing a key role in ALEC. ALEC has attracted negative attention for its role in promoting bills on anti-immigration policies, and also blocking EPA regulation on climate change. ALEC also promotes legislation that seeks to diminish workers\' rights in our country. As a proud member of the United Steelworkers Union, I find many of ALEC\'s proposed laws to be distasteful.\n\nThis is one of the reasons why we are seeking full disclosure on lobbying. Our company argument against Item Nine suggests shareholders can use government (inaudible) to get this information, but public information does not provide a clear picture of ExxonMobil\'s lobbying expenditures, nor is it easy to assess.\n\nWe urge shareholders to vote for this proposal to mandate our company supply this important report. Thank you.\n\nREX TILLERSON: Thank you, Mr. Jenkins. The Board recommends a vote against this proposal, as outlined on Pages 64 and 65 of the proxy statement.\n\nThe next proposal calls for a commitment to increase the total amount authorized for capital distributions and is shown on Page 65 and 66 of the proxy statement. I understand that [Natasha Lamb], will present the proposal. Ms. Lamb?\n\nNATASHA LAMB, DIRECTOR OF EQUITY RESEARCH, ARJUNA CAPITAL AND BALDWIN BROTHERS: Good morning, Mr. Chairman, members of the Board and fellow shareholders. 
My name is Natasha Lamb, and I am here on behalf of Arjuna Capital and Baldwin Brothers\' client Eric MacCallum to move Proposal Number 10, which asks our company to prioritize profitability and value over growth by returning more capital to shareholders.\n\nWe ask this in light of increasingly risky investments in potentially stranded carbon assets. That is carbon assets we can\'t burn without irreversible climate disruption. Implementing the proposal would represent a prudent, albeit, disciplined path forward in the face of unburnable carbon assets, which represent approximately two-thirds of proven global reserves according to the International Energy Agency.\n\nThese reserves cannot be burned when the Paris Climate Agreement established by 196 countries to prevent a less than 2 degrees Celsius rise in global temperature goes into effect. For a while, the business plan of extracting as much carbon as quickly as possible was a winner, last century, that same plan will destroy value this century and already has. Historic levels of capital spend on growth assets has eroded profitability and Exxon\'s risk profile, return on equity and return on invested capital are at historic lows.\n\nProfitability has fallen 68% over the last decade and Exxon\'s sterling credit rating has just been downgraded, increasing the company\'s cost of capital. Continuing to grow high cost fossil fuel reserves in the face of global climate change disruptive agreement is no longer prudent.\n\nCitigroup estimates unburnable fossil fuel reserves could amount to over $1 trillion out to 2050. 
While the Carbon Tracker Initiative estimates that as oil majors choose to undertake projects consistent with a 2-degree demand level, combined upstream assets would actually be worth $140 billion more.\n\nWe are asking our company to prioritize value over growth, investing in the most profitable core carbon assets and returning a greater percentage of profits to shareholders, which would allow Exxon to decrease in size while increasing profitability, essentially shifting from a business plan of growth for growth\'s sake to one of value.\n\nTo the opposite end, profitability has decreased, while total capital distributions have fallen for the last three years, by 35% last year and they\'re projected to fall 50% in 2016 from 2014 levels. I urge shareholders, management and the board to fully consider this strategically important issue at such a critical juncture. Thank you.\n\nREX TILLERSON: Thank you, Ms. Lamb. The board recommends a vote against this proposal as outlined on Pages 66 and 67 of the proxy statement.\n\nThe next shareholder proposal calls for a policy to limit global temperature increases to 2 degrees centigrade and is shown on Pages 67 and 68 of the proxy statement. I understand that Sister Patricia Daly will present the proposal.\n\nPATRICIA DALY, SISTERS OF ST. DOMINIC OF CALDWELL NEW JERSEY: Good morning, Mr. Tillerson. How are you today?\n\nREX TILLERSON: I\'m well. Thank you.\n\nPATRICIA DALY: Good morning, everybody, and members of the board. My name is Sister Patricia Daly. 
I\'m a Dominican Sister of Caldwell, New Jersey, the lead filer along with 32 other institutional investors, most of whom -- many of whom are here today, but most of whom are members of the Interfaith Center on Corporate Responsibility, representing faith-based initiatives, institutions, Native American tribes, health care systems and asset management firms.\n\nI move Item 11 on the proxy, which asks ExxonMobil to adopt a policy acknowledging the imperative to limit global warming to 2 degrees Celsius. This resolution calls for moral leadership on the part of our company. Accepting the moral obligation of the 2-degree target in the midst of the climate crisis is surely the lowest hurdle our company will face on this issue.\n\nInstead, in opposing this resolution, our company has chosen to disregard the consensus of the scientific community, the will of 195 nations that signed the Paris Agreement in December, many of our peers in the oil and gas industry and the calls of faith leaders from the Catholic, Jewish, Christian, Muslim, Buddhist and interfaith traditions. I\'m particularly and personally challenged by our Pope Francis in his encyclical letter, Laudato Si\', that was published last year, a reflection to all peoples on the planet on our common home. I hope people here may take the time to read that.\n\nOpposing this resolution positions our company against a movement for climate justice. As the world moves forward, ExxonMobil stands still. Every day, new reports and incidents demonstrate the importance of the 2-degree target for the sake of our planet for future generations and vulnerable communities, communities hit hardest by climate change. Record wildfires still rage in Alberta, disrupting the production of the very tar sands fueling global warming. 
We can see impacts today all around us.\n\nIn a world already facing the worst refugee crisis in generations, continued conflict and war and projected extreme heat waves that will render parts of the world uninhabitable, we must limit our warming to 2 degrees. Decades have been lost in the fight against climate change due in part to our company\'s deliberate campaign of disinformation.\n\nMr. Tillerson, last year, at the oil and gas -- the annual Oil & Money Conference, you said and I quote, "There\'s also a humanitarian dimension and a moral imperative to what we do." Today, we ask our company to acknowledge the full breadth of its moral imperative, moral responsibilities to provide the solutions to energy poverty while limiting global warming to less than 2 degrees. Anything less is a false solution for the world\'s poorest and most vulnerable. Please join your peers like Shell, BP and Saudi Aramco who stated, and I quote, "Our shared ambition is for a 2-degree future. It\'s a challenge for the whole of society. We are committed to playing our part." Now ExxonMobil must play its part, too.\n\nMr. Tillerson, you said earlier in the energy outlook that you believe that the energy outlook is in line with the 2-degree limit. If you really believe that, why would you and our Board of Directors recommend a vote against this resolution? Thank you.\n\nREX TILLERSON: Thanks, Sister Pat. The board recommends a vote against this proposal as outlined on Pages 68 and 69 of the proxy statement.\n\nThe next shareholder proposal calls for a report on impacts of climate change policies and is shown on Pages 69 and 70 of the proxy statement. I understand that Edward Mason will present the proposal. Mr. Mason?\n\nEDWARD MASON, SHAREHOLDER, CHURCH COMMISSIONERS FOR ENGLAND: Thank you. Chairman, members of the board, shareholders, good morning. My name is Edward Mason from the Church Commissioners for England, who manage the Church of England\'s endowment. 
We\'re long-term shareholders interested in the sustainable success of the companies in which we invest, and it\'s a pleasure to be here in Dallas for Exxon\'s annual meeting.\n\nThe Church Commissioner is a lead co-filer of shareholder Proposal Number 12, which was filed by New York State Common Retirement Fund, whom I\'m also representing. The resolution asks Exxon to publish annually an analysis of how its portfolio stress tests against a scenario in which the world restricts warming to 2 degrees. This is a reasonable request.\n\nChairman, as you have acknowledged this morning, climate change is real. The desire of global governments to restrict warming to below 2 degrees, affirmed in the Paris Agreement, is real. The financial risks and opportunities for companies associated with the transition to a low-carbon economy are real. These risks have been recognized by the global Financial Stability Board chaired by the governor of the Bank of England, which has created a task force on climate-related financial disclosure.\n\nExxon\'s peers have agreed to provide regular portfolio resilience reporting, including BP, Shell and Total. At BP and Shell\'s annual meetings last year, the reporting request was endorsed by 98% of shareholders at both companies. Because of Exxon\'s decision to diverge from its peers and oppose the request in this resolution, management will today experience a major shareholder revolt.\n\nChairman, the board is losing the confidence of its investors on climate change. In the run-up to this annual meeting, investors with over $10 trillion of assets affirmed their support for this shareholder proposal.\n\nMany of the world\'s largest asset managers are voting against management today -- Aegon, Amundi, Aviva Investors, AXA Investment Managers, BMO Global Asset Management, BNP Paribas Investment Partners, HSBC Global Asset Management, Legal & General Investment Management, Natixis Asset Management, Robeco and Schroders. 
The world\'s largest sovereign wealth fund, the Norwegian Government Pension Fund, pension schemes from around the world from the public and private sector alike and church investors like us from three continents are all voting against management.\n\nChairman and members of the board, Exxon can do better. Following today\'s vote, the investors backing this proposal look forward to starting afresh and having a responsive and productive engagement with the company on climate-related disclosures.\n\nREX TILLERSON: Thank you, Mr. Mason. The board recommends a vote against this proposal as outlined on Pages 70 and 71 of the proxy statement.\n\nThe next shareholder proposal calls for the company to report reserve replacement in BTUs and is shown on Page 71 of the proxy statement. I understand that Danielle Fugere will present the proposal. Ms. Fugere?\n\nDANIELLE FUGERE: Thank you. Ladies and gentlemen of the board, Mr. Chairman and all shareholders gathered here today, good morning. My name is Danielle Fugere. Thank you for the opportunity to move Item 13 entitled Carbon Asset Transition. This proposal sends a clear message that shareholders want our company to prepare to thrive in the coming clean energy economy. It\'s becoming clear, and I think we\'ve heard today, that business as usual is no longer a viable strategy for the 21st century and in fact is incompatible with a livable planet.\n\nRecognizing the urgency facing businesses to respond to climate change, this proposal seeks to provide a means by which our company can move beyond the current business model, begin to respond to new opportunities and consider becoming an energy company for the future.\n\nThis proposal asks Exxon to account for energy reserves in resource-neutral BTUs, or British thermal units, an internationally accepted energy unit, in addition to the traditional barrels of oil equivalent. Since Exxon already uses BTUs, this reporting is not difficult to accomplish. 
And this seemingly minor accounting tweak will have major consequences. It will help our economy and the market account for and place value on a range of energy resources beyond just oil and gas, including geothermal, cellulosic biofuels, wind, solar and other energy resources. By aligning one of the market\'s primary valuation metrics, reserve replacement, with this imperative for lower carbon energy, a path to become a truly diversified and competitive energy company can be forged.\n\nCurrently, in oil and gas companies, market value is based in large part on reserve replacement ratio. Did the company replace every used barrel of oil it produced with a newly discovered barrel of oil? If annual oil reserve replacement isn\'t 100% or greater, a company\'s stock market value may be impaired. Unfortunately, we saw both of these things happen to Exxon this year. When S&P downgraded the company\'s credit, it noted that in its view, the company\'s greatest business challenge is replacing its ongoing production.\n\nThe all-important reserve replacement metric locks management into an endless hunt for more oil and gas. So shareholders look to Exxon to be a leader in beginning the inevitable transition to becoming a diversified energy company, able to compete in a decarbonizing economy. Imagine energy analysts looking at Exxon\'s filings in the future and seeing 67% of barrels of oil equivalent replacement and 100% BTUs, and then the next year, 60% barrels replaced and again, 100% BTUs. This tells a story of a company in transition to not only survive, but thrive in a clean energy economy. We urge shareholders to vote in support of proxy Item 13. Thank you for your attention.\n\nREX TILLERSON: Thank you, Ms. Fugere. The board recommends a vote against this proposal as outlined on Page 72 of the proxy statement.\n\nThe last shareholder proposal calls for a report on hydraulic fracturing and is shown on Pages 72 and 73 of the proxy statement. And I understand, Ms. 
Fugere, you will present this proposal as well.\n\nDANIELLE FUGERE: Thank you. I\'m also presenting Item 14, a shareholder proposal requesting Exxon to increase its transparency on hydraulic fracturing operations. A similar proposal was filed last year and received 25% support from voting share owners.\n\nRather than improving its transparency, however, Exxon has remained a laggard in demonstrating to shareholders that it is using practices that reduce risk and reduce harms. In the face of well-known environmental and social impacts, the public has continued to pass restrictions and moratoria, limiting business opportunities and imposing a wide range of costs on companies, from the cost of delay to complete loss of access to valuable resources, all of which materially impact investors\' holdings. Improved reporting helps reassure the public that companies are addressing local issues such as air and water pollution, truck traffic and noise and competition for water in water-scarce areas.\n\nLocal issues require local reporting, thus we ask the company to report on local issues on a play-by-play basis, so that neighbors can understand specifically what the company is doing in their neighborhood. If Exxon reports worldwide reductions in air pollution, that does nothing to alleviate concerns of local community members about what the company might be doing in their neighborhoods.\n\nBetter reporting also provides information as to whether the company is using best practices in places where they are not legally required to do so. Stating that the company follows the law is not useful in places where the law doesn\'t sufficiently protect communities. Methane leakage is a good example. The public and lawmakers are focused like lasers on the issue of whether companies are monitoring or/and addressing methane leakage where it occurs. If leaks aren\'t addressed, natural gas may become worse for the climate than coal. 
While methane leakage laws are being developed, it\'s critically important that industry and Exxon get out in front of the issue and trumpet the actions they are taking to prevent or address methane leaks.\n\nExxon is currently far behind other companies in its reporting. Leaders like BHP Billiton, Hess, Apache, CONSOL and Noble Energy earned top scores this year in a report called Disclosing the Facts, while Exxon rated near the bottom, again, far behind its peers and frankly, behind where such a leading company should be. Peer reporting has earned these other companies accolades, and the sky hasn\'t fallen around them.\n\nIn the absence of meaningful disclosures, shareholders have no way to fully assess the steps Exxon is taking, both over time and relative to peers, regarding its actions to reduce the impact of its shale oil and gas operations. It\'s beyond time that Exxon begin to demonstrate leadership in this important area. We encourage shareholders to vote for proxy Item number 14 to increase Exxon\'s transparency and communicate on these important issues. Thank you.\n\nREX TILLERSON: Thank you, Ms. Fugere. The board recommends a vote against this proposal as outlined on Pages 73 and 74 of the proxy statement.\n\nAll items of business have now been introduced. I invite any of you who want to, to stand and stretch your legs again for a minute. And for those of you who wish to address the meeting in the discussion period, this would be a good time for you to move toward the aisle to the reserved seats, so you\'ll have ready access to a microphone.\n\n(Break)\n\nREX TILLERSON: Okay, let\'s resume the meeting, so if you would take your seats please.\n\nI would now open the floor for discussion on the items of business presented. I\'d like to emphasize that your comments should be limited to only the board and shareholder proposals in the proxy statement at this time. 
As I said earlier, we\'ll try to set some time aside to address other topics if we have time remaining. We received a number of questions on proxy cards and through our website. As time permits, we\'ll try to get to some of those.\n\nAs described earlier, move to the aisle if you\'ve not already done so, or if not already seated nearby. Remain seated and raise your speaker identification card to indicate you want to address the meeting. When recognized, give your completed speaker identification card to the usher, and a microphone will be provided. Stand and begin by stating your name and the board or shareholder proposal you want to comment on. Bear in mind the rules of conduct shown in the program. Make your comments as brief as possible so that we can allow as many people to address the meeting as wish to do so. We\'ll continue to use the lighting system to help you manage your time. First priority will be given to those who have not yet had an opportunity to speak. So I would welcome your questions or comments at this time.\n\nQuestions and Answers\n\nREX TILLERSON: There in the back.\n\nDAVID RIESMAN, SHAREHOLDER: Fellow shareholders -- oh sorry. My name is David Riesman. I\'m speaking on Item Nine, lobbying. Fellow shareholders, member of the board, I\'m here representing Yale University\'s Dwight Hall, one of the co-filers of the lobbying proposal.\n\nYale and over 1,000 academics have publicly indicated support of this proposal. In the past year, Exxon has come under unprecedented scrutiny of its political spending and public policy positions, including ongoing investigations by the state attorney general to determine whether ExxonMobil deliberately misled the public and investors. 
This investigation and the risk it poses to shareholder value underscore the need to improve disclosure and oversight of our company\'s lobbying, especially through third parties.\n\nExxon publicly supports a carbon tax and acknowledges the realities of climate change, but its lobbying practices appear to conflict with its positions. Exxon\'s management justifies its support of organizations that obstruct climate policy by claiming that Exxon supports some, but not all, of the actions of such organizations.\n\nHowever, because Exxon is an oil and gas company, the company\'s exposure to climate policy obstructionism is in danger of being construed as intentional. We believe that ExxonMobil shareholders deserve to understand the process by which the company contributes to organizations whose positions contradict Exxon\'s stated principles, blocking progress on climate change.\n\nIt\'s hard to take these principles in good faith while seeing these contradictory practices. For example, Exxon is an active leader and funder of ALEC, a primary agent of climate policy obstructionism. The company has justified participation in ALEC on the basis of the organization\'s STEM lobbying, among others.\n\nHowever, Exxon can pursue STEM lobbying through other organizations. Recently, 100 companies have publicly left ALEC. Many companies, including Shell and BP, have determined that ALEC\'s harmful climate obstruction outweighs any of ALEC\'s benefits. Given that ALEC\'s harmful policy obstruction seems to overshadow any other membership benefits, why is this company still associated with ALEC? Thank you.\n\nREX TILLERSON: Thank you. Other speakers? All right, down here. I didn\'t hear a question.\n\nDAVID RIESMAN: Why the company is still associated with ALEC, given the previous statements?\n\nREX TILLERSON: Well, ALEC, as you probably know, is an organization of state legislators and legislative bodies, over 400 members. 
And most of the most productive legislation in this country is passed at state levels.\n\nWe have found our engagement with them to be very productive in a number of broad areas, whether it\'s tax policy, other regulatory areas, particularly educational reform, and that is why we remain engaged with them. We find them to be a highly competent and very thoughtful policy organization. Whether we agree with the positions they ultimately take or not, we find the engagement very useful to our understanding of these issues, and we intend to continue.\n\nDown here.\n\nMICHAEL MACCRACKEN, CHIEF SCIENTIST, CLIMATE CHANGE SCIENTIST, MERCY INVESTMENT SERVICES: Thank you. Mr. Tillerson, members of the board, my name is Michael MacCracken. I\'ve been a climate change scientist for nearly 50 years, and I\'m here on behalf of Mercy Investment Services, speaking with respect to the items raised in Proposals 11 and 12.\n\nAt last year\'s shareholder meeting, Mr. Tillerson, you responded to an audience question that climate model results were too uncertain to be reliable, and that if emissions cutbacks were not sufficient, engineering ways would emerge and serve as plan B. While we\'ll have no -- or we\'ll have no choice but to try and adapt, I\'d suggest that the international scientific community would take strong exception to the viewpoints you expressed with respect to what it means with regards to national and international policies.\n\nOn the first point, in DOE\'s major, widely peer-reviewed 1985 climate change assessment that preceded IPCC, Exxon\'s leading climate change scientist for the past several decades, Dr. Brian Flannery, coauthored the chapter on projecting climate change. That chapter concluded that quote, "Climate models currently available when run with standard scenarios of fossil fuel CO2 emissions indicate a global warming of the order of 1 degrees C by the year 2000 relative to the year 1850, an additional 2 to 5 degrees C warming over the next century." 
So that projection was made three decades ago, and it\'s still the case today. Basically, we\'ve had a good sense of what the large-scale outcomes were, but the actions we have taken have really not changed things much.\n\nIndeed, as you said last year, improving climate models and having more data and computer resources have been helpful, but there are inevitable uncertainties. The top line conclusions, however, have not changed over this entire period. The main reason now for the differences between models is with respect to the choices we make with what energy technologies we have, what emissions we have. So basically, we need to decide what\'s going to happen with respect to the future.\n\nWith the potential -- with respect to the potential for mitigation -- with adapting, all major assessments indicate that it\'s going to be very hard technologically and economically to adapt to a lot of the different consequences, biodiversity and other things.\n\nSo my question would relate to these resolutions, have to do with what due diligence analyses ExxonMobil has done to date with respect to the likelihood and what we\'re finding in the scientific community, the consequences are becoming more severe -- likely to be more severe than IPCC has been projecting? And what win-win solutions ExxonMobil is looking for to try and meet the more aggressive actions that are going to be needed to meet the Paris accord and objectives.\n\nREX TILLERSON: Well, as you and I have spoken before, my view on the competency of the models has really not changed. And in fact, that is reflected in the IPCC\'s own FAQs. In the FAQs following the Paris climate accord, the question was asked of the IPCC panel, is there a scientific basis for the 2-degree centigrade target?\n\nAnd you can go to the FAQ and find this for yourself. I\'m going to paraphrase it. The answer is no. There is and they say it, they go on to say there is no scientific basis for the 2-degrees centigrade target. 
The target has been one chosen by society in a general consensus process. I take no exception to that, okay? So we can set the science apart from what we, as society, decide we\'re going to choose to pursue as a matter of policy.\n\nSo now your question is as a matter of policy. And I said to you relative to the understanding of the science, there\'s no space between us and the IPCC. We see the science the same way. Our differences on policy choices are, I think, one that is grounded in reality. And the reality is, there is no alternative energy source known on the planet or available to us today to replace the pervasiveness of fossil fuels in our global economy and on our very quality of life, and I would go beyond that and say our very survival.\n\nSo it is a judgment of balance between future climatic events, which could prove to be catastrophic, but are unknown by the IPCC\'s own acknowledgment, and more immediate needs of humanity today to address poverty, starvation, broad-based disease control and the quality of life that billions of people are still living in today, which is unacceptable to many of us. And the only way out of that is to provide them the energy sources we have today. And that\'s why we continue to believe fossil fuels will have a significant and important role to play for as far as we can see. And no government studies disagree with that.\n\nSo then your second question is what do we do? So then what we do is we have now, for more than three decades, focused our research efforts on given that, that is the case, how can we provide people the capability to use these fuels and these energy sources in a less impactful way? And in the early days, we invested enormous amounts of time and energy on energy efficiency. How can I continue to carry out my activities consuming the same amount of fossil fuels and have a lower impact on the atmosphere and the surrounding environment? 
And we continue that.\n\nWe have advanced our technology investigations, and I addressed them today. We think one of the important breakthroughs that is needed is in carbon capture sequestration. If, in fact, our view is -- turns out to be true and the world\'s going to have to continue to use fossil fuels whether they like it or not, if we can achieve a breakthrough on carbon capture sequestration, we can continue using fossil fuels, capturing the carbon in an economic way and sequestering it. And that is a huge if, but it\'s also a huge breakthrough that would change things dramatically. A lot of people are working on that. And we\'ve been working on it for a very long time. And I mentioned today a new endeavor we have entered into that we believe has promise, but we also know it is many years away from perfection.\n\nSo those are the kinds of things we are doing. So we are not ignoring the risk that is out there. And I think as you and I discussed, I don\'t think we see it all that differently. Our differences are how we think we\'re going to address it. And again, we are grounded in the reality of the world we live in today and what is known technologically to us today. And I would invite you to read a piece that Bill Gates recently wrote where he has undertaken on himself to support research. And he would come to the same conclusion. He and I have had this conversation. There\'s no space between we -- he and I on this either. We\'ve got to have some technology breakthroughs. But until we achieve those, just saying turn the taps off is not acceptable to humanity. Thank you.\n\nMICHAEL MACCRACKEN: If I may just for a moment respond on the issue of the Paris levels. I mean, it is the case that there\'s no sort of sharp threshold because consequences are getting worse and worse and worse. 
And society has set that partly as a political one because it can\'t achieve what the scientific community would suggest, which is -- it was about 0.5 degree warming when we started losing mass from Greenland and the Antarctic ice sheets. And what paleoclimatic evidence talks about and would say, coming out of the last glacial and as has been the case in the past history, is that sea level rise ultimately is of the order of 15 to 20 meters per degree C warming in global average temperature. It\'s tremendous, not something that is easily adapted to.\n\nSo yes, society chose an arbitrary level, and they haven\'t committed yet to try and even -- made the commitments to even get to that level. But really, they should be pushing down and doing all they can. And that\'s what we\'re basically asking for is doing all that can be done with regard to cutting emissions.\n\nREX TILLERSON: I understand, and so I\'m going to respond to a couple of points you made there, and then we\'re going to move on to another speaker. The factors that you just described, 15 meters, half a degree C, there are a large number of alternative views that suggest those are the most extreme outcomes. And in fact, an IPCC scientific paper, as you well know, they show a very broad range. So you are focusing on the worst possible combination of events from all of the possible scenarios. We choose to look at it as a range of risk, and that\'s the way we manage our business as well. And again, I don\'t think there\'s any space between us other than the solutions that we think we have to pursue.\n\nYes, sir, in the middle?\n\nDAVID MARTINEAU, GEOLOGIST: My name is [David Martineau]. I\'m a geologist, and I\'ve been drilling and fracking wells for about 56 years now. But the reason I\'m here kind of is this interest -- they used to call it global warming, but now it\'s called climate change. You want to know why? 
The planet has cooled by 0.07 degrees in this past century, partially because of the Exxon emerging energy policies. We thank you for that.\n\nBut if you want to know about global warming, there\'s a book called Unstoppable Global Warming, and it was a bestseller in The New York Times. And it tells you why we have global warming today, and we\'ve been having it forever and ever.\n\nThe other item was an article that was written by, I can\'t read my numbers here, let me (inaudible - microphone inaccessible) by Ian Pilmer (sic - Ian Plimer). He\'s an Australian geologist, and he edits the Encyclopedia for Geology. And he came up with where does carbon dioxide really come from? A four-day volcano eruption in Iceland negated every single effort for the past five years to control CO2. Also, there was one in the Philippines in 1991. It spewed out enough greenhouse gas in a year to take care of everything that\'s happened in our country forever and ever as far as CO2. CO2 is that vital chemical compound that every plant requires to live and grow and synthesize into oxygen for us humans and all animals to live. We need -- global warming is here and it\'s going to be here forever. Trying to change it here the last few bits doesn\'t make sense.\n\nOne quick thing about fracking. The first frac job occurred, believe it or not, in 1866. Colonel Edward Roberts got out of the military -- Civil War, and he came up with the idea of dropping gunpowder down in a wellbore where they were drilling shallow wells. And they went ahead and ignited that, and it increased the production tenfold. And we\'ve been fracking wells for 100 years. Thanks to that, we have a lot of oil and gas. He patented that, and people didn\'t want to pay the patent fee. So they would do it at night, and that\'s where the word moonlighting came from. But if you all don\'t like what you\'re doing here, why -- I mean if you don\'t like Exxon, why don\'t you go ahead and sell it and buy some solar stock? 
And whether you\'ll have enough money to pay for your gasoline and pay for your jet fuel that you used to come to visit here, but I recommend you vote against all of the carbon planet change proposals. Thank you.\n\nREX TILLERSON: Thank you. In the inside aisle here.\n\nRENEE BOUCHARD: Good morning. My name is [Renee Bouchard] and I\'m actually from Texas. I live in Wimberley, Texas. Maybe you all are aware, if you watched the news a year ago yesterday, we were inundated by a flood that devastated our town. I lost our home. My family lost our home as well as 400 other homes in Wimberley were destroyed by the catastrophic flood. Needless to say, my feeling is that the effects of climate change are real and are happening now, not something that\'s abstract and in the future, not something that\'s just been affecting indigenous populations, even though that\'s really heartbreaking as well. Our flood was one of five flooding-related federal disasters in Texas just in the past two years.\n\nI\'ve lived in this area for over 20 years. Flooding, as you know, you\'ll be saying: flooding happens in that part of Texas. That\'s true, but not like this, a 40-foot wall of water, 13 inches of rain. Anyway, when these disasters happen, federal taxpayers spend billions on aid and bailouts. As you all probably are aware, State Attorneys Generals have suggested fossil fuel companies may be legally liable for these climate damages. I think from today\'s meeting, I know what your position is and whether Exxon should be held legally liable for damages from climate change.\n\nSo my question is, what happens if the company loses in the coming court battles? And I anticipate there will be more. How is Exxon calculating the financial risks it would face if fossil fuel companies have to pay for damage from climate change, like the tobacco companies who, for many decades, said cigarettes were not an unhealthy thing to take part in. 
Thank you.\n\nREX TILLERSON: Speculating on future court events would be irresponsible on my part, and therefore those numbers will be unestimable. Over here.\n\nKATHY MULVEY, ACCOUNTABILITY CAMPAIGN MANAGER, UNION OF CONCERNED SCIENTISTS: Good morning. I\'m Kathy Mulvey with the Union of Concerned Scientists, and I\'m attending on the proxy of the Unitarian Universalist Association. And I have a question related to Item Nine on transparency and in climate science and policy.\n\nLast July, UCS published The Climate Deception Dossiers, evidence that ExxonMobil and other major fossil fuel companies have intentionally spread disinformation about climate science for decades. I have one copy here. So as state Attorneys General investigate whether ExxonMobil misled investors and the public about the realities and risks of climate change, people are increasingly making comparisons to the tobacco industry\'s misconduct and measures taken to address it. ExxonMobil now claims that quote, "We do not fund or support those who deny the reality of climate change."\n\nBut actions speak louder than words. ExxonMobil is represented on the board of the American Petroleum Institute, API, which continues to claim that the science of climate change is unsettled, while attempting to block efforts to limit carbon pollution. Exxon\'s Randy Randol contributed to a notorious 1998 road map plan to sow uncertainty about climate science among the media, the public and policymakers. Other shareholders have spoken about ALEC, which was one of the organizations that API identified to bankroll that campaign. Your company sponsored ALEC\'s 2015 annual meeting where attendees were told, "The biggest scam of the last 100 years is global warming."\n\nMr. Tillerson, more than 26,000 UCS supporters have sent messages to you demanding that ExxonMobil stop funding ALEC as your peers, BP and Shell, have done. You\'ve spoken this morning of commitment to integrity in everything we do. 
When will your company publicly condemn climate deception? And in response to the question from the Yale student, you seemed to imply perhaps not agreeing with ALEC. So what would a commitment to integrity mean in terms of separating the company from ALEC\'s climate disinformation?\n\nREX TILLERSON: The only way I know to respond is to tell you that we will never withdraw our support for people to express their free-speech opinions on any matter whatsoever. And the fact that people have different opinions on climate change, they have every right to their opinion. Whether we agree with it or not, I will support their right to say so.\n\nKATHY MULVEY: To follow up on that, it\'s quite clear from legal rulings that the First Amendment was never intended to protect fraud. And so clearly, there are consumer protection laws in the several federal states that are designed to present -- prevent commercial misinformation, so.\n\nREX TILLERSON: And we have responded to those allegations as well. They have largely been debunked. All of our publications are readily available, as you know, for examination. In fact, we donated all of those archives to the University of Texas at Austin. If we were wanting to hide something, we did a pretty poor job of it.\n\nLet\'s go to the lady in the back there. No, right here, just in the middle.\n\nCHARLOTTE RAWLS, SHAREHOLDER: I don\'t intend to speak and then I sit and write notes. My name is [Charlotte Rawls]. I come from Shoreacres, Texas, which is on Galveston Bay. I overlook the Baytown refinery. I watch and see what happens there daily because my window overlooks it. And what I see is a lot of smoke and a lot of flares. And then I cross the Fred Hartman Bridge and I go into Baytown, and I work with school children there. 
I work with school children who have much asthma -- and also in that area, heart disease and a lot of respiratory problems and cancer.\n\nI think it\'s no wonder that we have one of the premier cancer institutes in Houston, MD Anderson, because we do have so much cancer in our area. When it rains, we see much more and the smells -- and definitely, it smells like toxins to me, I don\'t know exactly what it is. I would like to see zero emissions. If we\'re not going to admit to climate change and admit that fossil fuels are causing -- are a part of climate change, so at least let\'s look at the health effects from the things that are coming out of the plants.\n\nI have experience in Baytown, but I also read about experiences about other people around the world. The other thing that happens with children is IQs are lowered when there\'s significant amounts of air pollution, which there are in the city of Baytown.\n\nI do appreciate your admitting that there is climate change. I\'ve been to several meetings. I followed ExxonMobil very closely because I\'m a third generation shareholder. My grandfather worked for the Humble Pipe Line in -- right after World War I, and we did not buy anything, but Humble. We didn\'t go to Shell, we didn\'t go to Mobil, we didn\'t go anywhere. We went to Humble. And it was not until the Exxon Valdez disaster that I started going to other places.\n\nI continue to have my stock and then inherited my mother\'s stock. So I would also like to say something on behalf of my mother, which she voted every year, her proxy. And she said we need more women represented on the Board of Directors. I see a couple. I need to see more.\n\nThe other thing I debate often, whether I need to divest of my Exxon stock. I\'m sorry, I still call it Exxon stock. ExxonMobil stock. It\'s been a dilemma for me for a long time because I keep thinking that I can influence change. I\'ve done that through the Waterkeeper Alliance. 
I\'ve done that through the Natural Resources Defense Council. I\'ve done it through Earthjustice.\n\nI try, I try, I try to make things better, but it\'s about time to divest. I do benefit greatly from the stock. However, when I look to see what taxpayers are going to be paying for, I\'m not sure it\'s that good of a deal. Health care, and I want to go back to ALEC because we have still --\n\nREX TILLERSON: Could you begin to wrap it up?\n\nCHARLOTTE RAWLS: I\'m sorry. I think it\'s more expensive to own my stock than to take my dividends.\n\nREX TILLERSON: Thank you. So over here to this side of the hall.\n\nROBERT FORE, REPRESENTATIVE, PRESBYTERIAN CHURCH U.S.A.\'S FOUNDATION: Thank you, Mr. Chairman. Ladies and gentlemen, members of the board and fellow shareholders. My name is Robert Fore. I\'m here representing the shares of the Presbyterian Church U.S.A.\'s Foundation. And as long-term shareholders, we are very concerned -- and well, very concerned about the long-term sustainability and importantly, profitability of our company.\n\nAnd I\'m going to talk a little bit about the Items 11 and 12 today and talk specifically about some of the peer groups that were mentioned, peer companies and their recognition of the 2-degree scenario. Specifically, I want to talk about one, which is Saudi Aramco. Saudi Arabia has made it clear that they are seeking to monetize their oil reserves now, regardless of the impact on price.\n\nMinister Al-Naimi said that he could live with $20 per barrel. The Deputy Crown Prince has said $30 per barrel or $70 per barrel, it\'s all the same to them. 
And he has furthermore announced plans, a plan called Vision 2030, in which he wants to wean his country off of oil by the year 2030 or sooner.\n\nGiven the global signals of the end of the oil era, from low commodity prices to recent statements and decisions from Saudi Aramco to start selling shares publicly to build up a $2 trillion sovereign wealth fund, we\'re concerned about some of the claims that the future of energy will look very much like the past.\n\nI think you\'ve talked a little bit today about 2040 scenarios. My question is, what do the Saudis -- or why do the Saudis see the future of oil -- future of energy so differently than the leadership of ExxonMobil? What do they know that we are not considering? Furthermore, what specifically are you doing to ensure the business model of ExxonMobil is nimble enough to withstand low carbon demand scenarios, including disruptions, be they technological, regulatory or market-based?\n\nREX TILLERSON: Well, as to Saudi Aramco\'s views or the Kingdom of Saudi Arabia, I -- you would have to ask them. I can\'t speak on their behalf. I think we showed you today a number of things that we\'re doing in terms of remaining flexible to alternative outcomes in the future.\n\nWe have, unlike many of our competitors, we have for many years included a price of carbon in our outlook. And that price of carbon gets put into all of our economic models when we make investment decisions as well.\n\nIt\'s a proxy. We don\'t know how else to model what future policy impacts might be. But whatever policies are, ultimately they come back to either your revenues or your cost. So we choose to put it in as a cost.\n\nSo we have accommodated that uncertainty in the future, and everything gets tested against it. As to other actions, it\'s the research areas you see in terms of our understanding of the issue. And we probably have been engaged in the scientific study of this longer than any of our peers for four decades now. 
And we continue to be engaged, so we are very, very aware and up-to-date on the current scientific understanding. And we fund a number of research areas, both academically and institutionally, that are areas where people are investigating possible breakthroughs, whether it\'d be in battery technologies or alternative fuel technologies or what\'s the next possible game-changing technology.\n\nWe do that so we\'re aware of whether that is something that has potential or not. And so we monitor all of that, and we invest and run our programs accordingly. And should something evolve, we have the capacity to become engaged in that if we see it is in the interest of our shareholders. So that\'s how we\'re responding.\n\nSo back over here.\n\nHUNTER MARTIN, SHAREHOLDER: Mr. Tillerson, my name is [Hunter Martin]. My wife and I have been driving up here from Houston for more than 25 years, going back to Larry Rawl\'s days for the meetings. And I think I bought my first shares of the company probably the year you were born. And I don\'t presume --\n\nREX TILLERSON: I wish my dad had bought some shares the year I was born.\n\nHUNTER MARTIN: I do not presume to forecast what\'s going to happen over the next 12 months. But [Lori] and I want to thank you and compliment you on your stewardship of our company.\n\nApplying whatever parameters one would choose, you personify all that is best in America and world industry, and the lives of all of us shareholders are more secure and in return, happier because of you.\n\nOur thanks embrace the employees as well as the board. At the top, of course, is you and your superlative guidance. Congratulations on your achievements, Mr. Tillerson, and thanks for your dedication to all of us.\n\nREX TILLERSON: Thank you for those kind words. Right here.\n\nANNA KOLINSKI: CEO Tillerson, it\'s good to have the opportunity to speak to you. So my name is [Anna Kolinski] and my grandfather, James F. Black, was a scientist for Exxon for over 40 years. 
He started with Standard Oil during World War II, later earned dozens of patents for Esso and later Exxon.\n\nIn 1977, he briefed the company\'s top executives on the scientific realities of climate change. He said that present thinking holds that man has a time window of 5 to 10 years before the need for hard decisions regarding changes in energy strategies might become critical. Like you acknowledge on your slides, this was over 30, almost 40 years ago. And since then, Exxon has continued to support groups that dispute the science that my grandfather warned your company about.\n\nMy question was going to be, will you withdraw funding from ALEC because they\'re known to spread misinformation about climate change, but it sounds like your answer on that is a resounding no. So since you won\'t be doing that, will Exxon be taking action to, at all, refute or counteract the negative impacts they\'re making by funding an organization that says that climate change is a scam?\n\nREX TILLERSON: We will continue to engage in the policy discussions, as we currently do, with a number of broad-based groups on all sides of these issues and we\'ll continue to be active in the discussions legislatively in Washington and elsewhere, including through the IPCC on what we think are thoughtful, sensible policy actions that accommodate both our need for economic growth as well as addressing these risks, which are going to be very, very daunting. Thank you.\n\n<Presentation>\n\nREX TILLERSON: I believe all the items of business have been covered now. So if any of you have proxy cards, please hand them to the ushers at this time. Those of you who have already returned your proxy cards need not vote by ballot unless you wish to change your votes. If you wish to change your vote, simply mark the appropriate sections of the ballot. The ballots will now be collected and turned over to the inspectors of election to be counted. 
If you wish your ballot to be kept secret, the ushers will provide you an envelope.\n\nThe appointed proxies in attendance today hereby cast all votes which we have been authorized to cast, in accordance with the instructions indicated on the individual proxy cards. So if you have proxies, please pass those to the ushers in the aisles at this time.\n\nSince proxies and ballots have been collected, I now declare the polls closed.\n\nQuestions and Answers\n\nREX TILLERSON: While the inspectors of election are preparing their preliminary report, I\'m willing to answer a few more questions regarding ExxonMobil\'s business, maybe one or two because time is running late on us. But I\'ll interrupt at the point that the election results are available. So are there any other questions that people would like to put to me at this time? Let me go over here to someone that\'s not spoken.\n\nUNIDENTIFIED AUDIENCE MEMBER: Good morning, Mr. Tillerson, directors and fellow stockholders and employees and retirees of the corporation. For many years -- I\'m a retiree of the corporation now. But for many years, I was on boards of health care and safety for the company. For years, we were told that in health care area, we have to have a big pool for synergies and for economies of scale. But today, I understand that the pool has been split between employees\' health care and retiree annuitant\' health care.\n\nMy question to you, Mr. Tillerson, is one is why was this done because we no longer reach these economies of scale if we split the health care pool? And two is any forward comments you can make about what the future might hold for us annuitants as far as the Medicare supplement plan or any health care that the company still helps us afford? Thank you.\n\nREX TILLERSON: Well, I\'m not sure I would characterize it as a splitting of the pools. We have -- obviously, we track claims and premiums in both of those pools. 
And so the changes you\'ve seen are merely reflective of that experience.\n\nFor many, many years, the retiree pool has been subsidized by the employee pool, and it still is today. And it\'s just a question of how much of that burden -- as we continue to bring our manning levels down within the corporation, from roughly 115,000 at the time of the ExxonMobil merger to 73,000 today. We\'ve got a look at the cost of the plan to everyone.\n\nI do want to acknowledge that all of our plan participants, retirees included, have really done a very, very good job managing their health issues, which has allowed us to keep our growth in health care costs down. And so we appreciate that.\n\nBut the changes you\'re seeing are really just reflective of having to bring the things back into balance. I encourage everyone to continue to take advantage of the various programs that we do provide through the company to help manage your health care. That helps us keep the cost of the program down. As you know, it\'s a self-funded program.\n\nHave you got a comment down here or a question?\n\nJULIAN MARTINEZ, REPRESENTATIVE, SER-JOBS FOR PROGRESS: Thank you, Mr. Chairman. My name\'s Julian Martinez, and I represent SER Jobs for Progress National. SER is a national, nonprofit, community-based organization serving more than 1.3 million people a year by assisting them with their employment and educational needs.\n\nSER National would like to thank ExxonMobil for working with us and Hispanic Heritage foundation in the past. Your efforts in sponsoring STEM programs is exemplary, and we appreciate those efforts. However, an area that we strongly believe needs improvement is your governance.\n\nExxonMobil has no Hispanics on their board nor on their senior management team. In order to reflect the diverse nature of the American marketplace, you need to consider making some changes.\n\nThe American dream is alive and well. 
Hispanics born in this country are more highly educated and earn higher incomes than their immigrant parents. Hispanics outpace all other Americas in forming their own businesses. They will represent fully one-third of the U.S. population in just 45 years and be an economic force of great consequence. Latino-owned businesses increased 46.9% from 2007 to 2012.\n\nIn 2012, 3.3 million Latino businesses had receipts totaling over $517 billion, making it 12% of our total business. Female Latino business owners were 44.4% of Latino firms as compared to 35.3% for non-Latino firms. Hispanics are the youngest racial or ethnic group in the United States. About one-third or 17.9 million of the Latino population is younger than 18, about a quarter or 14.6 million are millennials. Exxon needs to consider who both their workforce and the customers will be and embody governance and leadership to reflect that market.\n\nMy question, totally unrelated is, what do you think the price of oil is going to do between one and five years?\n\nREX TILLERSON: Well, as many of have heard me say before, if I knew that, I wouldn\'t be here. I\'d be on the beach with my laptop, making an oil bet every morning. So I don\'t know is the answer to the question.\n\n<Presentation>\n\nREX TILLERSON: The inspectors of election are ready to report the preliminary vote. So may we have your report, please?\n\nUNIDENTIFIED COMPANY REPRESENTATIVE: Mr. Chairman, at least 3.5 billion shares of stock of the corporation have been voted on the 14 items of business discussed at today\'s meeting. Voting results are expressed as a percentage of total votes cast. 
According to New Jersey corporate law, abstentions are not votes cast.\n\nSubject to the final tabulation of votes, which should not materially change the results, we report that on average, 95.9% of the votes cast were voted to elect as directors the 14 nominees listed in the proxy statement.\n\nOn the resolution concerning the ratification of independent auditors, approximately 98.9% of the shares voting thereon were voted for, and 1.1% were voted against.\n\nOn the resolution concerning an advisory vote to approve executive compensation, approximately 89.3% of the shares voting thereon were voted for and 10.7% were voted against.\n\nOn the resolution concerning an independent chairman, approximately 38.8% of the shares voting thereon were voted for, and 61.2% were voted against.\n\nOn the resolution concerning a climate expert onboard, approximately 20.9% of the shares voting thereon were voted for, and 79.1% were voted against.\n\nOn the resolution concerning hire an investment bank, approximately 2% of the shares voting thereon were voted for, and 98% were voted against.\n\nOn the resolution concerning a proxy access bylaw, approximately 61.9% of the shares voting thereon were voted for, and 38.1% were voted against.\n\nOn the resolution concerning a report on compensation for women, approximately 8.5% of the shares voting thereon were voted for, and 91.5% were voted against.\n\nOn the resolution concerning a report on lobbying, approximately 25.8% of the shares voting thereon were voted for, and 74.2% were voted against.\n\nOn the resolution concerning increased capital distributions, approximately 4.1% of the shares voting thereon were voted for, and 95.9% were voted against.\n\nOn the resolution concerning a policy to limit global warming to 2 degrees C, approximately 18.5% of the shares voting thereon were voted for and 81.5% were voted against.\n\nOn the resolution concerning a report on impacts of climate change policies, approximately 38.2% of the 
shares voting thereon were voted for, and 61.8% were voted against.\n\nOn the resolution concerning report reserves replacements in BTUs, approximately 5.5% of the shares voting thereon were voted for, and 94.5% were voted against.\n\nOn the resolution concerning a report on hydraulic fracturing, approximately 24.5% of the shares voting thereon were voted for, and 75.5% were voted against.\n\nOur written reports will be submitted to the secretary as soon as they are completed.\n\nREX TILLERSON: Thank you. As stated in the written report of inspectors of election on each of these -- a written report of the inspectors of election on each of these matters will be delivered to the secretary, and they will be filed with the SEC on a Form 8-K.\n\nIn concluding the meeting, let me thank all of you once again for attending. We appreciate you taking the time and the effort to be here. We know some of you travel a long way to be with us today, and we always enjoy seeing you, and we appreciate that you take that time. We take all of your comments seriously. It is useful to us to hear from you. And even where we disagree, I know our disagreements are both sincere on the issues as we all see them. I think we all strive for the same goals, and that\'s to provide a great quality of life for our -- not just ourselves and our children, but people the world over. And that\'s what the men and women of ExxonMobil are dedicated to.\n\nSo it\'s great to see the shareholders that are here every year, great to see some of our [annuitants] here. Appreciate you all being here. Safe travels.\n\n[Thomson Financial reserves the right to make changes to documents, content, or other information on this web site without obligation to notify any person of such changes.\n\nIn the conference calls upon which Event Transcripts are based, companies may make projections or other forward-looking statements regarding a variety of items. 
Such forward-looking statements are based upon current expectations and involve risks and uncertainties. Actual results may differ materially from those stated in any forward-looking statement based on a number of important factors and risks, which are more specifically identified in the companies\' most recent SEC filings. Although the companies may indicate and believe that the assumptions underlying the forward-looking statements are reasonable, any of the assumptions could prove inaccurate or incorrect and, therefore, there can be no assurance that the results contemplated in the forward-looking statements will be realized.\n\nTHE INFORMATION CONTAINED IN EVENT TRANSCRIPTS IS A TEXTUAL REPRESENTATION OF THE APPLICABLE COMPANY\'S CONFERENCE CALL AND WHILE EFFORTS ARE MADE TO PROVIDE AN ACCURATE TRANSCRIPTION, THERE MAY BE MATERIAL ERRORS, OMISSIONS, OR INACCURACIES IN THE REPORTING OF THE SUBSTANCE OF THE CONFERENCE CALLS. IN NO WAY DOES THOMSON FINANCIAL OR THE APPLICABLE COMPANY OR THE APPLICABLE COMPANY ASSUME ANY RESPONSIBILITY FOR ANY INVESTMENT OR OTHER DECISIONS MADE BASED UPON THE INFORMATION PROVIDED ON THIS WEB SITE OR IN ANY EVENT TRANSCRIPT. USERS ARE ADVISED TO REVIEW THE APPLICABLE COMPANY\'S CONFERENCE CALL ITSELF AND THE APPLICABLE COMPANY\'S SEC FILINGS BEFORE MAKING ANY INVESTMENT OR OTHER DECISIONS.]'
\end{verbatim}

This call took place on May 25th, 2016. The transcript is over 125,000
words, nearly as long as the third Lord of the Rings book. It would be a
pain to read all of it, so we'll use python to extract insights.
Currently, the contents of \texttt{call{[}"Text"{]}} is a
\href{https://docs.python.org/3/library/stdtypes.html\#text-sequence-type-str}{``string''}--
a sequence of characters. We can do a number of things with strings,
including splitting a big string into smaller strings using a specific
delimiter and the \texttt{.split()} function. For example, I can break
down the whole text of the earnings call roughly into sentences by
splitting the string every time I encounter a period (``.''). This
returns a list of smaller strings, and if I select the first one using
\texttt{{[}0{]}}, I get the first sentence of this call:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{call[}\StringTok{\textquotesingle{}Text\textquotesingle{}}\NormalTok{].split(}\StringTok{\textquotesingle{}.\textquotesingle{}}\NormalTok{)[}\DecValTok{0}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
"I'm Rex Tillerson, I'm the Chairman and Chief Executive Officer of the Exxon Mobil Corporation"
\end{verbatim}

Lovely! The first sentence is an introduction by then-CEO
\href{https://en.wikipedia.org/wiki/Rex_Tillerson}{Rex Tillerson}.

He was CEO of Exxon from 2006 until he retired on January 1st 2017. One
month later, he was sworn in as U.S. Secretary of State under Donald
Trump. Let's see what Rex thinks about climate change!

\hypertarget{regular-expressions-regex}{%
\section{Regular Expressions (Regex)}\label{regular-expressions-regex}}

Another thing we can do with strings in python is search them using
regular expressions. A regular expression is a sequence of characters
that specifies a search pattern in text. You can play around building
some regex queries using this \href{https://regexr.com/}{tool}.

You can think about this as Ctrl+F on steroids. In its simplest form, we
can use regex to search for a character, word, or phrase in a bunch of
text. For example, we can use regular expressions to count how many
times ``climate change'' is mentioned in this earnings call using the
\texttt{re.findall()} function:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# import the regular expressions library }
\ImportTok{import}\NormalTok{ re}

\CommentTok{\# use the findall function to search for mentions of "climate change" in the text of our call}
\NormalTok{climate\_change }\OperatorTok{=}\NormalTok{ re.findall(}\VerbatimStringTok{r\textquotesingle{}climate change\textquotesingle{}}\NormalTok{, call[}\StringTok{\textquotesingle{}Text\textquotesingle{}}\NormalTok{], re.IGNORECASE)}

\CommentTok{\# this returns a list of strings matching our search term. }
\CommentTok{\# the length of the list gives us the number of occurrences}
\BuiltInTok{len}\NormalTok{(climate\_change)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
51
\end{verbatim}

Looks like climate change is mentioned 51 times in this earnings call.

\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}

\hypertarget{exercise-8}{%
\subsection{Exercise}\label{exercise-8}}

How many times is the phrase ``global warming'' mentioned?

\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}

But we have 182 earnings calls in this sample-- suppose we want to count
the number of times climate change is mentioned in each one, so we can
see the salience of this topic over time.

\hypertarget{applying-a-lambda-function-to-a-dataframe}{%
\subsection{Applying a lambda function to a
dataframe}\label{applying-a-lambda-function-to-a-dataframe}}

Because each row of our dataframe \texttt{df} is an earnings call (the
text of which is contained in
\texttt{df{[}\textquotesingle{}Text\textquotesingle{}{]}}), we want to
apply the analysis we did for the single earnings call above to each row
of \texttt{df}.

We can accomplish this using a
\href{https://www.w3schools.com/python/python_lambda.asp}{\textbf{lambda
function}}. This allows us to iterate over each value in a dataframe
column, and apply a function to it. In the simple example below, I use a
lambda function to create a new column that takes the values from a
different column and multiplies them by 2:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# create a dataframe called "example" with one column called "numbers" which contains numbers 0{-}5}
\NormalTok{example}\OperatorTok{=}\NormalTok{ pd.DataFrame(\{}\StringTok{\textquotesingle{}numbers\textquotesingle{}}\NormalTok{:[}\DecValTok{0}\NormalTok{,}\DecValTok{1}\NormalTok{,}\DecValTok{2}\NormalTok{,}\DecValTok{3}\NormalTok{,}\DecValTok{4}\NormalTok{,}\DecValTok{5}\NormalTok{]\})}

\CommentTok{\# print the dataframe }
\BuiltInTok{print}\NormalTok{(}\StringTok{"}\CharTok{\textbackslash{}n}\StringTok{ Before applying lambda function: }\CharTok{\textbackslash{}n}\StringTok{"}\NormalTok{, example)}

\CommentTok{\# create a new column called "doubled numbers"}
\CommentTok{\# apply a lambda function that iterates over each row in the "numbers" column}
\CommentTok{\# call each row "x", and multiply it by 2}
\NormalTok{example[}\StringTok{\textquotesingle{}doubled numbers\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{ example[}\StringTok{\textquotesingle{}numbers\textquotesingle{}}\NormalTok{].}\BuiltInTok{apply}\NormalTok{(}\KeywordTok{lambda}\NormalTok{ x: x}\OperatorTok{*}\DecValTok{2}\NormalTok{)}

\CommentTok{\# print the dataframe, which now contains the new column}
\BuiltInTok{print}\NormalTok{(}\StringTok{"}\CharTok{\textbackslash{}n}\StringTok{ }\CharTok{\textbackslash{}n}\StringTok{ After applying lambda function: }\CharTok{\textbackslash{}n}\StringTok{"}\NormalTok{, example)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}

 Before applying lambda function: 
    numbers
0        0
1        1
2        2
3        3
4        4
5        5

 
 After applying lambda function: 
    numbers  doubled numbers
0        0                0
1        1                2
2        2                4
3        3                6
4        4                8
5        5               10
\end{verbatim}

There are simpler ways of doing this (namely,
\texttt{example{[}\textquotesingle{}doubled\ numbers\textquotesingle{}{]}=example{[}\textquotesingle{}numbers\textquotesingle{}{]}*2}).
But if we want to do something more complex, lambda functions are very
useful. Remember, we used
\texttt{re.findall(r\textquotesingle{}climate\ change\textquotesingle{},\ call{[}\textquotesingle{}Text\textquotesingle{}{]},\ re.IGNORECASE)}
to get a list of mentions of climate change in the text of one earnings
call, and measured the length of the list using \texttt{len()} to count
the number of mentions. We can turn this into a lambda function as
follows:

\texttt{df{[}\textquotesingle{}Text\textquotesingle{}{]}.apply(lambda\ x:\ len(re.findall(r\textquotesingle{}climate\ change\textquotesingle{},\ x,\ re.IGNORECASE)))}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \texttt{df{[}\textquotesingle{}Text\textquotesingle{}{]}}: the column
  we want to iterate over.
\item
  \texttt{.apply(lambda\ x:}: iterate over each row in the column, and
  call each value in that column x. In other words, x will represent the
  text of each earnings call.
\item
  \texttt{len(re.findall(r\textquotesingle{}climate\ change\textquotesingle{},\ x,\ re.IGNORECASE))}:
  this is exactly the same as what we did previously to find the number of
  mentions of climate change in the one earnings call, except that we
  swapped \texttt{call{[}\textquotesingle{}Text\textquotesingle{}{]}}
  with \texttt{x}, since we want to do this for the text of every
  earnings call.
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# create a column called "climate change" that contains the count of mentions of this keyword}
\NormalTok{df[}\StringTok{\textquotesingle{}climate change\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{df[}\StringTok{\textquotesingle{}Text\textquotesingle{}}\NormalTok{].}\BuiltInTok{apply}\NormalTok{(}\KeywordTok{lambda}\NormalTok{ x: }\BuiltInTok{len}\NormalTok{(re.findall(}\VerbatimStringTok{r\textquotesingle{}climate change\textquotesingle{}}\NormalTok{, x, re.IGNORECASE)))}

\CommentTok{\# print the title of each earnings call, along with the number of mentions of climate change.}
\BuiltInTok{print}\NormalTok{(df[[}\StringTok{\textquotesingle{}Title\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}climate change\textquotesingle{}}\NormalTok{]])}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
                                                 Title  climate change
0    Exxon  Mobil Corp at Barclays CEO EnergyPower ...               2
1     Q2 2019  Exxon  Mobil Corp Earnings Call - Final               0
2    Event Brief of Q2 2019  Exxon  Mobil Corp Earn...               0
3    Exxon  Mobil Corp at JPMorgan Energy Conferenc...               0
4    Exxon  Mobil Corp Annual Shareholders Meeting ...              25
..                                                 ...             ...
177  Event Brief of Q3 2002  Exxon  Mobil Corporati...               0
178  Q3 2002  Exxon  Mobil Corporation Earnings Con...               0
179  Q2 2002  Exxon  Mobil Corporation Earnings Con...               0
180  Abstract of Q2 2002  Exxon  Mobil Corporation ...               0
181  Exxon  Mobil Corporation First Quarter 2002 Re...               0

[182 rows x 2 columns]
\end{verbatim}

Amazing! We've now got a column indicating how many times ``climate
change'' was mentioned in each earnings call.

\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}

\hypertarget{exercise-9}{%
\subsection{Exercise}\label{exercise-9}}

Create three new columns that count the frequency of the terms ``global
warming'', ``carbon capture'', and another phrase or word of your
choosing. When you've done this, edit the code below so that it not only
shows the frequency of ``climate change'' mentions, but also the three
additional columns you created.

Advanced: Our current measure of the number of mentions of keywords
might be biased: if one earnings call mentions climate change 10 times
more than another, but that earnings call has 10 times more words, then
the \emph{rate} of keyword mentions hasn't actually increased; people
are just talking more. You can get the word count of each earnings call
in the lambda function above using \texttt{len(x.split())} (using
\texttt{len()} directly on a string counts characters, not words, so we
split it into a list of words first). Edit the lambda function above
such that we don't get a \emph{count} of the number of mentions of
climate change, but the \emph{rate} of mentions (i.e., count of
``climate change'' divided by total word count per call).

\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}

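Let's plot the frequency of these mentions over time to analyze temporal
trends in the salience of climate change and other keywords in these
calls. We'll accomplish this by extracting the year of each call,
summing the mentions within each year, and plotting the yearly totals: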

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# extract the year from the date column }
\NormalTok{df[}\StringTok{\textquotesingle{}Year\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{df[}\StringTok{\textquotesingle{}Date\textquotesingle{}}\NormalTok{].dt.year}

\CommentTok{\# group the dataframe by year, calculating the sum of the "climate change" column}
\CommentTok{\# save it as a new dataframe called "yearly"}
\NormalTok{yearly}\OperatorTok{=}\NormalTok{df.groupby(}\StringTok{\textquotesingle{}Year\textquotesingle{}}\NormalTok{)[}\StringTok{\textquotesingle{}climate change\textquotesingle{}}\NormalTok{].}\BuiltInTok{sum}\NormalTok{()}

\CommentTok{\# plot yearly}
\NormalTok{yearly.plot()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
<Axes: xlabel='Year'>
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W04. Natural Language Processing_files/figure-pdf/cell-14-output-2.png}

}

\end{figure}

Do you notice any patterns in the salience of these topics over time?

\hypertarget{intermediate-regex}{%
\section{Intermediate Regex}\label{intermediate-regex}}

Great. We can see how frequently climate-related keywords come up in
earnings calls between shareholders and Exxon Mobil representatives over
time. But what if we want to look at what they're actually saying?

We can get a bit fancier with Regex to look at the content of these
discussions. Regex can be pretty confusing, but it's also a very
powerful tool. Before moving on, let's familiarize ourselves a bit more
with regex.

Let's try to extract all \emph{sentences} containing the phrase
``climate change''; the regex would look like this:

\texttt{({[}\^{}.{]}*climate\ change{[}\^{}.{]}*)}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \texttt{()} indicates that we want to match a group of characters, not
  just the characters themselves. In this case, the group is a sentence,
  not just the phrase ``climate change''. But how do we define a sentence?
\item
  \texttt{{[}\^{}.{]}*} we want to match all characters except periods.
  This will break the text up into sentences
\item
  \texttt{climate\ change} the phrase we want our sentence to contain.
\end{enumerate}

When you put it all together, the regex will search for groups of
characters (1.) bounded by periods (2.) that contain the phrase
``climate change'' (3.).

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# create a list called "climate\_sentences" that contains the results of this query}
\NormalTok{climate\_sentences}\OperatorTok{=}\NormalTok{re.findall(}\VerbatimStringTok{r"([\^{}.]*climate change[\^{}.]*)"}\NormalTok{,}\StringTok{" "}\NormalTok{.join(df[}\StringTok{\textquotesingle{}Text\textquotesingle{}}\NormalTok{]))}

\BuiltInTok{print}\NormalTok{(}\BuiltInTok{len}\NormalTok{(climate\_sentences))}
\CommentTok{\# print the first 10 sentences in the list}
\ControlFlowTok{for}\NormalTok{ sentence }\KeywordTok{in}\NormalTok{ climate\_sentences[:}\DecValTok{10}\NormalTok{]:}
  \BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}}\CharTok{\textbackslash{}n}\StringTok{\textquotesingle{}}\NormalTok{, sentence)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
251

  However, as the world looks to lower their carbon emissions and respond to the risk of climate change, there is a desire to better understand how robust our plans are to evolving policies and changing market trends

  Meeting the growing need for energy and addressing the risk of climate change are not mutually exclusive

  Over the past year, I've met with policymakers from both sides of the aisle: NGOs, academia, and participated in a climate change dialogue at the Vatican

 
Our approach to climate change has 4 components

  We don't believe that society has to choose between economic prosperity and reducing the risk of climate change

 
Recent steps the company has made in the last month to start to make arrangements for dialogue with the Climate Action 100+ group at independent director level are welcome, but the fact that it has taken so long to get to this point reflects how painfully slow progress has been to date with Exxon on climate change

  I would tell you that we understand the concerns and share your desire to meaningfully address climate change, and I think ExxonMobil plays a pretty important role in that both today and in the future

  The Board's been engaged, and I would tell you, too, that we've had many, many discussions not only with your organization, but with many groups outside who have this concern, and I think are making very good progress in addressing some of the fundamental challenges associated with the risk of climate change

 
Members of the Board, this week's Economist describes ExxonMobil as a notable laggard on climate change

 
The next shareholder proposal calls for a specific Board climate change committee, and I understand that Natasha Lamb will present this proposal
\end{verbatim}

\hypertarget{semantic-analysis}{%
\subsection{Semantic Analysis}\label{semantic-analysis}}

Now we can see the \emph{sentences} which mention climate change, which
helps us understand a bit about the context. We can perform semantic
analysis on some of these sentences to take a closer look at their
grammar; I've isolated the 9th sentence and produced
a dependency tree, like the ones we've seen in class.

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{from}\NormalTok{ spacy }\ImportTok{import}\NormalTok{ displacy}

\CommentTok{\#run the NLP pipeline on the 9th sentence from our list of sentences about climate change.}
\NormalTok{doc }\OperatorTok{=}\NormalTok{ nlp(climate\_sentences[}\DecValTok{8}\NormalTok{].lstrip())}

\CommentTok{\#dataframe to store the results}
\NormalTok{results}\OperatorTok{=}\NormalTok{pd.DataFrame()}

\ControlFlowTok{for}\NormalTok{ token }\KeywordTok{in}\NormalTok{ doc:}
    \CommentTok{\# create a row for each token in the sentence, containing the text, lemma, Part Of Speech, and dependency relation}
\NormalTok{    row}\OperatorTok{=}\NormalTok{pd.DataFrame(\{}\StringTok{\textquotesingle{}text\textquotesingle{}}\NormalTok{:[token.text], }\StringTok{\textquotesingle{}lemma\textquotesingle{}}\NormalTok{:[token.lemma\_], }\StringTok{\textquotesingle{}pos\textquotesingle{}}\NormalTok{:[token.pos\_], }\StringTok{\textquotesingle{}dep\textquotesingle{}}\NormalTok{:[token.dep\_]\})}
    \CommentTok{\# append the row to the results dataframe}
\NormalTok{    results}\OperatorTok{=}\NormalTok{pd.concat([results, row])}

\CommentTok{\# print the results}
\NormalTok{results}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}lllll@{}}
\toprule\noalign{}
& text & lemma & pos & dep \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & Members & member & NOUN & nsubj \\
0 & of & of & ADP & prep \\
0 & the & the & DET & det \\
0 & Board & Board & PROPN & pobj \\
0 & , & , & PUNCT & punct \\
0 & this & this & DET & det \\
0 & week & week & NOUN & poss \\
0 & \textquotesingle s & \textquotesingle s & PART & case \\
0 & Economist & economist & NOUN & nsubj \\
0 & describes & describe & VERB & ROOT \\
0 & ExxonMobil & ExxonMobil & PROPN & dobj \\
0 & as & as & ADP & prep \\
0 & a & a & DET & det \\
0 & notable & notable & ADJ & amod \\
0 & laggard & laggard & NOUN & pobj \\
0 & on & on & ADP & prep \\
0 & climate & climate & NOUN & compound \\
0 & change & change & NOUN & pobj \\
\end{longtable}

\begin{Shaded}
\begin{Highlighting}[]

\CommentTok{\#print out the dependency tree}
\NormalTok{displacy.render(doc, jupyter}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
<IPython.core.display.HTML object>
\end{verbatim}

The root of this dependency tree is the verb ``describes''. The main
subject is the Economist, and the object is Exxon. But we're still
missing one vital piece of information: who is speaking? It makes a big
difference to our understanding of what's going on. Are mentions of
climate change increasing over time because shareholders are asking more
questions? Or did CEO Rex Tillerson have a spiritual awakening in which
all he wants to do is talk about climate change? For that, we need to
figure out who's talking, and restructure our dataframe.

\hypertarget{advanced-regex}{%
\section{Advanced Regex}\label{advanced-regex}}

The earnings call transcript is structured in such a way that it should
be possible to separate speakers based on regular expressions. Every
time a new person is speaking, they are introduced in the transcript in
a new paragraph. Consider the excerpt below:

\begin{verbatim}
OPERATOR: Our next question comes from Philip Weiss with Argus Research.

PHILIP WEISS, ANALYST, ARGUS RESEARCH COMPANY: Good morning. I did have one, most of my questions have been answered, but I do have one follow-up on the US. You said that the rig count that's being used for liquids-rich is rising but when I look at production, natural gas as a percentage of your total production has grown, and liquids has actually fallen a little bit. So, I wonder if you can just comment on when we might start to see that trend change?

DAVID ROSENTHAL: Sure. The fall off in the liquids is really just the overall decline in the conventional, as well as some divestments. You'll recall we had a divestment in the Eastern Gulf of Mexico and that had an impact on us year-over-year in particularly in the second half.
In terms of when we'll see significant production growth out of the unconventional, I mentioned some of the increases in percentages, although we haven't given all of the specific production volumes, but we'll do that as we progress.
\end{verbatim}

Now, we can't simply split by new line (\texttt{\textbackslash{}n});
David Rosenthal has two paragraphs. We also can't just split using
\texttt{:}, since this may appear in the text other than to indicate
speakers. Let's describe the features of the characters we're looking to
split out:

\texttt{({[}A-Z{]}+.+{[}A-Z{]}+:)}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  It's a group of characters --- regex: \texttt{()}
\item
  The words are all caps, and can contain any characters --- regex:
  \texttt{({[}A-Z{]})}
\item
  There can be multiple words, and they can be separated by anything ---
  regex: \texttt{({[}A-Z{]}+.+{[}A-Z{]})}
\item
  The sequence always ends in a colon --- regex:
  \texttt{({[}A-Z{]}+.+{[}A-Z{]}+:)}
\end{enumerate}

Let's use this regex in \texttt{re.findall()} to get a list of the
speakers on this call:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# create a list of all the speakers by searching text of the earnings call for the above regex. }
\NormalTok{speakers }\OperatorTok{=}\NormalTok{ re.findall(}\VerbatimStringTok{r\textquotesingle{}([A{-}Z]+.+[A{-}Z]+: )\textquotesingle{}}\NormalTok{, call[}\StringTok{\textquotesingle{}Text\textquotesingle{}}\NormalTok{])}

\CommentTok{\# because they don\textquotesingle{}t introduce the speaker in the opening statement, insert a placeholder at the beginning of this list.}
\NormalTok{speakers.insert(}\DecValTok{0}\NormalTok{,}\StringTok{\textquotesingle{}INTRODUCTION\textquotesingle{}}\NormalTok{)}

\CommentTok{\# using set(list) will give you the unique values in a list}
\CommentTok{\# the length of set(list) gives us the number of unique speakers }
\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}There are\textquotesingle{}}\NormalTok{, }\BuiltInTok{len}\NormalTok{(}\BuiltInTok{set}\NormalTok{(speakers)),}\StringTok{\textquotesingle{}speakers on this call:\textquotesingle{}}\NormalTok{)}

\CommentTok{\# let\textquotesingle{}s print out the first 10 speakers: }
\ControlFlowTok{for}\NormalTok{ speaker }\KeywordTok{in}\NormalTok{ speakers[:}\DecValTok{10}\NormalTok{]:}
  \BuiltInTok{print}\NormalTok{(speaker)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
There are 31 speakers on this call:
INTRODUCTION
JEFF WOODBURY, VP OF INVESTOR RELATIONS, CORPORATE SECRETARY, EXXONMOBIL CORPORATION: 
REX TILLERSON: 
REX TILLERSON: 
REX TILLERSON: 
BETH RICHTMAN, INVESTMENT MANAGER, CALPERS: 
REX TILLERSON: 
MICHAEL CROSBY, CAPUCHIN FRANCISCAN FRIAR: 
REX TILLERSON: 
TRACEY REMBERT, SHAREHOLDER, CHRISTIAN BROTHERS INVESTMENT SERVICES: 
\end{verbatim}

We want to do more than just identify the speakers though; we want to
break up the text of our earnings call into chunks of speech and
associate each chunk of speech with its speaker. We can split a string
on a regular expression using
\texttt{re.split(\textless{}regex\textgreater{},\textless{}text\textgreater{})}.
This takes one block of text, splits it into chunks using the regex, and
returns a list of chunks:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# split the text of the earnings call using our regex, save the list as "speech"}
\NormalTok{speech}\OperatorTok{=}\NormalTok{re.split(}\VerbatimStringTok{r\textquotesingle{}[A{-}Z]+.+[A{-}Z]+: \textquotesingle{}}\NormalTok{, call[}\StringTok{\textquotesingle{}Text\textquotesingle{}}\NormalTok{])}

\CommentTok{\# now, we can print the fourth speaker:}
\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}Speaker: }\CharTok{\textbackslash{}n}\StringTok{\textquotesingle{}}\NormalTok{, speakers[}\DecValTok{3}\NormalTok{])}
 
\CommentTok{\# and the text of the fourth speech:  }
\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}}\CharTok{\textbackslash{}n}\StringTok{ Speech: }\CharTok{\textbackslash{}n}\StringTok{\textquotesingle{}}\NormalTok{, speech[}\DecValTok{3}\NormalTok{])}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Speaker: 
 REX TILLERSON: 

 Speech: 
 So now, turning to the formal business of the meeting and a few brief remarks on shareholder proposals and voting. Each year, the corporation receives a number of suggestions from shareholders. Some of these are in the form of proposals to be presented at the Annual Meeting and each is given careful consideration.

We seek dialogues with the sponsors prior to the meeting when there is more time to better understand each other's positions and we often find agreement. Let me be clear on the conduct of the meeting. Recognizing that the majority of our shareholders have voted by proxy and are not present, we have established procedures to facilitate an orderly meeting.

We've set up a process for speakers to identify themselves and to express their views and I assure you, we welcome those views. In order that as many shareholders as possible can participate, we have set time limits and a system of reminders to help you manage your time.

We have 14 items to consider. As Secretary Woodberry said earlier, discussion on all items of business will be deferred to the discussion period. This may enable us to have some time for general comments and questions as well and conclude the meeting in a reasonable time frame.

For those of you who may wish to leave the meeting at any time, let me express my appreciation for your attendance. Since we have a number of items yet to discuss on the program and you've been sitting for a while, I would invite you to stand and take a short stretch break and I would ask that you not leave the hall. We'll resume in just a moment.

(Break)

\end{verbatim}

Now we've associated each chunk of speech with its speaker---amazing! Let's
create a new dataframe that reflects this structure. Currently, our
dataframe \texttt{call} has one row. Let's use the two lists we just
created, \texttt{speakers} and \texttt{speech}, to create a dataframe in
which each row is one chunk of speech. A column called ``speaker'' will
indicate who is speaking, and a column called ``speech'' will contain
the text of the speech:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# create the new dataframe, from the two lists, and name it "speaker\_df"}
\NormalTok{speaker\_df}\OperatorTok{=}\NormalTok{pd.DataFrame(\{}\StringTok{"speaker"}\NormalTok{:speakers,}\StringTok{"speech"}\NormalTok{:speech\})}

\CommentTok{\# clean up the "speaker" column by removing the colons using .str.replace(":","")}
\CommentTok{\# remove trailing white space using str.rstrip()}
\NormalTok{speaker\_df[}\StringTok{\textquotesingle{}speaker\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{speaker\_df[}\StringTok{\textquotesingle{}speaker\textquotesingle{}}\NormalTok{].}\BuiltInTok{str}\NormalTok{.replace(}\StringTok{\textquotesingle{}:\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{).}\BuiltInTok{str}\NormalTok{.rstrip()}

\CommentTok{\# print rows in which rex tillerson is speaking:}
\BuiltInTok{print}\NormalTok{(speaker\_df[speaker\_df[}\StringTok{\textquotesingle{}speaker\textquotesingle{}}\NormalTok{]}\OperatorTok{==}\StringTok{"REX TILLERSON"}\NormalTok{])}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
          speaker                                             speech
2   REX TILLERSON  Thank you, Jeff. We will address our items of ...
3   REX TILLERSON  So now, turning to the formal business of the ...
4   REX TILLERSON  If you'd please take your seats. The first ite...
6   REX TILLERSON  Thank you. The Board recommends a vote against...
8   REX TILLERSON  Thank you, Fr. Crosby. The Board recommends a ...
10  REX TILLERSON  Thank you, Ms. Rembert. The Board recommends a...
12  REX TILLERSON  Thank you, Mr. Garland. The Board recommends a...
14  REX TILLERSON  Thank you, Mr. Sifferman. The Board recommends...
16  REX TILLERSON  Thank you, Mr. Jenkins. The Board recommends a...
18  REX TILLERSON  Thank you, Ms. Lamb. The board recommends a vo...
20  REX TILLERSON                           I'm well. Thank you.\n\n
22  REX TILLERSON  Thanks, Sister Pat. The board recommends a vot...
24  REX TILLERSON  Thank you, Mr. Mason. The board recommends a v...
26  REX TILLERSON  Thank you, Ms. Fugere. The board recommends a ...
28  REX TILLERSON  Thank you, Ms. Fugere. The board recommends a ...
29  REX TILLERSON  Okay, let's resume the meeting, so if you woul...
30  REX TILLERSON                             There in the back.\n\n
32  REX TILLERSON  Thank you. Other speakers? All right, down her...
34  REX TILLERSON  Well, ALEC, as you probably know, is an organi...
36  REX TILLERSON  Well, as you and I have spoken before, my view...
38  REX TILLERSON  I understand, and so I'm going to respond to a...
40  REX TILLERSON           Thank you. In the inside aisle here.\n\n
42  REX TILLERSON  Speculating on future court events would be ir...
44  REX TILLERSON  The only way I know to respond is to tell you ...
46  REX TILLERSON  And we have responded to those allegations as ...
48  REX TILLERSON                 Could you begin to wrap it up?\n\n
50  REX TILLERSON  Thank you. So over here to this side of the ha...
52  REX TILLERSON  Well, as to Saudi Aramco's views or the Kingdo...
54  REX TILLERSON  I wish my dad had bought some shares the year ...
56  REX TILLERSON    Thank you for those kind words. Right here.\n\n
58  REX TILLERSON  We will continue to engage in the policy discu...
59  REX TILLERSON  I believe all the items of business have been ...
60  REX TILLERSON  While the inspectors of election are preparing...
62  REX TILLERSON  Well, I'm not sure I would characterize it as ...
64  REX TILLERSON  Well, as many of have heard me say before, if ...
65  REX TILLERSON  The inspectors of election are ready to report...
67  REX TILLERSON  Thank you. As stated in the written report of ...
\end{verbatim}

\hypertarget{analyzing-distinguishing-terms}{%
\section{Analyzing Distinguishing
Terms}\label{analyzing-distinguishing-terms}}

And there we have it. We started with a PDF on a website, and we've
ended up with a dataframe in which each row is a speech, with a column
indicating who is speaking, what they're saying, and when they said it.

Now, let's use this dataframe to create a scatterplot comparing the
language used by the company's CEO Rex Tillerson and the company's
shareholders. This will give us insights into the topics that are
important for shareholders, and the debates that take place within the
company.

We'll do so using the
\href{https://spacy.io/universe/project/scattertext}{scattertext}
library:

\begin{Shaded}
\begin{Highlighting}[]
\OperatorTok{\%\%}\NormalTok{capture}

\ImportTok{import}\NormalTok{ scattertext }\ImportTok{as}\NormalTok{ st}

\CommentTok{\# create a corpus of text from the dataframe }
\NormalTok{corpus }\OperatorTok{=}\NormalTok{ st.CorpusFromPandas(speaker\_df, }\CommentTok{\# load the dataframe }
\NormalTok{                             category\_col}\OperatorTok{=}\StringTok{\textquotesingle{}speaker\textquotesingle{}}\NormalTok{, }\CommentTok{\# indicate which column contains the category we want to distinguish by }
\NormalTok{                             text\_col}\OperatorTok{=}\StringTok{\textquotesingle{}speech\textquotesingle{}}\NormalTok{, }\CommentTok{\# indicate which column stores the text to be analyzed}
\NormalTok{                             nlp}\OperatorTok{=}\NormalTok{nlp).build() }\CommentTok{\# load the NLP models used for analysis }

\CommentTok{\# remove stopwords from the corpus of text}
\NormalTok{corpus}\OperatorTok{=}\NormalTok{corpus.remove\_terms(nlp.Defaults.stop\_words, ignore\_absences}\OperatorTok{=}\VariableTok{True}\NormalTok{)}

\CommentTok{\# now, we create the scatterplot }
\NormalTok{html }\OperatorTok{=}\NormalTok{ st.produce\_scattertext\_explorer(}
\NormalTok{                   corpus, }\CommentTok{\# load the corpus }
\NormalTok{                   category}\OperatorTok{=}\StringTok{"REX TILLERSON"}\NormalTok{, }\CommentTok{\# indicate which category value we want to compare against all others; in this case, all rows in which "REX TILLERSON" is the speaker}
\NormalTok{                   category\_name}\OperatorTok{=}\StringTok{\textquotesingle{}Rex Tillerson\textquotesingle{}}\NormalTok{, }\CommentTok{\# set the label on the plot as "Rex Tillerson"}
\NormalTok{                   not\_category\_name}\OperatorTok{=}\StringTok{\textquotesingle{}Others\textquotesingle{}}\NormalTok{, }\CommentTok{\# set the label on the plot for all other speakers as "Others"}
\NormalTok{                   width\_in\_pixels}\OperatorTok{=}\DecValTok{1000}\NormalTok{) }\CommentTok{\#set the width }
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# display the plot                   }
\NormalTok{display(HTML(html))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
<IPython.core.display.HTML object>
\end{verbatim}

The plot above compares the frequency of terms used by Exxon CEO Rex
Tillerson (on the Y axis) against those used by other speakers (mainly
shareholders, on the X axis). The top right corner will contain terms
used frequently by both groups. The bottom left corner contains terms
used infrequently by both groups. The top left corner contains terms
used frequently by Rex Tillerson, but infrequently by shareholders. The
bottom right corner contains terms used frequently by shareholders, but
infrequently by Rex Tillerson.

A list of top terms used by each group is shown on the right. On this
list, we can see that ``climate change'' is the 4th most common phrase
used by the ``Others'' category, but isn't even in the top ten for Rex.
If you click on ``climate change'' in this list, it will give you some
statistics on how frequently this term is used by each group, as well as
a selection of example sentences in which the term appears. You can
search for other words/phrases either by clicking on them in the
scatterplot, or entering them into the ``Search the chart'' box below
the scatterplot.

\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}

\hypertarget{exercise-10}{%
\subsection{Exercise}\label{exercise-10}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Search for ``ALEC'' in this plot, and then google the term to find out
  more about what this is. What are shareholders aiming to do regarding
  ALEC, and how does Tillerson respond?
\item
  Search for the term ``carbon''. What differences do you notice in the
  use of this term by Rex Tillerson versus the shareholders?
\item
  Use this plot to identify another topic that shareholders are
  pressuring Exxon about.
\end{enumerate}

\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}

\bookmarksetup{startatroot}

\hypertarget{external-pressure}{%
\chapter{External pressure}\label{external-pressure}}

The fact that Exxon knew about climate change in the 1970s and still
funded climate denial resulted in public outrage, culminating in an
online movement organized around the twitter hashtag \#ExxonKnew.

I've downloaded almost 100,000 tweets containing the hashtag
\#ExxonKnew, between 2016 and 2017. Work together as a group to explore
and analyze this dataset.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tweets}\OperatorTok{=}\NormalTok{pd.read\_csv(}\StringTok{\textquotesingle{}https://storage.googleapis.com/qm2/wk4/Exxon\_tweets\_clean.csv\textquotesingle{}}\NormalTok{)}
\NormalTok{tweets}\OperatorTok{=}\NormalTok{tweets.sample(frac}\OperatorTok{=}\DecValTok{1}\NormalTok{, random\_state}\OperatorTok{=}\DecValTok{1}\NormalTok{) }\CommentTok{\#don\textquotesingle{}t edit this line, i\textquotesingle{}m shuffling the data for you}
\NormalTok{tweets}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}lllllll@{}}
\toprule\noalign{}
& text & created\_at & author\_id & lang & latitude & longitude \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
13362 & RT @Exxon\_Knew: ICYMI: @LamarSmithTX21 keeps p... &
2016-11-13T07:09:30.000Z & 36152170 & en & NaN & NaN \\
45804 & Look what Exxon has bought and paid for... \#Ex... &
2016-06-17T15:46:41.000Z & 31419872 & en & NaN & NaN \\
59678 & RT @350: It\textquotesingle s time to make fossil fuel
compani... & 2016-05-07T15:29:06.000Z & 1576283767 & en & NaN & NaN \\
62120 & RT @Energydesk: "There is no doubt": 1980 docs... &
2016-04-27T08:49:56.000Z & 429464827 & en & NaN & NaN \\
8046 & RT @MarkRuffalo: Why does Putin want Trump Pre... &
2016-12-11T02:33:34.000Z & 56839488 & en & NaN & NaN \\
... & ... & ... & ... & ... & ... & ... \\
21440 & RT @greenpeaceusa: This is a big deal. \#ExxonK... &
2016-09-20T18:01:48.000Z & 7.15E+17 & en & NaN & NaN \\
73349 & the @FBI is investigating what \#ExxonKnew abou... &
2016-03-29T15:53:32.000Z & 750681121 & en & NaN & NaN \\
50057 & RT @sierraclub: Exxon knew about \#climatechang... &
2016-05-29T06:39:38.000Z & 461795005 & en & NaN & NaN \\
5192 & RT @PriceofOil: \#ExxonKnew, Exxon lied. And no... &
2016-12-13T23:03:07.000Z & 14247309 & en & NaN & NaN \\
77708 & RT @AKRisingTide: The banner coming down to pr... &
2016-03-06T07:42:49.000Z & 3135294514 & en & NaN & NaN \\
\end{longtable}

\hypertarget{sentiment-analysis}{%
\subsection{Sentiment Analysis}\label{sentiment-analysis}}

Sentiment analysis is the computational study of people's opinions,
sentiments, emotions, appraisals, and attitudes towards entities such as
products, services, organizations, individuals, issues, events, topics,
and their attributes. Let's study the sentiment of the tweets in this
dataset.

\href{https://spacytextblob.netlify.app/}{spacytextblob} performs
sentiment analysis using the TextBlob library. Adding spacytextblob to a
spaCy nlp pipeline creates a new extension attribute for the Doc.

The \texttt{.\_.blob} attribute contains all of the methods and
attributes that belong to the \texttt{textblob.TextBlob} class. Some of
the common methods and attributes include:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \texttt{.\_.blob.polarity}: a float within the range {[}-1.0, 1.0{]}.
\item
  \texttt{.\_.blob.subjectivity}: a float within the range {[}0.0,
  1.0{]} where 0.0 is very objective and 1.0 is very subjective.
\item
  \texttt{.\_.blob.sentiment\_assessments.assessments}: a list of
  polarity and subjectivity scores for the assessed tokens.
\end{enumerate}

Let's run sentiment analysis on a single tweet:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# grab the 130th row in the dataframe and select the text of the tweet}
\NormalTok{text}\OperatorTok{=}\NormalTok{tweets.iloc[}\DecValTok{130}\NormalTok{][}\StringTok{\textquotesingle{}text\textquotesingle{}}\NormalTok{]}

\CommentTok{\# apply the NLP pipeline to this text.}
\NormalTok{doc }\OperatorTok{=}\NormalTok{ nlp(text)}

\BuiltInTok{print}\NormalTok{(text)}
\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}Polarity: \textquotesingle{}}\NormalTok{, doc.\_.blob.polarity)}
\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}Subjectivity: \textquotesingle{}}\NormalTok{, doc.\_.blob.subjectivity)}
\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}Assessments: \textquotesingle{}}\NormalTok{, doc.\_.blob.sentiment\_assessments.assessments)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
RT @simonrlomax: #Climate Witch Hunt: The Left Is Coming for You Next https://t.co/O7ant6fsgt
https://t.co/O7ant6fsgt via .@KevinNR @NRO #E…
Polarity:  0.0
Subjectivity:  0.0
Assessments:  [(['left'], 0.0, 0.0, None), (['next'], 0.0, 0.0, None)]
\end{verbatim}

In the example discussed here, the model has deemed the tweet to be
expressing negative sentiment: it has a polarity of -0.15. (The tweet
selected when you run the cell may differ from this example, in which
case the numbers printed above will not match those discussed below.) It
also deems this to be a pretty subjective tweet, with a subjectivity
score of 0.8. It does
indeed appear to be expressing a subjective opinion. Finally, we can see
which words are leading to this assessment. The word ``good'' is leading
to a 0.7 increase in the polarity score, and a 0.6 increase in the
subjectivity score. The word ``worst'' is leading to a -1 change in
polarity, and a +1 change in subjectivity. The overall scores are
weighted averages of these values. Though these scores do roughly align
with the actual sentiment of this tweet, \textbf{ALWAYS} pay attention
to what's going on inside of your sentiment analysis pipeline. Even
though the overall sentiment score here is negative, it should probably
be even more negative; the algorithm picked up on the word ``good'' in
this tweet, and this improved the polarity score by 0.7. But the context
in which ``good'' was uttered in this tweet is actually negative! The
person is saying ``stop saying \#Tillerson is good on climate''---this
is expressing negative sentiment!

\hypertarget{assessed-question-3}{%
\subsection{Assessed Question}\label{assessed-question-3}}

In this assessed question, we want to use NLP and spatial analysis to
map out where the people who are most angry about Exxon are located.

In the code cell below:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  create a dataframe called ``sample'' which contains the first 3000
  tweets from the ``tweets'' dataframe.
\item
  Using a lambda function,
  \texttt{.apply(lambda\ x:\ nlp(x).\_.blob.polarity)}, create a column
  in the sample dataframe that contains the polarity of each tweet.
\item
  Create a column that contains the subjectivity score for each tweet.
\item
  Filter the dataframe to keep only the tweets that are subjective
  (subjectivity score \textgreater{} 0.5) \emph{and} tweets that have
  negative sentiment (polarity score \textless{} 0).
\end{enumerate}

\emph{Question: which twitter user hates Exxon the most (lowest total
sentiment)? Make note of their ID.}

\bookmarksetup{startatroot}

\hypertarget{distributions-and-basic-statistics}{%
\chapter{Distributions and Basic
Statistics}\label{distributions-and-basic-statistics}}

\hypertarget{workshop-5-open-in-colab}{%
\section[\emph{Workshop 5} ]{\texorpdfstring{\emph{Workshop 5}
\href{https://colab.research.google.com/github/oballinger/QM2/blob/main/notebooks/W05.\%20Distributions\%20and\%20Basic\%20Statistics.ipynb}{\protect\includegraphics{index_files/mediabag/colab-badge.png}}}{Workshop 5 Open In Colab}}\label{workshop-5-open-in-colab}}

For the rest of this course, we'll be working with data from the U.S.
Census \href{https://www.census.gov/programs-surveys/cps.html}{Current
Population Survey (CPS)}.

\hypertarget{aims-2}{%
\subsection{Aims:}\label{aims-2}}

\begin{itemize}
\item
  Choosing appropriate summary statistics for varying distributions
\item
  Understanding:

  \begin{itemize}
  \tightlist
  \item
    The nature of our dataset, including potential bias
  \item
    How to generate summary statistics for our dataset
  \item
    The distribution of different variables
  \item
    The intuition behind the Central Limit Theorem
  \end{itemize}
\end{itemize}

\hypertarget{getting-started-1}{%
\section{Getting Started}\label{getting-started-1}}

\hypertarget{first-things-first-bias}{%
\subsection{First Things First: Bias}\label{first-things-first-bias}}

Once we've acquired a dataset, the first step is \emph{always} to
develop an understanding of where the data has come from. For this
dataset, use the following
\href{https://www.census.gov/programs-surveys/cps/technical-documentation/methodology.html}{documentation
page} to answer the questions below:

\begin{enumerate}
\def\labelenumi{\arabic{enumi})}
\tightlist
\item
  What is the population of interest?
\item
  What was the sampling strategy?
\item
  What are potential sources of selection bias?
\end{enumerate}

I'll start by importing the libraries I need: matplotlib and seaborn
(for graphs), pandas (for data), numpy (for maths), and scipy and
statistics (for statistical functions):

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#This tells python to draw the graphs "inline" {-} in the notebook}
\OperatorTok{\%}\NormalTok{matplotlib inline  }
\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
\ImportTok{from}\NormalTok{ scipy.stats }\ImportTok{import}\NormalTok{ norm}
\ImportTok{import}\NormalTok{ statistics}
\ImportTok{import}\NormalTok{ seaborn }\ImportTok{as}\NormalTok{ sns}

\ImportTok{import}\NormalTok{ pylab}
\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd}
\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
\CommentTok{\# make the plots (graphs) a little wider by default}
\NormalTok{pylab.rcParams[}\StringTok{\textquotesingle{}figure.figsize\textquotesingle{}}\NormalTok{] }\OperatorTok{=}\NormalTok{ (}\FloatTok{10.}\NormalTok{, }\FloatTok{8.}\NormalTok{)}
\NormalTok{sns.}\BuiltInTok{set}\NormalTok{(font\_scale}\OperatorTok{=}\FloatTok{1.5}\NormalTok{)}
\NormalTok{sns.set\_style(}\StringTok{"white"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

Now that I've imported the libraries I'm going to be using, I'm ready to
import the data:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df}\OperatorTok{=}\NormalTok{pd.read\_csv(}\StringTok{\textquotesingle{}https://storage.googleapis.com/qm2/wk7/cps.csv\textquotesingle{}}\NormalTok{)}
\NormalTok{df.head()}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llllllllllll@{}}
\toprule\noalign{}
& year & state & age & sex & race & sch & ind & union & incwage &
realhrwage & occupation \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 1990 & 36 & 58 & 1 & 3 & 12.0 & 871 & 0.0 & 14200.0 & 12.269874 &
Office and Admin Support \\
1 & 2009 & 5 & 28 & 1 & 1 & 12.0 & 8660 & 1.0 & 17680.0 & 8.635149 &
Office and Admin Support \\
2 & 1990 & 36 & 37 & 1 & 1 & 14.0 & 380 & 1.0 & 28000.0 & 21.169851 &
. \\
3 & 1990 & 6 & 34 & 1 & 1 & 18.0 & 740 & 1.0 & 27500.0 & 20.447746 &
Computer and Math Technicians \\
4 & 1981 & 51 & 38 & 1 & 4 & 13.0 & 798 & NaN & 17000.0 & 18.892282 &
Managers \\
\end{longtable}

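Our dataframe has 11 columns. Ten of them are described below; the
remaining column, \emph{state}, holds a numeric code identifying the
state: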

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \emph{year}: Survey year
\item
  \emph{age}: the person's age
\item
  \emph{sex}: the person's sex

  \begin{itemize}
  \tightlist
  \item
    1=male
  \item
    2=female
  \end{itemize}
\item
  \emph{race}: the person's race

  \begin{itemize}
  \tightlist
  \item
    White non hispanic=1
  \item
    Black non hispanic=2
  \item
    Hispanic=3
  \item
    Other non hispanic=4
  \end{itemize}
\item
  \emph{sch}: Educational attainment

  \begin{itemize}
  \tightlist
  \item
    None = 0,
  \item
    Grades 1-12 = 1-12
  \item
    Some University = 13,
  \item
    Associate's degree = 14,
  \item
    BA = 16
  \item
    Advanced Degree = 18
  \end{itemize}
\item
  \emph{union}: Union membership

  \begin{itemize}
  \tightlist
  \item
    N/A = 0,
  \item
    No union coverage = 1,
  \item
    Member of labor union=2,
  \item
    Covered by union but not a member=3
  \end{itemize}
\item
  \emph{incwage}: Wage and salary income
\item
  \emph{realhrwage}: Real Hourly Wage
\item
  \emph{occupation}: Occupation
\item
  \emph{ind}:
  \href{https://www.census.gov/naics/?58967?yearbck=2002}{industry code}
\end{enumerate}

\hypertarget{summary-statistics}{%
\section{Summary Statistics}\label{summary-statistics}}

After thinking about the origins of our dataset and loading it into
python, the next step is to generate summary statistics. This is vital
for us to better understand our data. Pandas has a useful function,
\texttt{describe}, which will generate summary statistics for all
numerical variables in our entire dataframe:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df.describe()}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}lllllllllll@{}}
\toprule\noalign{}
& year & state & age & sex & race & sch & ind & union & incwage &
realhrwage \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
count & 344287.000000 & 344287.000000 & 344287.000000 & 344287.000000 &
344287.000000 & 344287.000000 & 344287.000000 & 301908.000000 &
3.442870e+05 & 344287.000000 \\
mean & 2002.599122 & 28.121004 & 41.734364 & 1.489057 & 1.570077 &
13.498057 & 4235.846009 & 0.221505 & 3.976170e+04 & 22.886629 \\
std & 10.831555 & 15.818556 & 10.415874 & 0.499881 & 0.952252 & 2.799038
& 3468.163157 & 0.499690 & 4.529758e+04 & 506.489695 \\
min & 1981.000000 & 1.000000 & 25.000000 & 1.000000 & 1.000000 &
0.000000 & 10.000000 & 0.000000 & 1.500000e+01 & 2.000000 \\
25\% & 1990.000000 & 13.000000 & 33.000000 & 1.000000 & 1.000000 &
12.000000 & 760.000000 & 0.000000 & 1.670000e+04 & 11.723004 \\
50\% & 2007.000000 & 28.000000 & 41.000000 & 1.000000 & 1.000000 &
13.000000 & 4270.000000 & 0.000000 & 3.000000e+04 & 17.698591 \\
75\% & 2011.000000 & 41.000000 & 50.000000 & 2.000000 & 2.000000 &
16.000000 & 7860.000000 & 0.000000 & 5.000000e+04 & 26.442308 \\
max & 2013.000000 & 56.000000 & 64.000000 & 2.000000 & 4.000000 &
18.000000 & 9590.000000 & 3.000000 & 1.259999e+06 & 294610.968750 \\
\end{longtable}

\texttt{describe} returns a dataframe with the same columns as the
source dataframe. For numeric data, the result's index will include
count, mean, std, min, and max, as well as the lower, 50th, and upper
percentiles. By default the lower percentile is 25 and the upper
percentile is 75. The 50th percentile is the same as the median.
``incwage'' is the annual
income variable. Because values are generally in the tens of thousands,
Python displays this using scientific notation (e.g.~3.442870e+05). This
is pretty ugly, so let's create a new variable called ``income'' which
divides that number by 1000 to make it more manageable:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df[}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{df[}\StringTok{\textquotesingle{}incwage\textquotesingle{}}\NormalTok{]}\OperatorTok{/}\DecValTok{1000}
\BuiltInTok{print}\NormalTok{(df[}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{])}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
0         14.20
1         17.68
2         28.00
3         27.50
4         17.00
          ...  
344282    17.00
344283     3.20
344284    13.00
344285     8.80
344286    15.00
Name: income, Length: 344287, dtype: float64
\end{verbatim}

\hypertarget{exercise-11}{%
\subsection{Exercise}\label{exercise-11}}

Given these summary statistics, answer the following questions:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  what is the median hourly wage?
\item
  what is the average age?
\item
  are there more men or women?
\item
  interpret the mean of the ``race'' column.
\end{enumerate}

The answer to the last question should provoke some further thought; the
race column is categorical, but because it contains numbers it's being
treated as numerical. The mean of a categorical variable is meaningless.
For object data (e.g.~categories, strings or timestamps), the result's
index will include count, unique, top, and freq. The top is the most
common value. The freq is the most common value's frequency. Timestamps
also include the first and last items.

Let's convert the race column from a numerical variable into a
categorical one, and try \texttt{describe} once again:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df.dtypes}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
year            int64
state           int64
age             int64
sex             int64
race            int64
sch           float64
ind             int64
union         float64
incwage       float64
realhrwage    float64
occupation     object
income        float64
dtype: object
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df[}\StringTok{\textquotesingle{}race\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{df[}\StringTok{\textquotesingle{}race\textquotesingle{}}\NormalTok{].astype(}\StringTok{\textquotesingle{}category\textquotesingle{}}\NormalTok{)}
\NormalTok{df[}\StringTok{\textquotesingle{}race\textquotesingle{}}\NormalTok{].describe()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
count     344287
unique         4
top            1
freq      240382
Name: race, dtype: int64
\end{verbatim}

What other variables are categorical? Convert them to categorical and
describe. What is the most common occupation in this dataset?

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# convert the variables to categorical and describe}
\end{Highlighting}
\end{Shaded}

These statistics are useful, but suppose we want detailed counts of the
number of individuals in each category. For this, we can use the
\texttt{groupby} function, with the \texttt{.size()} operator which
simply counts the number of rows in each category.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{occupations}\OperatorTok{=}\NormalTok{ df.groupby(}\StringTok{\textquotesingle{}occupation\textquotesingle{}}\NormalTok{).size()}
\NormalTok{occupations.sort\_values(ascending}\OperatorTok{=}\VariableTok{False}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
occupation
.                                         132708
Office and Admin Support                   50635
Managers                                   35696
Consruction, Extraction, Installation      30579
Production                                 29732
Transportation and materials moving        21277
Computer and Math Technicians               8602
Protective Service adj_occupations          7809
financial Operators                         7702
Business Operators                          7327
Community and Social Workers                6025
Lawyers, Judges,Physicans and dentists      3835
Farming, Fishing & Forestry                 2360
dtype: int64
\end{verbatim}

What is the most common profession?

\bookmarksetup{startatroot}

\hypertarget{distributions}{%
\chapter{Distributions}\label{distributions}}

Now that we've cleaned our data up, let's have a closer look at the
\emph{distribution} of our data. The best way to do this is using a
histogram, which takes one variable and divides its values into a number
of bins shown on the X axis, and then counts the number of observations
in each of those bins on the Y axis. Let's start by looking at the
distribution of the \texttt{income} variable:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df}\OperatorTok{=}\NormalTok{df[df[}\StringTok{\textquotesingle{}year\textquotesingle{}}\NormalTok{]}\OperatorTok{==}\DecValTok{2013}\NormalTok{] }\CommentTok{\# filter the dataframe to only contain 2013 data}

\NormalTok{plt.hist(df[}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{]) }\CommentTok{\# plot a histogram of the income variable}
\NormalTok{plt.show() }\CommentTok{\# show the plot}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W05. Distributions and Basic Statistics_files/figure-pdf/cell-10-output-1.png}

}

\end{figure}

This is a pretty ugly histogram, and it's not telling us very much
useful information. It shows that the vast majority of people make
between \$0 and \$100,000 per year, and a few make over \$200,000. A
small number make over \$1 million per year, so the plot is being
extended to accommodate these outliers. Let's try fixing the histogram
up a bit.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{plt.hist(df[}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{], bins}\OperatorTok{=}\DecValTok{100}\NormalTok{, edgecolor}\OperatorTok{=}\StringTok{\textquotesingle{}white\textquotesingle{}}\NormalTok{, density}\OperatorTok{=}\VariableTok{True}\NormalTok{) }
\CommentTok{\# i\textquotesingle{}ve increased the number of bins to 100 to make the plot smoother,}
\CommentTok{\# added the density=True argument to make the y{-}axis a probability density instead of a count,}
\CommentTok{\# and the edgecolor=\textquotesingle{}white\textquotesingle{} argument to add some space between the bars, making the plot easier to read}


\NormalTok{plt.xlabel(}\StringTok{\textquotesingle{}Income ($, thousands)\textquotesingle{}}\NormalTok{) }\CommentTok{\# add a label to the x axis}
\NormalTok{plt.title(}\StringTok{"U.S. Income Distributon, 2013"}\NormalTok{) }\CommentTok{\# add a title to the plot}

\NormalTok{plt.show() }
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W05. Distributions and Basic Statistics_files/figure-pdf/cell-11-output-1.png}

}

\end{figure}

That's better. We can now see more variation in how much people earn
within the \$0--200,000 range since we increased the number of bins in
the histogram. It could still be improved, though. The outliers making
over \$1 million are creating lots of dead space in this plot. We can
defensibly omit them from the plot, as long as we acknowledge that we've
done this somewhere in our analysis.

Let's also plot the mean and median of our distribution.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{inc\_summary}\OperatorTok{=}\NormalTok{df[}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{].describe() }\CommentTok{\# get summary statistics for the income variable using the describe() method, and store them in a variable called inc\_summary}
\BuiltInTok{print}\NormalTok{(inc\_summary[[}\StringTok{\textquotesingle{}mean\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}50\%\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}std\textquotesingle{}}\NormalTok{]]) }\CommentTok{\# print the mean, median and standard deviation}

\NormalTok{plt.hist(df[}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{], bins}\OperatorTok{=}\DecValTok{100}\NormalTok{, edgecolor}\OperatorTok{=}\StringTok{\textquotesingle{}white\textquotesingle{}}\NormalTok{, density}\OperatorTok{=}\VariableTok{True}\NormalTok{) }\CommentTok{\# plot the histogram again}
\NormalTok{plt.axvline(inc\_summary[}\StringTok{\textquotesingle{}mean\textquotesingle{}}\NormalTok{], color}\OperatorTok{=}\StringTok{\textquotesingle{}red\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}dashed\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\DecValTok{1}\NormalTok{,label}\OperatorTok{=}\StringTok{\textquotesingle{}Mean\textquotesingle{}}\NormalTok{) }\CommentTok{\# get the mean from the inc\_summary variable and plot a vertical line in red at that point}
\NormalTok{plt.axvline(inc\_summary[}\StringTok{\textquotesingle{}50\%\textquotesingle{}}\NormalTok{], color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}dashed\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\DecValTok{1}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}Median\textquotesingle{}}\NormalTok{) }\CommentTok{\# do the same for the median, but plot it in black}

\NormalTok{plt.legend()}
\NormalTok{plt.xlabel(}\StringTok{\textquotesingle{}Income ($, thousands\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.title(}\StringTok{"U.S. Income Distributon, 2013"}\NormalTok{)}
\NormalTok{plt.xlim(}\DecValTok{0}\NormalTok{,}\DecValTok{250}\NormalTok{) }\CommentTok{\# set the x{-}axis limits to 0 to 250{-}{-} this will get rid of the outliers on the right side of the plot}

\NormalTok{plt.show()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
mean    51.821863
50%     40.000000
std     60.163449
Name: income, dtype: float64
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W05. Distributions and Basic Statistics_files/figure-pdf/cell-12-output-2.png}

}

\end{figure}

This histogram is far more informative---use the questions in the
exercise below to guide your interpretation of this plot.

\hypertarget{exercise-12}{%
\subsection{Exercise}\label{exercise-12}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  What is the (approximate) mean of this distribution?
\item
  What is the median?
\item
  Keeping in mind that we've excluded some extreme values, why might the
  mean be larger than the median? Interpret this qualitatively in
  reference to income inequality.
\item
  There are slight bumps in density at \$100,000, \$150,000, \$200,000,
  and \$250,000. Why might this be?
\end{enumerate}

As we have seen, there are a few extreme outliers in the income
distribution (really rich people). Outliers can bias some statistical
tests, so for the rest of this workbook, we're going to subset our
dataframe to exclude those who make over \$200k per year:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df}\OperatorTok{=}\NormalTok{df[df[}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{]}\OperatorTok{\textless{}}\DecValTok{200}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\hypertarget{functions}{%
\section{Functions}\label{functions}}

Now we've got a pretty good sense of what's going on with the income
variable. But suppose we want to do this for another variable. We could
just copy and paste the code above, switch around the variable in
question, and edit the labels. But there's a far more efficient way of
doing things. In Python and most programming languages, you can write
your own \textbf{function}.

A function is a block of code that you can call on to do a specific
task. You can write your own functions, or you can use functions that
other people have written. Functions are useful because they allow you
to write code once, and then call on it whenever you need it. This is
much more efficient than writing the same code over and over again. You
can define a function by using the \texttt{def} keyword. For example, we
can define a function called \texttt{variable\_stats} that will
calculate the mean, median, and standard deviation of the variable that
you specify.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{def}\NormalTok{ variable\_stats(variable): }\CommentTok{\# define a function called variable\_stats that takes a variable as an argument}
\NormalTok{    mean }\OperatorTok{=}\NormalTok{ variable.mean() }\CommentTok{\# calculate the mean of the variable}
\NormalTok{    median }\OperatorTok{=}\NormalTok{ variable.median() }\CommentTok{\# calculate the median of the variable}
\NormalTok{    std }\OperatorTok{=}\NormalTok{ variable.std() }\CommentTok{\# calculate the standard deviation of the variable}
    \BuiltInTok{print}\NormalTok{(}\StringTok{"Mean: "} \OperatorTok{+} \BuiltInTok{str}\NormalTok{(mean)) }\CommentTok{\# print the mean}
    \BuiltInTok{print}\NormalTok{(}\StringTok{"Median: "} \OperatorTok{+} \BuiltInTok{str}\NormalTok{(median)) }\CommentTok{\# print the median}
    \BuiltInTok{print}\NormalTok{(}\StringTok{"Standard deviation: "} \OperatorTok{+} \BuiltInTok{str}\NormalTok{(std)) }\CommentTok{\# print the standard deviation}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# We can then call on this function whenever we want to calculate these statistics. }

\NormalTok{variable\_stats(df[}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{]) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Mean: 46.75408852483797
Median: 40.0
Standard deviation: 32.82190958000738
\end{verbatim}

Now, to calculate the same values for the age variable, we can simply
change which variable we feed the function:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{variable\_stats(df[}\StringTok{\textquotesingle{}age\textquotesingle{}}\NormalTok{]) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Mean: 42.845076809704665
Median: 43.0
Standard deviation: 10.576333292267508
\end{verbatim}

We can write a more complex function to deal with plotting new
histograms for different variables, since most of the code we need to
plot a histogram won't change from one variable to the next. A few
things will change---the variable that we're plotting, the title of the
graph, the label on the x-axis, and perhaps the number of
bins. We can write a function called \texttt{plot\_histogram} that takes
these four things as arguments, and then plots a histogram. Then, we can
call on this function whenever we want to plot a histogram of a new
variable. Below is the same code we used to plot the histogram of
income, but this time we've written it as a function called
\texttt{plot\_histogram} and substituted the variable name
\texttt{income} for the argument \texttt{variable}.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{def}\NormalTok{ plot\_histogram(variable, bin\_number, xlab, title): }\CommentTok{\# define a function called plot\_histogram that takes a variable, number of bins, x{-}axis label, and title as arguments}
    
\NormalTok{    summary}\OperatorTok{=}\NormalTok{variable.describe()     }
\NormalTok{    plt.hist(variable, bins}\OperatorTok{=}\NormalTok{bin\_number,edgecolor}\OperatorTok{=}\StringTok{\textquotesingle{}white\textquotesingle{}}\NormalTok{, density}\OperatorTok{=}\VariableTok{True}\NormalTok{) }\CommentTok{\# plot the histogram. Notice i\textquotesingle{}ve changed "bins=100" to "bins=bin\_number" so that the number of bins can be specified when the function is called}
\NormalTok{    plt.axvline(summary[}\StringTok{\textquotesingle{}mean\textquotesingle{}}\NormalTok{], color}\OperatorTok{=}\StringTok{\textquotesingle{}red\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}dashed\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\DecValTok{1}\NormalTok{,label}\OperatorTok{=}\StringTok{\textquotesingle{}Mean \textquotesingle{}}\OperatorTok{+}\BuiltInTok{str}\NormalTok{(}\BuiltInTok{round}\NormalTok{(summary[}\StringTok{\textquotesingle{}mean\textquotesingle{}}\NormalTok{],}\DecValTok{2}\NormalTok{)))}
\NormalTok{    plt.axvline(summary[}\StringTok{\textquotesingle{}50\%\textquotesingle{}}\NormalTok{], color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}dashed\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\DecValTok{1}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}Median \textquotesingle{}}\OperatorTok{+}\BuiltInTok{str}\NormalTok{(}\BuiltInTok{round}\NormalTok{(summary[}\StringTok{\textquotesingle{}50\%\textquotesingle{}}\NormalTok{],}\DecValTok{2}\NormalTok{)))}

\NormalTok{    plt.legend()}
\NormalTok{    plt.xlabel(xlab) }\CommentTok{\# i\textquotesingle{}ve changed the x{-}axis label to "xlab" so that it can be specified when the function is called}
\NormalTok{    plt.title(title) }\CommentTok{\# similarly, we can now specify the title when calling the function}
\NormalTok{    plt.show()}
\end{Highlighting}
\end{Shaded}

Now we can just call the function with the variable we want to plot, the
number of bins, the x-axis label, and the title using one line of code.
Let's recreate the histogram of income from above, but this time using
the function we just defined:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{plot\_histogram(variable }\OperatorTok{=}\NormalTok{ df[}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{], bin\_number }\OperatorTok{=} \DecValTok{20}\NormalTok{, xlab }\OperatorTok{=} \StringTok{\textquotesingle{}Income ($, 000)\textquotesingle{}}\NormalTok{, title }\OperatorTok{=} \StringTok{\textquotesingle{}U.S. Income Distribution, 2013\textquotesingle{}}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W05. Distributions and Basic Statistics_files/figure-pdf/cell-18-output-1.png}

}

\end{figure}

If we want to produce lots of similar plots, this really helps us cut
down on repetition.

\hypertarget{exercise-13}{%
\subsection{Exercise}\label{exercise-13}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Plot the histogram of the ``age'' variable with 80 bins, label the
  x-axis ``Age'', and add the title ``U.S. Age Distribution, 2013''.
\item
  Plot the distribution of schooling years.

  \begin{itemize}
  \tightlist
  \item
    Find an appropriate number of bins
  \item
    Label it clearly
  \item
    Interpret salient trends
  \end{itemize}
\end{enumerate}

\hypertarget{the-central-limit-theorem}{%
\section{The Central Limit Theorem}\label{the-central-limit-theorem}}

But as we learned in class, the Central Limit Theorem states that the
\textbf{distribution of the mean of a sample of observations will be
approximately normal, regardless of the distribution of the original
observations}. So, if we take a \textbf{large enough sample} of
observations from each of these variables, and calculate the mean of
each sample, we should get a normal distribution. This is important
because the normal distribution behaves in a very predictable way.

The code below creates a ``standard normal'' distribution with a mean of
0 and a standard deviation of 1:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{mu, se}\OperatorTok{=} \DecValTok{0}\NormalTok{, }\DecValTok{1} \CommentTok{\# create two variables, a mean "mu" equal to zero, and standard deviation "se" equal to 1}
\NormalTok{x }\OperatorTok{=}\NormalTok{ np.linspace(mu }\OperatorTok{{-}} \DecValTok{3}\OperatorTok{*}\NormalTok{se, mu }\OperatorTok{+} \DecValTok{3}\OperatorTok{*}\NormalTok{se, }\DecValTok{100}\NormalTok{) }\CommentTok{\# create a range of values from {-}3 to 3 standard deviations}

\NormalTok{plt.plot(x, norm.pdf(x, mu, se)) }\CommentTok{\# plot the normal distribution}
\NormalTok{plt.axvline(mu, color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}solid\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\DecValTok{1}\NormalTok{,label}\OperatorTok{=}\StringTok{\textquotesingle{}µ\textquotesingle{}}\NormalTok{)  }\CommentTok{\# plot a vertical line at the mean}
\NormalTok{plt.axvline(mu}\OperatorTok{{-}}\NormalTok{se}\OperatorTok{*}\DecValTok{2}\NormalTok{, color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}dashed\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\FloatTok{1.5}\NormalTok{,label}\OperatorTok{=}\StringTok{\textquotesingle{}µ ± 2σ\textquotesingle{}}\NormalTok{) }\CommentTok{\# plot a vertical line at the mean minus 2 standard deviations}
\NormalTok{plt.axvline(mu}\OperatorTok{+}\NormalTok{se}\OperatorTok{*}\DecValTok{2}\NormalTok{, color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}dashed\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\FloatTok{1.5}\NormalTok{)  }\CommentTok{\# plot a vertical line at the mean plus 2 standard deviations}
\NormalTok{plt.legend()}
\NormalTok{plt.show()}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W05. Distributions and Basic Statistics_files/figure-pdf/cell-21-output-1.png}

}

\end{figure}

For a distribution with this shape:

\begin{itemize}
\tightlist
\item
  68\% of the values will be within 1 standard deviation of the mean
\item
  95.45\% of the values will be within 2 standard deviations of the mean
\item
  99.7\% of the values will be within 3 standard deviations of the mean
\end{itemize}

So in the plot above, if I took a random value from the distribution,
there's a 95\% chance that it would be between \(-2\) and 2 (within the
dashed lines), and a 99.7\% chance that it would be between \(-3\) and 3.

It's crucial to note, however, that this applies to the mean of a
sample, not individual observations. For example, this doesn't mean that
there is a 95\% chance that an individual taken at random will have an
income that is within 2 standard deviations of the mean (\$46k). It
means that if we take a sample of 100 observations, there is a 95\%
chance that the \textbf{mean of that sample} will be within 2 standard
deviations of the mean (\$46k).

\hypertarget{sampling}{%
\subsection{Sampling}\label{sampling}}

To illustrate how this works, for the rest of this workshop we're going
to pretend that the dataframe contains the entire adult
\textbf{population} of the United States (of course, it is actually a
sample but just pretend). The mean of this distribution will thus be the
\textbf{population mean}; for the income variable, this is \$46k.

We can use the \texttt{sample} function to take a random sample of
observations from a distribution. We'll take a sample of 5 observations
from the income variable and use the \texttt{mean} function to calculate
the mean of this sample.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{income\_sample }\OperatorTok{=}\NormalTok{ df[}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{].sample(}\DecValTok{5}\NormalTok{, replace}\OperatorTok{=}\VariableTok{True}\NormalTok{) }\CommentTok{\# take a random sample of 5 observations from the income variable}
\NormalTok{income\_sample\_mean}\OperatorTok{=}\NormalTok{income\_sample.mean() }\CommentTok{\# calculate the mean of the sample}
\BuiltInTok{print}\NormalTok{(}\StringTok{"Mean: "} \OperatorTok{+} \BuiltInTok{str}\NormalTok{(income\_sample\_mean)) }\CommentTok{\# print the sample mean}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Mean: 33.0
\end{verbatim}

\hypertarget{exercise-14}{%
\subsection{Exercise}\label{exercise-14}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Run the code cell above 10 times and make note of the mean. What is
  the farthest the sample mean deviates from the ``population'' mean of
  \$46k?
\item
  Increase the sample size from 5 to 100 and run the cell 10 more times.
  Now, what is the farthest the sample mean deviates from the population
  mean?
\item
  Increase the sample size to 1000. What do you notice about the sample
  means as we increase the sample size?
\end{enumerate}

Hopefully, you will have noticed that as the sample size increases, the
sample means tend to be closer to the population mean. But clicking that
cell is hard work. Let's create a loop that will run that block of code
10000 times, save the sample means in a list, and plot the distribution
of sample means as a histogram. We'll start by drawing samples of only
10 observations:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#create an empty list to store sample means}
\NormalTok{sample\_means}\OperatorTok{=}\NormalTok{[]}

\NormalTok{sample\_size}\OperatorTok{=}\DecValTok{10}
\CommentTok{\# loop 10,000 times.}
\ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(}\DecValTok{0}\NormalTok{,}\DecValTok{10000}\NormalTok{):}
\NormalTok{    sample}\OperatorTok{=}\NormalTok{ df[}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{].sample(sample\_size, replace}\OperatorTok{=}\VariableTok{True}\NormalTok{) }\CommentTok{\# draw a sample of 10 observations from the income variable, with replacement}
\NormalTok{    sample\_mean}\OperatorTok{=}\NormalTok{sample.mean() }\CommentTok{\# calculate the mean of the sample}
\NormalTok{    sample\_means.append(sample\_mean) }\CommentTok{\# append the sample mean to the list of sample means}
    
\NormalTok{plt.hist(sample\_means, bins}\OperatorTok{=}\DecValTok{30}\NormalTok{, edgecolor}\OperatorTok{=}\StringTok{\textquotesingle{}white\textquotesingle{}}\NormalTok{, density}\OperatorTok{=}\VariableTok{True}\NormalTok{) }\CommentTok{\# plot a histogram of the sample means}
\NormalTok{plt.title(}\StringTok{\textquotesingle{}Distribution of Sample Means (n=}\SpecialCharTok{\{\}}\StringTok{)\textquotesingle{}}\NormalTok{.}\BuiltInTok{format}\NormalTok{(sample\_size)) }\CommentTok{\# add a title}
\NormalTok{plt.show()}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W05. Distributions and Basic Statistics_files/figure-pdf/cell-23-output-1.png}

}

\end{figure}

\hypertarget{exercise-15}{%
\subsection{Exercise}\label{exercise-15}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Edit the code block above so that instead of drawing samples of 10
  observations, it draws samples of 1000 observations.
\item
  What happens to the distribution?
\end{enumerate}

When we draw 10,000 samples of 1000 observations each, the distribution
of sample means looks a lot more normally distributed than the
underlying distribution of income itself, which is skewed to the right.
To show how normally distributed it is, let's overlay the normal
distribution line we plotted earlier and fit it to the distribution of
sample means. We'll start off by making the same histogram of sample
means, but add a line plot of the normal distribution and some droplines
at ± 2 standard deviations.

Because we may want to do this for several different variables, let's
once again package our code as a function in which we can swap around a
couple bits. In this case, we may want to swap around the variable we're
plotting, the label on the x-axis, and the size of the samples we're
drawing. So we'll create a function called \texttt{plot\_sample\_means}
that takes these three things as arguments (\texttt{var}, \texttt{xlab},
and \texttt{sample\_size}).

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{def}\NormalTok{ plot\_sample\_means(var, xlab, sample\_size): }\CommentTok{\# define a function called plot\_sample\_means that takes a variable, x{-}axis label, and sample size as arguments}

    \CommentTok{\#create an empty list to store sample means}
\NormalTok{    sample\_means}\OperatorTok{=}\NormalTok{[]}

    \CommentTok{\# loop 10,000 times.}
    \ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(}\DecValTok{0}\NormalTok{,}\DecValTok{10000}\NormalTok{):}
        \CommentTok{\# for each iteration, draw a sample of the size specified by the "sample\_size" parameter}
\NormalTok{        sample}\OperatorTok{=}\NormalTok{var.sample(sample\_size, replace}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
        \CommentTok{\# calculate the mean, and append it to the list of sample means. }
\NormalTok{        sample\_mean}\OperatorTok{=}\NormalTok{sample.mean()}
\NormalTok{        sample\_means.append(sample\_mean)}
    
    \CommentTok{\# now, plot a histogram }
\NormalTok{    plt.hist(sample\_means, color}\OperatorTok{=}\StringTok{\textquotesingle{}blue\textquotesingle{}}\NormalTok{,alpha}\OperatorTok{=}\FloatTok{0.5}\NormalTok{, bins}\OperatorTok{=}\BuiltInTok{int}\NormalTok{(}\DecValTok{30}\NormalTok{), edgecolor}\OperatorTok{=}\StringTok{\textquotesingle{}white\textquotesingle{}}\NormalTok{, density}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
    
    \CommentTok{\# fit a normal distribution to the data }
\NormalTok{    mu, se }\OperatorTok{=}\NormalTok{ norm.fit(sample\_means)}
\NormalTok{    xmin, xmax }\OperatorTok{=}\NormalTok{ plt.xlim()}
\NormalTok{    x }\OperatorTok{=}\NormalTok{ np.linspace(xmin, xmax, }\DecValTok{100}\NormalTok{)}
\NormalTok{    p }\OperatorTok{=}\NormalTok{ norm.pdf(x, mu, se) }
\NormalTok{    plt.plot(x, p, }\StringTok{\textquotesingle{}k\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\DecValTok{2}\NormalTok{)}

    \CommentTok{\# calculate the difference between the mean of the sample means and the mean of the variable}
\NormalTok{    diff}\OperatorTok{=}\BuiltInTok{abs}\NormalTok{(mu}\OperatorTok{{-}}\NormalTok{var.mean())}
    
    \CommentTok{\# add droplines, labels, title, legend, and limit the x{-}axis range to 3 standard deviations from the mean on either side.}
\NormalTok{    plt.axvline(mu, color}\OperatorTok{=}\StringTok{\textquotesingle{}green\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}solid\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\DecValTok{3}\NormalTok{,label}\OperatorTok{=}\StringTok{\textquotesingle{}µx̄=\textquotesingle{}}\OperatorTok{+}\BuiltInTok{str}\NormalTok{(}\BuiltInTok{round}\NormalTok{(mu, }\DecValTok{3}\NormalTok{)))}
\NormalTok{    plt.axvline(mu}\OperatorTok{{-}}\NormalTok{se}\OperatorTok{*}\DecValTok{2}\NormalTok{, color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}dashed\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\FloatTok{1.5}\NormalTok{,label}\OperatorTok{=}\StringTok{\textquotesingle{}µ ± 2σ\textquotesingle{}}\NormalTok{)}
\NormalTok{    plt.axvline(mu}\OperatorTok{+}\NormalTok{se}\OperatorTok{*}\DecValTok{2}\NormalTok{, color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}dashed\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\FloatTok{1.5}\NormalTok{)}
\NormalTok{    plt.legend()}
\NormalTok{    plt.xlabel(xlab)    }
\NormalTok{    plt.title(}\StringTok{\textquotesingle{}Distribution of Sample Means (n=}\SpecialCharTok{\{\}}\StringTok{)\textquotesingle{}}\NormalTok{.}\BuiltInTok{format}\NormalTok{(sample\_size))}
\NormalTok{    plt.xlim(mu}\OperatorTok{{-}}\NormalTok{se}\OperatorTok{*}\DecValTok{3}\NormalTok{, mu}\OperatorTok{+}\NormalTok{se}\OperatorTok{*}\DecValTok{3}\NormalTok{)}
\NormalTok{    plt.show()  }
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{plot\_sample\_means(df[}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{], xlab}\OperatorTok{=}\StringTok{\textquotesingle{}Income, ($, 000)\textquotesingle{}}\NormalTok{, sample\_size}\OperatorTok{=}\DecValTok{1000}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W05. Distributions and Basic Statistics_files/figure-pdf/cell-25-output-1.png}

}

\end{figure}

We can see that the distribution of sample means (for samples of 1000
people) very closely approximates the normal distribution. The addition
of droplines at \(\pm 2\sigma\) tells us that \textbf{if we take a random sample of
1000 people, there is a 95\% chance that the mean of this sample will
fall between \textasciitilde\$44.7k and \textasciitilde\$48.8k}.

Why is this important? Let's see what happens when we filter the sample
based on people's attributes. The code below creates two dataframes: one
called \texttt{men} which only contains male respondents, and one called
\texttt{women} which only contains female respondents. Then, we run the
\texttt{plot\_sample\_means()} function on each of these dataframes.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{men}\OperatorTok{=}\NormalTok{df[df[}\StringTok{\textquotesingle{}sex\textquotesingle{}}\NormalTok{]}\OperatorTok{==}\DecValTok{1}\NormalTok{][}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{] }\CommentTok{\# create a new dataframe containing only income values for men}
\NormalTok{women}\OperatorTok{=}\NormalTok{df[df[}\StringTok{\textquotesingle{}sex\textquotesingle{}}\NormalTok{]}\OperatorTok{==}\DecValTok{2}\NormalTok{][}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{] }\CommentTok{\# create a new dataframe containing only income values for women}

\NormalTok{plot\_sample\_means(men, xlab}\OperatorTok{=}\StringTok{\textquotesingle{}Income, ($, 000)\textquotesingle{}}\NormalTok{, sample\_size}\OperatorTok{=}\DecValTok{500}\NormalTok{)}
\NormalTok{plot\_sample\_means(women, xlab}\OperatorTok{=}\StringTok{\textquotesingle{}Income, ($, 000)\textquotesingle{}}\NormalTok{, sample\_size}\OperatorTok{=}\DecValTok{500}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W05. Distributions and Basic Statistics_files/figure-pdf/cell-26-output-1.png}

}

\end{figure}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W05. Distributions and Basic Statistics_files/figure-pdf/cell-26-output-2.png}

}

\end{figure}

\hypertarget{exercise-16}{%
\subsection{Exercise}\label{exercise-16}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  These two histograms may look the same, but look closely at the
  values.
\item
  The population mean income for women is around \$39.6k. Based on the
  histogram of sample means taken from only men,

  \begin{itemize}
  \tightlist
  \item
    What is the likelihood of observing a sample mean of \$39.6k among
    men due to random chance?
  \end{itemize}
\item
  Interpret this finding qualitatively.
\end{enumerate}

The plots of incomes for men and women show very different distributions,
even though they look quite similar at first glance. To make this more readable, let's define
one last function that can take two or more groups and plot the
distribution of their sample means on the same plot:

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{def}\NormalTok{ two\_hist(groups,group\_labs,xlab, title): }\CommentTok{\# define a function called two\_hist that takes a list of groups, a list of group labels, an x{-}axis label, and a title as arguments}

\NormalTok{        plt.figure(figsize}\OperatorTok{=}\NormalTok{(}\DecValTok{15}\NormalTok{,}\DecValTok{6}\NormalTok{)) }\CommentTok{\# set the figure size}

\NormalTok{        it}\OperatorTok{={-}}\DecValTok{1} \CommentTok{\# create a counter variable called "it" and set it equal to {-}1}
        \ControlFlowTok{for}\NormalTok{ var }\KeywordTok{in}\NormalTok{ groups: }\CommentTok{\# loop through each group in the list of groups}
\NormalTok{            it}\OperatorTok{+=}\DecValTok{1} \CommentTok{\# increase the iterator by 1}
\NormalTok{            sample\_size}\OperatorTok{=}\DecValTok{1000} \CommentTok{\# set the sample size equal to 1000}
\NormalTok{            sample\_means}\OperatorTok{=}\NormalTok{[] }\CommentTok{\# create an empty list to store sample means}
\NormalTok{            iterations}\OperatorTok{=}\DecValTok{10000} \CommentTok{\# set the number of iterations equal to 10,000}

            \ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(}\DecValTok{0}\NormalTok{,iterations): }\CommentTok{\# loop through the number of iterations}
\NormalTok{                sample}\OperatorTok{=}\NormalTok{var.sample(sample\_size, replace}\OperatorTok{=}\VariableTok{True}\NormalTok{) }\CommentTok{\# draw a sample of the size specified by the "sample\_size" parameter}
\NormalTok{                sample\_mean}\OperatorTok{=}\NormalTok{sample.mean() }\CommentTok{\# calculate the mean of the sample}
\NormalTok{                sample\_means.append(sample\_mean) }\CommentTok{\# append the sample mean to the list of sample means}
            
\NormalTok{            plt.hist(sample\_means, bins}\OperatorTok{=}\BuiltInTok{int}\NormalTok{(iterations}\OperatorTok{/}\DecValTok{300}\NormalTok{),edgecolor}\OperatorTok{=}\StringTok{\textquotesingle{}white\textquotesingle{}}\NormalTok{,density}\OperatorTok{=}\VariableTok{True}\NormalTok{, label}\OperatorTok{=}\NormalTok{group\_labs[it])  }\CommentTok{\# plot a histogram of the sample means}
\NormalTok{            mu, se }\OperatorTok{=}\NormalTok{ norm.fit(sample\_means) }\CommentTok{\# fit a normal distribution to the data}
\NormalTok{            xmin, xmax }\OperatorTok{=}\NormalTok{ plt.xlim() }\CommentTok{\# set the x{-}axis limits}
\NormalTok{            x }\OperatorTok{=}\NormalTok{ np.linspace(xmin, xmax, }\DecValTok{100}\NormalTok{) }\CommentTok{\# create a range of values from the minimum to the maximum x{-}axis value}
\NormalTok{            p }\OperatorTok{=}\NormalTok{ norm.pdf(x, mu, se) }\CommentTok{\# calculate the probability density function for the normal distribution}

\NormalTok{            mean}\OperatorTok{=}\NormalTok{var.mean() }\CommentTok{\# calculate the mean of the variable}
\NormalTok{            lower\_ci}\OperatorTok{=}\NormalTok{mu}\OperatorTok{{-}}\NormalTok{se}\OperatorTok{*}\FloatTok{1.96} \CommentTok{\# calculate the lower bound of the 95\% confidence interval}
\NormalTok{            upper\_ci}\OperatorTok{=}\NormalTok{mu}\OperatorTok{+}\NormalTok{se}\OperatorTok{*}\FloatTok{1.96} \CommentTok{\# calculate the upper bound of the 95\% confidence interval}

\NormalTok{            plt.plot(x, p, }\StringTok{\textquotesingle{}k\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\DecValTok{2}\NormalTok{) }\CommentTok{\# plot the normal distribution}
\NormalTok{            plt.xlabel(xlab) }\CommentTok{\# add an x{-}axis label}
\NormalTok{            plt.title(title) }\CommentTok{\# add a title}
\NormalTok{            plt.axvline(mean, color}\OperatorTok{=}\StringTok{\textquotesingle{}green\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}solid\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\DecValTok{3}\NormalTok{) }\CommentTok{\# add a vertical line at the mean of the variable}
\NormalTok{            plt.axvline(lower\_ci, color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}dashed\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\FloatTok{1.5}\NormalTok{) }\CommentTok{\# add a vertical line at the mean minus 2 standard deviations}
\NormalTok{            plt.axvline(upper\_ci, color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}dashed\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\FloatTok{1.5}\NormalTok{) }\CommentTok{\# add a vertical line at the mean plus 2 standard deviations}
\NormalTok{            plt.legend() }\CommentTok{\# add a legend}
            
\NormalTok{        plt.show()  }\CommentTok{\# show the plot }
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{two\_hist([men,women],[}\StringTok{\textquotesingle{}Men\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}Women\textquotesingle{}}\NormalTok{],}\StringTok{\textquotesingle{}Income ($, thousands)\textquotesingle{}}\NormalTok{, }\StringTok{"Income Sample Means"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W05. Distributions and Basic Statistics_files/figure-pdf/cell-28-output-1.png}

}

\end{figure}

There we have it: A stark, quantitative representation of the gender
wage gap.

If we simply compared the means between a sample of women and a sample
of men, the best we could do in terms of inference would be to say
something like ``the average income for men was \$14.2k higher than it
was for women.'' Though this is an important finding, the central limit
theorem lets us add important context.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  We took 10,000 large samples of men and calculated the means of each
  sample.
\item
  We found that over 99.7\% of them were greater than \$50,000.
\item
  We took 10,000 large samples of women and calculated the means of each
  sample.
\item
  We found that over 99.7\% of them were smaller than \$44,000.
\end{enumerate}

The fact that the distributions of income sample means for men and
women do not overlap at all tells us that the probability of this
variation in incomes being due to random chance is extremely, extremely
small. Thus, we can say that the observed difference in income between
men and women is \textbf{statistically significant}.

\bookmarksetup{startatroot}

\hypertarget{assessed-question-4}{%
\chapter{Assessed Question}\label{assessed-question-4}}

Given that we've used functions to create these plots, we can make plots
with different data relatively easily. As a reminder, here are the
values for the sex, race, and schooling variables in our dataframe:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \emph{year}: Survey year
\item
  \emph{age}: the person's age
\item
  \emph{sex}: the person's sex

  \begin{itemize}
  \tightlist
  \item
    1=male
  \item
    2=female
  \end{itemize}
\item
  \emph{race}: the person's race

  \begin{itemize}
  \tightlist
  \item
    White non hispanic=1
  \item
    Black non hispanic=2
  \item
    Hispanic=3
  \item
    Other non hispanic=4
  \end{itemize}
\item
  \emph{sch}: Educational attainment

  \begin{itemize}
  \tightlist
  \item
    None = 0,
  \item
    Grades 1-12 = 1-12
  \item
    Some University = 13,
  \item
    Associate's degree = 14,
  \item
    BA = 16
  \item
    Advanced Degree = 18
  \end{itemize}
\item
  \emph{union}: Union membership

  \begin{itemize}
  \tightlist
  \item
    N/A = 0,
  \item
    No union coverage = 1,
  \item
    Member of labor union=2,
  \item
    Covered by union but not a member=3
  \end{itemize}
\item
  \emph{incwage}: Wage and salary income
\item
  \emph{realhrwage}: Real Hourly Wage
\item
  \emph{occupation}: Occupation
\item
  \emph{ind}:
  \href{https://www.census.gov/naics/?58967?yearbck=2002}{industry code}
\end{enumerate}

Intersectionality is an important consideration when thinking about
inequality.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Create a new dataframe which restricts the census data to just white
  men who have completed high school (and only high school).
\item
  Do the same for Hispanic women with a Bachelor's degree (and only a
  Bachelor's).
\item
  Use the \texttt{two\_hist()} function above to plot these two income
  groups against each other.
\end{enumerate}

\emph{Part A:}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{3}
\tightlist
\item
  Modify the function to ascertain the absolute difference in
  \emph{population means} (i.e., the mean of sample means) between these
  two groups. Round the result to an integer.
\end{enumerate}

\emph{Part B:}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{4}
\tightlist
\item
  Is this difference likely to occur due to random chance? Answer using
  a 95\% confidence interval.
\end{enumerate}

\bookmarksetup{startatroot}

\hypertarget{merging-and-joining}{%
\chapter{Merging and Joining}\label{merging-and-joining}}

\hypertarget{reading-week-open-in-colab}{%
\section[\emph{Reading Week} ]{\texorpdfstring{\emph{Reading Week}
\href{https://colab.research.google.com/github/oballinger/QM2/blob/main/notebooks/RW.\%20Merging\%20and\%20Joining.ipynb}{\protect\includegraphics{index_files/mediabag/colab-badge.png}}}{Reading Week Open In Colab}}\label{reading-week-open-in-colab}}

Sometimes, we will want to combine data from different sources about the
same subject---perhaps we want to compare the GDP in a country with life
expectancy, or the proportion of free school meals with the level of
unemployment.

\hypertarget{aims-3}{%
\subsection{Aims}\label{aims-3}}

\begin{itemize}
\tightlist
\item
  Understand joins
\item
  Work with joining dataframes in Pandas
\item
  Create your own examples
\end{itemize}

\hypertarget{downloading-the-data-2}{%
\section{Downloading the Data}\label{downloading-the-data-2}}

Let's grab the data we will need this week from our course website and
save it into our data folder. If you've not already created a data
folder then do so using the following command.

Don't worry if it generates an error; that just means you've already got
a data folder.

\begin{Shaded}
\begin{Highlighting}[]
\OperatorTok{!}\NormalTok{mkdir data}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
mkdir: cannot create directory ‘data’: File exists
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\OperatorTok{!}\NormalTok{mkdir data}\OperatorTok{/}\NormalTok{wk5}
\OperatorTok{!}\NormalTok{curl https:}\OperatorTok{//}\NormalTok{s3.eu}\OperatorTok{{-}}\NormalTok{west}\OperatorTok{{-}}\FloatTok{2.}\ErrorTok{amazonaws}\NormalTok{.com}\OperatorTok{/}\NormalTok{qm2}\OperatorTok{/}\NormalTok{wk3}\OperatorTok{/}\NormalTok{UN\_Life\_all.csv }\OperatorTok{{-}}\NormalTok{o .}\OperatorTok{/}\NormalTok{data}\OperatorTok{/}\NormalTok{wk5}\OperatorTok{/}\NormalTok{UN\_Life\_all.csv}
\OperatorTok{!}\NormalTok{curl https:}\OperatorTok{//}\NormalTok{s3.eu}\OperatorTok{{-}}\NormalTok{west}\OperatorTok{{-}}\FloatTok{2.}\ErrorTok{amazonaws}\NormalTok{.com}\OperatorTok{/}\NormalTok{qm2}\OperatorTok{/}\NormalTok{wk3}\OperatorTok{/}\NormalTok{UN\_Cities\_1214\_country.csv }\OperatorTok{{-}}\NormalTok{o .}\OperatorTok{/}\NormalTok{data}\OperatorTok{/}\NormalTok{wk5}\OperatorTok{/}\NormalTok{UN\_Cities\_1214\_country.csv}
\OperatorTok{!}\NormalTok{curl https:}\OperatorTok{//}\NormalTok{s3.eu}\OperatorTok{{-}}\NormalTok{west}\OperatorTok{{-}}\FloatTok{2.}\ErrorTok{amazonaws}\NormalTok{.com}\OperatorTok{/}\NormalTok{qm2}\OperatorTok{/}\NormalTok{wk3}\OperatorTok{/}\NormalTok{UN\_Cities\_1214\_population.csv }\OperatorTok{{-}}\NormalTok{o .}\OperatorTok{/}\NormalTok{data}\OperatorTok{/}\NormalTok{wk5}\OperatorTok{/}\NormalTok{UN\_Cities\_1214\_population.csv}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  354k  100  354k    0     0   411k      0 --:--:-- --:--:-- --:--:--  410k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 31445  100 31445    0     0  63397      0 --:--:-- --:--:-- --:--:-- 63397
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  373k  100  373k    0     0   471k      0 --:--:-- --:--:-- --:--:--  471k
\end{verbatim}

\hypertarget{joining-instructions}{%
\section{Joining Instructions}\label{joining-instructions}}

Joins are the combination of different datasets, and are common in
relational databases as a way of performing queries. There are lots of
examples of why and when we might want to do this, but most start with
two tables of data. We're going to start with some data we've generated.

I'm going to go back and work with fake data for a while, because it's
clean and small and we can see what's going on---when we work with real
data, we have to take great care that the data is clean, the indices
match, and so on.

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd}
\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
\ImportTok{import}\NormalTok{ random}
\OperatorTok{\%}\NormalTok{matplotlib inline}
\end{Highlighting}
\end{Shaded}

Let's create dataframes which represent fictitious values associated
with people. Let's assume our data is anonymised because we're ethical
researchers and don't want information about real people leaking out.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{people1 }\OperatorTok{=}\NormalTok{ pd.DataFrame(}\DecValTok{5}\OperatorTok{+}\NormalTok{np.random.randn(}\DecValTok{5}\NormalTok{, }\DecValTok{5}\NormalTok{))}
\NormalTok{people1.columns }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}units of alcohol drunk\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}cigarettes smoked\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}sleep per night\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}height\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}BMI\textquotesingle{}}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{people1}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llllll@{}}
\toprule\noalign{}
& units of alcohol drunk & cigarettes smoked & sleep per night & height
& BMI \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 4.589208 & 5.052479 & 5.514619 & 4.721543 & 6.076186 \\
1 & 5.091470 & 4.275959 & 6.630442 & 7.084920 & 4.787786 \\
2 & 5.751082 & 4.630197 & 5.286618 & 5.058565 & 3.803777 \\
3 & 6.219418 & 6.131729 & 4.359941 & 5.165182 & 3.270455 \\
4 & 5.930070 & 3.745579 & 4.980880 & 6.670358 & 5.003252 \\
\end{longtable}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{people2 }\OperatorTok{=}\NormalTok{ pd.DataFrame(}\DecValTok{5}\OperatorTok{+}\NormalTok{np.random.randn(}\DecValTok{3}\NormalTok{, }\DecValTok{5}\NormalTok{))}
\NormalTok{people2.columns }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}units of alcohol drunk\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}cigarettes smoked\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}sleep per night\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}height\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}BMI\textquotesingle{}}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{people2}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llllll@{}}
\toprule\noalign{}
& units of alcohol drunk & cigarettes smoked & sleep per night & height
& BMI \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 3.657942 & 5.022931 & 5.657866 & 5.342434 & 4.768451 \\
1 & 5.801720 & 6.528911 & 3.863262 & 3.918306 & 3.233783 \\
2 & 4.937641 & 4.726278 & 4.398084 & 5.610086 & 4.368852 \\
\end{longtable}

\bookmarksetup{startatroot}

\hypertarget{adding-new-observations}{%
\chapter{Adding new observations}\label{adding-new-observations}}

It looks as if we have some data about people (although we've just made
it up), and a set of common measurements. It would be nice to have all
of this in one place, so let's \emph{merge} them into one dataframe.
We'll use the \emph{concat} command, which is short for
\emph{concatenate}, or ``chain together''.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{people3 }\OperatorTok{=}\NormalTok{ pd.concat([people1,people2])}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{people3}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llllll@{}}
\toprule\noalign{}
& units of alcohol drunk & cigarettes smoked & sleep per night & height
& BMI \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 4.656973 & 3.732003 & 5.204398 & 5.592159 & 3.964027 \\
1 & 5.023007 & 3.480838 & 4.677067 & 5.065464 & 4.795884 \\
2 & 7.415662 & 4.302678 & 3.746028 & 5.616205 & 4.797184 \\
3 & 5.102570 & 4.572136 & 3.668020 & 2.840370 & 4.426059 \\
4 & 5.393448 & 4.397537 & 6.849025 & 4.490472 & 5.248013 \\
0 & 3.657942 & 5.022931 & 5.657866 & 5.342434 & 4.768451 \\
1 & 5.801720 & 6.528911 & 3.863262 & 3.918306 & 3.233783 \\
2 & 4.937641 & 4.726278 & 4.398084 & 5.610086 & 4.368852 \\
\end{longtable}

\hypertarget{what-is-the-problem-above}{%
\subsection{What is the problem
above?}\label{what-is-the-problem-above}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{people4 }\OperatorTok{=}\NormalTok{ pd.concat([people1,people2], ignore\_index}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{people4}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llllll@{}}
\toprule\noalign{}
& units of alcohol drunk & cigarettes smoked & sleep per night & height
& BMI \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 4.656973 & 3.732003 & 5.204398 & 5.592159 & 3.964027 \\
1 & 5.023007 & 3.480838 & 4.677067 & 5.065464 & 4.795884 \\
2 & 7.415662 & 4.302678 & 3.746028 & 5.616205 & 4.797184 \\
3 & 5.102570 & 4.572136 & 3.668020 & 2.840370 & 4.426059 \\
4 & 5.393448 & 4.397537 & 6.849025 & 4.490472 & 5.248013 \\
5 & 3.657942 & 5.022931 & 5.657866 & 5.342434 & 4.768451 \\
6 & 5.801720 & 6.528911 & 3.863262 & 3.918306 & 3.233783 \\
7 & 4.937641 & 4.726278 & 4.398084 & 5.610086 & 4.368852 \\
\end{longtable}

\texttt{ignore\_index} is very useful when we want a new DataFrame which
only contains data from other DataFrames but whose original index labels
carry no meaning and can simply be renumbered.

\hypertarget{data-with-a-unique-index-adding-new-observations}{%
\section{Data with a unique index: adding new
observations}\label{data-with-a-unique-index-adding-new-observations}}

Let's now examine data where the elements of study are not anonymous.
Let's consider that we have some city data. If we have city names (or
equivalent) in the index column, simply concatenating them would be
fine, because the names would not repeat in the way the index has above.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df1 }\OperatorTok{=}\NormalTok{ pd.DataFrame(}\DecValTok{5}\OperatorTok{+}\NormalTok{np.random.randn(}\DecValTok{5}\NormalTok{, }\DecValTok{5}\NormalTok{))}
\NormalTok{df1.columns }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}area\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}population\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}mean temperature\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}elevation\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}annual rainfall\textquotesingle{}}\NormalTok{]}
\NormalTok{df1.index }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}London\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Paris\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Beijing\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Medellin\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Port Elizabeth\textquotesingle{}}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df1}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llllll@{}}
\toprule\noalign{}
& area & population & mean temperature & elevation & annual rainfall \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
London & 4.150726 & 6.091615 & 5.638999 & 4.033120 & 5.312239 \\
Paris & 6.406381 & 5.192887 & 5.165797 & 4.642474 & 5.776229 \\
Beijing & 5.300187 & 4.790422 & 5.425208 & 4.857182 & 4.830031 \\
Medellin & 5.248481 & 4.734017 & 4.762919 & 5.325021 & 4.415028 \\
Port Elizabeth & 3.663045 & 5.555412 & 5.418251 & 4.369018 & 5.411102 \\
\end{longtable}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df2 }\OperatorTok{=}\NormalTok{ pd.DataFrame(}\DecValTok{5}\OperatorTok{+}\NormalTok{np.random.randn(}\DecValTok{3}\NormalTok{, }\DecValTok{5}\NormalTok{))}
\NormalTok{df2.columns }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}area\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}population\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}mean temperature\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}elevation\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}annual rainfall\textquotesingle{}}\NormalTok{]}
\NormalTok{df2.index }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}Mumbai\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Sydney\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Boston\textquotesingle{}}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df2}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llllll@{}}
\toprule\noalign{}
& area & population & mean temperature & elevation & annual rainfall \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
Mumbai & 7.023555 & 4.045827 & 4.536805 & 5.383593 & 5.707156 \\
Sydney & 5.444850 & 4.930251 & 3.803988 & 5.578729 & 6.248074 \\
Boston & 3.380747 & 3.468165 & 4.166799 & 4.950791 & 5.094166 \\
\end{longtable}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df3 }\OperatorTok{=}\NormalTok{ pd.concat([df1,df2])}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df3}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llllll@{}}
\toprule\noalign{}
& area & population & mean temperature & elevation & annual rainfall \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
London & 4.150726 & 6.091615 & 5.638999 & 4.033120 & 5.312239 \\
Paris & 6.406381 & 5.192887 & 5.165797 & 4.642474 & 5.776229 \\
Beijing & 5.300187 & 4.790422 & 5.425208 & 4.857182 & 4.830031 \\
Medellin & 5.248481 & 4.734017 & 4.762919 & 5.325021 & 4.415028 \\
Port Elizabeth & 3.663045 & 5.555412 & 5.418251 & 4.369018 & 5.411102 \\
Mumbai & 7.023555 & 4.045827 & 4.536805 & 5.383593 & 5.707156 \\
Sydney & 5.444850 & 4.930251 & 3.803988 & 5.578729 & 6.248074 \\
Boston & 3.380747 & 3.468165 & 4.166799 & 4.950791 & 5.094166 \\
\end{longtable}

\hypertarget{exercise-concat-continued}{%
\section{Exercise: Concat continued}\label{exercise-concat-continued}}

Repeat the above for fictitious values for New York, Tokyo, Manila and
Budapest - concatenate into a new dataframe ``df''.

\hypertarget{combining-on-attributes}{%
\section{Combining on Attributes}\label{combining-on-attributes}}

What if we're looking at the same locations but different attributes?
Consider the same \texttt{df1}:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df1 }\OperatorTok{=}\NormalTok{ pd.DataFrame(}\DecValTok{5}\OperatorTok{+}\NormalTok{np.random.randn(}\DecValTok{5}\NormalTok{, }\DecValTok{5}\NormalTok{))}
\NormalTok{df1.columns }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}area\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}population\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}mean temperature\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}elevation\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}annual rainfall\textquotesingle{}}\NormalTok{]}
\NormalTok{df1.index }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}London\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Paris\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Beijing\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Medellin\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Port Elizabeth\textquotesingle{}}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df1}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llllll@{}}
\toprule\noalign{}
& area & population & mean temperature & elevation & annual rainfall \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
London & 6.216092 & 4.902209 & 3.726599 & 4.628916 & 6.348860 \\
Paris & 6.041971 & 3.477545 & 3.075159 & 2.630728 & 5.945750 \\
Beijing & 4.117056 & 5.939825 & 5.166189 & 6.534852 & 4.581087 \\
Medellin & 4.186988 & 5.007498 & 5.732247 & 5.746915 & 2.452759 \\
Port Elizabeth & 5.755107 & 6.332844 & 5.603563 & 5.072384 & 6.222260 \\
\end{longtable}

Now consider a new dataframe, \texttt{df4}, which details the same
locations but has different information about them:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df4 }\OperatorTok{=}\NormalTok{ pd.DataFrame(}\DecValTok{5}\OperatorTok{+}\NormalTok{np.random.randn(}\DecValTok{5}\NormalTok{, }\DecValTok{3}\NormalTok{))}
\NormalTok{df4.columns }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}Mean House Price\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}median income\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}walkability score\textquotesingle{}}\NormalTok{]}
\NormalTok{df4.index }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}London\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Paris\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Beijing\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Medellin\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Port Elizabeth\textquotesingle{}}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df4}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llll@{}}
\toprule\noalign{}
& Mean House Price & median income & walkability score \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
London & 6.041301 & 4.795007 & 5.916860 \\
Paris & 6.125245 & 3.869070 & 4.279607 \\
Beijing & 4.853104 & 5.725823 & 4.187186 \\
Medellin & 5.482517 & 3.667043 & 3.928093 \\
Port Elizabeth & 5.565643 & 5.884004 & 5.007168 \\
\end{longtable}

We have to join ``on'' the index---meaning that when merging the records,
pandas will match rows by their index labels.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df\_joined }\OperatorTok{=}\NormalTok{ df1.merge(df4, left\_index}\OperatorTok{=}\VariableTok{True}\NormalTok{, right\_index}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df\_joined}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}lllllllll@{}}
\toprule\noalign{}
& area & population & mean temperature & elevation & annual rainfall &
Mean House Price & median income & walkability score \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
London & 6.216092 & 4.902209 & 3.726599 & 4.628916 & 6.348860 & 6.041301
& 4.795007 & 5.916860 \\
Paris & 6.041971 & 3.477545 & 3.075159 & 2.630728 & 5.945750 & 6.125245
& 3.869070 & 4.279607 \\
Beijing & 4.117056 & 5.939825 & 5.166189 & 6.534852 & 4.581087 &
4.853104 & 5.725823 & 4.187186 \\
Medellin & 4.186988 & 5.007498 & 5.732247 & 5.746915 & 2.452759 &
5.482517 & 3.667043 & 3.928093 \\
Port Elizabeth & 5.755107 & 6.332844 & 5.603563 & 5.072384 & 6.222260 &
5.565643 & 5.884004 & 5.007168 \\
\end{longtable}

Note that this joins on the \emph{index}, not the row number---so if the
order of elements in \texttt{df4} is different, it should still work.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df4 }\OperatorTok{=}\NormalTok{ pd.DataFrame(np.random.randn(}\DecValTok{5}\NormalTok{, }\DecValTok{3}\NormalTok{))}
\NormalTok{df4.columns }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}Mean House Price\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}median income\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}walkability score\textquotesingle{}}\NormalTok{]}
\NormalTok{df4.index }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}Paris\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}Port Elizabeth\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Beijing\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Medellin\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}London\textquotesingle{}}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df1}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llllll@{}}
\toprule\noalign{}
& area & population & mean temperature & elevation & annual rainfall \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
London & 6.216092 & 4.902209 & 3.726599 & 4.628916 & 6.348860 \\
Paris & 6.041971 & 3.477545 & 3.075159 & 2.630728 & 5.945750 \\
Beijing & 4.117056 & 5.939825 & 5.166189 & 6.534852 & 4.581087 \\
Medellin & 4.186988 & 5.007498 & 5.732247 & 5.746915 & 2.452759 \\
Port Elizabeth & 5.755107 & 6.332844 & 5.603563 & 5.072384 & 6.222260 \\
\end{longtable}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df4}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llll@{}}
\toprule\noalign{}
& Mean House Price & median income & walkability score \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
Paris & 0.425225 & -0.446028 & -0.381586 \\
Port Elizabeth & -0.918616 & 1.274748 & 0.355480 \\
Beijing & 0.918480 & 1.060849 & -1.040598 \\
Medellin & 1.414231 & 0.914922 & -0.393816 \\
London & -1.036150 & -0.902475 & -0.417904 \\
\end{longtable}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df\_joined }\OperatorTok{=}\NormalTok{ df1.merge(df4, left\_index}\OperatorTok{=}\VariableTok{True}\NormalTok{, right\_index}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df\_joined}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}lllllllll@{}}
\toprule\noalign{}
& area & population & mean temperature & elevation & annual rainfall &
Mean House Price & median income & walkability score \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
London & 6.216092 & 4.902209 & 3.726599 & 4.628916 & 6.348860 &
-1.036150 & -0.902475 & -0.417904 \\
Paris & 6.041971 & 3.477545 & 3.075159 & 2.630728 & 5.945750 & 0.425225
& -0.446028 & -0.381586 \\
Beijing & 4.117056 & 5.939825 & 5.166189 & 6.534852 & 4.581087 &
0.918480 & 1.060849 & -1.040598 \\
Medellin & 4.186988 & 5.007498 & 5.732247 & 5.746915 & 2.452759 &
1.414231 & 0.914922 & -0.393816 \\
Port Elizabeth & 5.755107 & 6.332844 & 5.603563 & 5.072384 & 6.222260 &
-0.918616 & 1.274748 & 0.355480 \\
\end{longtable}

\hypertarget{merge-records}{%
\section{Merge Records}\label{merge-records}}

Consider now a case where we have data for some but not all cities; so
df1 still has data for these 5 cities:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df1}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llllll@{}}
\toprule\noalign{}
& area & population & mean temperature & elevation & annual rainfall \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
London & 4.898594 & 6.625739 & 3.587877 & 6.063331 & 4.342769 \\
Paris & 6.032702 & 3.479265 & 2.383832 & 5.251509 & 5.158178 \\
Beijing & 4.368419 & 4.993774 & 2.942992 & 3.761624 & 6.002863 \\
Medellin & 7.437921 & 5.228150 & 3.902431 & 4.437361 & 5.563400 \\
Port Elizabeth & 7.053265 & 5.936734 & 5.842155 & 6.042136 & 7.057592 \\
\end{longtable}

But our new table, df5, contains data for three cities:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df5 }\OperatorTok{=}\NormalTok{ pd.DataFrame(}\DecValTok{5}\OperatorTok{+}\NormalTok{np.random.randn(}\DecValTok{3}\NormalTok{, }\DecValTok{3}\NormalTok{))}
\NormalTok{df5.columns }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}Mean House Price\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}median income\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}walkability score\textquotesingle{}}\NormalTok{]}
\NormalTok{df5.index }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}London\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Paris\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Glasgow\textquotesingle{}}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df5}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llll@{}}
\toprule\noalign{}
& Mean House Price & median income & walkability score \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
London & 4.848734 & 6.598818 & 5.442444 \\
Paris & 5.294294 & 4.282418 & 5.741057 \\
Glasgow & 5.375804 & 4.697775 & 4.393675 \\
\end{longtable}

\hypertarget{exercise-17}{%
\section{Exercise:}\label{exercise-17}}

How many cities appear in:

\begin{itemize}
\tightlist
\item
  both dataframes
\item
  only df1
\item
  only df5
\item
  neither df1 nor df5?
\end{itemize}

\hypertarget{way-back-venn}{%
\section{Way Back Venn}\label{way-back-venn}}

What is the mechanism for joining data where these mismatches exist?
Well, there are several, starting with the\ldots{}

\hypertarget{inner-join}{%
\section{Inner Join:}\label{inner-join}}

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{from}\NormalTok{ IPython.display }\ImportTok{import}\NormalTok{ Image}

\NormalTok{data\_path }\OperatorTok{=} \StringTok{"https://s3.eu{-}west{-}2.amazonaws.com/qm2/wk3/inner.png"}
\NormalTok{Image(data\_path)}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/RW. Merging and Joining_files/figure-pdf/cell-33-output-1.png}

}

\end{figure}

(Image from
\url{http://blog.codinghorror.com/a-visual-explanation-of-sql-joins/})

The inner join \emph{only} includes data whose index appears in both
tables. Let's see what that looks like:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df\_joined }\OperatorTok{=}\NormalTok{ df1.merge(df5, left\_index}\OperatorTok{=}\VariableTok{True}\NormalTok{, right\_index}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df\_joined}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}lllllllll@{}}
\toprule\noalign{}
& area & population & mean temperature & elevation & annual rainfall &
Mean House Price & median income & walkability score \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
London & 4.898594 & 6.625739 & 3.587877 & 6.063331 & 4.342769 & 4.848734
& 6.598818 & 5.442444 \\
Paris & 6.032702 & 3.479265 & 2.383832 & 5.251509 & 5.158178 & 5.294294
& 4.282418 & 5.741057 \\
\end{longtable}

Here, we have a couple of arguments specifying the manner of the join -
we have specified that we are joining on the index of the left and right
dataset with the optional ``left\_index=True'' and
``right\_index=True''. Less obviously, the \textbf{left} dataset is df1
(because we're using \emph{df1.merge()}) and the \textbf{right} dataset
is df5 (because it appears as an argument in merge()). There's no special
reason it shouldn't be the other way around, but for this function, it
is this way around and we need to remember that when we use it.

\hypertarget{inner-space}{%
\section{Inner Space}\label{inner-space}}

Although we haven't specified it, the merge() function has defaulted to
an inner join (like the diagram above). We can specify how the join is
calculated by changing the text in the optional argument ``how'':

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df\_joined }\OperatorTok{=}\NormalTok{ df1.merge(df5, left\_index}\OperatorTok{=}\VariableTok{True}\NormalTok{, right\_index}\OperatorTok{=}\VariableTok{True}\NormalTok{, how}\OperatorTok{=}\StringTok{\textquotesingle{}inner\textquotesingle{}}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df\_joined}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}lllllllll@{}}
\toprule\noalign{}
& area & population & mean temperature & elevation & annual rainfall &
Mean House Price & median income & walkability score \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
London & 4.898594 & 6.625739 & 3.587877 & 6.063331 & 4.342769 & 4.848734
& 6.598818 & 5.442444 \\
Paris & 6.032702 & 3.479265 & 2.383832 & 5.251509 & 5.158178 & 5.294294
& 4.282418 & 5.741057 \\
\end{longtable}

\hypertarget{the-future-of-the-left}{%
\section{The Future of The Left}\label{the-future-of-the-left}}

The \emph{left} join includes \textbf{all} rows where the index appears
on the \textbf{left} hand side of the join, and \textbf{any} data which
\textbf{matches} it on the \textbf{right} hand side. If the index
appears on the left but not the right, it will include the data from the
left table, and have blanks for the columns on the right.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{data\_path }\OperatorTok{=} \StringTok{"https://s3.eu{-}west{-}2.amazonaws.com/qm2/wk3/left.png"}
\NormalTok{Image(data\_path)}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/RW. Merging and Joining_files/figure-pdf/cell-38-output-1.png}

}

\end{figure}

What does \emph{this} look like? We will use the \emph{how=`left'}
optional argument to create a left join:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df\_joined }\OperatorTok{=}\NormalTok{ df1.merge(df5, left\_index}\OperatorTok{=}\VariableTok{True}\NormalTok{, right\_index}\OperatorTok{=}\VariableTok{True}\NormalTok{, how}\OperatorTok{=}\StringTok{\textquotesingle{}left\textquotesingle{}}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df\_joined}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}lllllllll@{}}
\toprule\noalign{}
& area & population & mean temperature & elevation & annual rainfall &
Mean House Price & median income & walkability score \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
London & 4.898594 & 6.625739 & 3.587877 & 6.063331 & 4.342769 & 4.848734
& 6.598818 & 5.442444 \\
Paris & 6.032702 & 3.479265 & 2.383832 & 5.251509 & 5.158178 & 5.294294
& 4.282418 & 5.741057 \\
Beijing & 4.368419 & 4.993774 & 2.942992 & 3.761624 & 6.002863 & NaN &
NaN & NaN \\
Medellin & 7.437921 & 5.228150 & 3.902431 & 4.437361 & 5.563400 & NaN &
NaN & NaN \\
Port Elizabeth & 7.053265 & 5.936734 & 5.842155 & 6.042136 & 7.057592 &
NaN & NaN & NaN \\
\end{longtable}

As we see, the missing data appears as \textbf{NaN} - Not a Number.

\hypertarget{exercise-18}{%
\section{Exercise:}\label{exercise-18}}

Carry out \emph{right} and \emph{outer} joins on the dataframes df1 and
df5 and explain how they're filtering and joining the data.

\hypertarget{i-am-the-one-and-only}{%
\section{I Am The One and Only}\label{i-am-the-one-and-only}}

So far, we've carried out joins on data which have a \emph{one-to-one}
relationship; data for cities or people. What if our data has a
\emph{one-to-many} correspondence?

\emph{Example:} We want to look at the quality of life in cities (a real
student project from 2014). We have a dataset listing city-level
characteristics for a number of cities in Europe, including the country
each city is in. We also have a dataset listing the GDP, life expectancy
and other indicators for a number of \emph{countries} in Europe. How do
we create a dataframe which, for each city, lists all of the
characteristics of a city and those of its parent country?

We'll be working now with data from the UN, covering information about
cities - real data this time. The UN has some great data, we've taken
some from here and processed it in various ways:

\url{http://data.un.org/Data.aspx?d=POP\&f=tableCode\%3A240}

Let's load up data on city population - this set contains data for
2012-2014 inclusive:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{data\_path }\OperatorTok{=} \StringTok{"data/wk5/UN\_Cities\_1214\_population.csv"}

\NormalTok{city\_pop }\OperatorTok{=}\NormalTok{ pd.read\_csv(data\_path, encoding}\OperatorTok{=}\StringTok{\textquotesingle{}latin1\textquotesingle{}}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{city\_pop.head()}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}lllllllllll@{}}
\toprule\noalign{}
& Year & Area & Sex & City & City type & Record Type & Reliability &
Source Year & Value & Value Footnotes \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 2013 & Total & Both Sexes & MARIEHAMN & City proper & Estimate - de
jure & Final figure, complete & 2014 & 11370.0 & NaN \\
1 & 2013 & Total & Male & MARIEHAMN & City proper & Estimate - de jure &
Final figure, complete & 2014 & 5445.0 & NaN \\
2 & 2013 & Total & Female & MARIEHAMN & City proper & Estimate - de jure
& Final figure, complete & 2014 & 5925.0 & NaN \\
3 & 2012 & Total & Both Sexes & MARIEHAMN & City proper & Estimate - de
jure & Final figure, complete & 2013 & 11304.5 & NaN \\
4 & 2012 & Total & Male & MARIEHAMN & City proper & Estimate - de jure &
Final figure, complete & 2013 & 5408.0 & NaN \\
\end{longtable}

\hypertarget{exercise-19}{%
\section{Exercise}\label{exercise-19}}

There is another datafile we downloaded called
\emph{UN\_Cities\_1214\_country.csv}. This is saved to
\texttt{data/wk5/UN\_Cities\_1214\_country.csv}. Load this into a
dataframe called \emph{city\_c} with the city name as the index and view
it; then use \emph{merge} on city name with city\_pop to create a new
dataframe called \emph{cities}. You'll probably get some errors. Google
the error messages, or ask ChatGPT/Gemini to help you understand them.

\textbf{Hints:} You'll notice that the index \textbf{won't} be the
column you want to merge on in the city\_pop data. What column
\emph{should} you merge on in city\_pop? Which column should you merge
on in city\_c?

The syntax for merging on a \textbf{column} (which is not the index) is
to pass the column name to the optional `left\_on=' or `right\_on='
arguments. And we don't use right\_index=True (or left\_index=True),
depending on which we're using.

So for example: \textbf{df1.merge(df2, left\_on=`Name',
right\_index=True)} would join df1 (on the left) to df2 (on the right),
using the column `Name' on the left (df1) and the index column (whatever
that is) on the right (df2).

\hypertarget{a-footnote-about-footnotes}{%
\section{A footnote about footnotes}\label{a-footnote-about-footnotes}}

Just a quick note - if you look at the primary UN data, you'll see
footnotes which will confuse the hell out of Pandas. I've taken the
footnotes out, but you can use .tail() to see whether there's any junk
in the trunk, and remove it via a text editor.

\hypertarget{clean-data}{%
\section{Clean data}\label{clean-data}}

We need to simplify this data a bit in the following ways:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  I'm going to focus on one year (2012)
\item
  I'm going to just look at ``Both Sexes'' (not focus on one gender)
\item
  I'm going to get rid of a column of data (the `Value Footnotes'
  column) using the \emph{drop()} method.
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{cities }\OperatorTok{=}\NormalTok{ cities[cities[}\StringTok{\textquotesingle{}Sex\textquotesingle{}}\NormalTok{]}\OperatorTok{==}\StringTok{\textquotesingle{}Both Sexes\textquotesingle{}}\NormalTok{]}
\NormalTok{cities }\OperatorTok{=}\NormalTok{ cities[cities[}\StringTok{\textquotesingle{}Year\textquotesingle{}}\NormalTok{]}\OperatorTok{==}\DecValTok{2012}\NormalTok{]}
\NormalTok{cities.drop(}\StringTok{\textquotesingle{}Value Footnotes\textquotesingle{}}\NormalTok{, axis}\OperatorTok{=}\DecValTok{1}\NormalTok{, inplace}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
NameError: ignored
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{cities.head()}
\end{Highlighting}
\end{Shaded}

\hypertarget{extension-in-my-place}{%
\section{Extension: In My Place}\label{extension-in-my-place}}

The command I used to get rid of that column is \emph{cities.drop(`Value
Footnotes', axis=1, inplace=True)}. The syntax is not so complex - the
first argument, \emph{`Value Footnotes'}, is just the name of the
column; the second argument, \emph{axis=1}, tells Pandas to look for a
column to remove (instead of a row which has \emph{axis=0}); the third
and final argument, \emph{inplace=True}, is a command that tells Pandas
to edit \emph{inplace}, i.e.~to edit the dataframe (\emph{cities})
directly. When \emph{inplace} is False (the default), this command does
not directly edit cities, but instead provides an output. So the syntax
for that would be

\texttt{new\_cities = cities.drop(\textquotesingle{}Value Footnotes\textquotesingle{}, axis=1)}

and new\_cities would be a version of \emph{cities} without the
offending column. This is usually the safer option.

\hypertarget{life-oh-life}{%
\section{Life, Oh Life}\label{life-oh-life}}

The UN also has useful data by country, so let's try and work with some
of that and join it up with our city data. Let's work with Life
Expectancy Data:

\url{http://data.un.org/Data.aspx?d=WDI\&f=Indicator_Code\%3ASP.DYN.LE00.IN}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{life }\OperatorTok{=}\NormalTok{ pd.read\_csv(}\StringTok{\textquotesingle{}https://s3.eu{-}west{-}2.amazonaws.com/qm2/wk3/UN\_Life\_all.csv\textquotesingle{}}\NormalTok{, index\_col}\OperatorTok{=}\DecValTok{0}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{life.head()}
\end{Highlighting}
\end{Shaded}

\hypertarget{exercise-20}{%
\section{Exercise:}\label{exercise-20}}

In a new cell, clean up the above dataframe by

\begin{itemize}
\tightlist
\item
  removing the ``Value Footnotes'' column
\item
  keeping only the most recent data (2012)
\end{itemize}

Let's make it a little clearer what ``Value'' refers to, by renaming the
column. This is one way to do that:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{life.rename(columns}\OperatorTok{=}\NormalTok{\{}\StringTok{\textquotesingle{}Value\textquotesingle{}}\NormalTok{:}\StringTok{\textquotesingle{}Life Expectancy\textquotesingle{}}\NormalTok{\}, inplace}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{life.head()}
\end{Highlighting}
\end{Shaded}

Now, merge this data with the cities data to show life expectancy for
each city (based on the country it is in), and show the first 5 rows.

Plot population against life expectancy. Use plot's \emph{optional
arguments} to specify the x column, y column, and that kind=`scatter'.

Question: How much data was ``missing'' in the merge?

\bookmarksetup{startatroot}

\hypertarget{hypothesis-testing}{%
\chapter{Hypothesis Testing}\label{hypothesis-testing}}

\hypertarget{workshop-6-open-in-colab}{%
\section[\emph{Workshop 6} ]{\texorpdfstring{\emph{Workshop 6}
\href{https://colab.research.google.com/github/oballinger/QM2/blob/main/notebooks/W06.\%20Hypothesis\%20Testing.ipynb}{\protect\includegraphics{index_files/mediabag/colab-badge.png}}}{Workshop 6 Open In Colab}}\label{workshop-6-open-in-colab}}

For the rest of this course, we'll be working with data from the U.S.
Census \href{https://www.census.gov/programs-surveys/cps.html}{Current
Population Survey (CPS)}.

\hypertarget{aims-4}{%
\subsection{Aims:}\label{aims-4}}

\begin{itemize}
\tightlist
\item
  Understanding:

  \begin{itemize}
  \tightlist
  \item
    Confidence Intervals
  \item
    Hypothesis Testing

    \begin{enumerate}
    \def\labelenumi{\arabic{enumi}.}
    \tightlist
    \item
      Stating the Null and Alternative Hypotheses
    \item
      Selecting a Critical Value
    \item
      Calculating the Test Statistic
    \item
      Making a Decision
    \end{enumerate}
  \end{itemize}
\end{itemize}

\hypertarget{getting-started-2}{%
\section{Getting Started}\label{getting-started-2}}

We will be following on from the analysis we conducted in Workshop 5
(Distributions and Basic Statistics). We visually explored differences
in the income levels between different groups of people in the US
census. Now, we are going to conduct hypothesis testing to see if those
differences are statistically significant.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#This tells python to draw the graphs "inline" {-} in the notebook}
\OperatorTok{\%}\NormalTok{matplotlib inline  }
\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
\ImportTok{from}\NormalTok{ scipy.stats }\ImportTok{import}\NormalTok{ norm}
\ImportTok{import}\NormalTok{ statistics}
\ImportTok{import}\NormalTok{ seaborn }\ImportTok{as}\NormalTok{ sns}

\ImportTok{import}\NormalTok{ pylab}
\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd}
\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
\CommentTok{\# make the plots (graphs) a little wider by default}
\NormalTok{pylab.rcParams[}\StringTok{\textquotesingle{}figure.figsize\textquotesingle{}}\NormalTok{] }\OperatorTok{=}\NormalTok{ (}\FloatTok{10.}\NormalTok{, }\FloatTok{8.}\NormalTok{)}
\NormalTok{sns.}\BuiltInTok{set}\NormalTok{(font\_scale}\OperatorTok{=}\FloatTok{1.5}\NormalTok{)}
\NormalTok{sns.set\_style(}\StringTok{"white"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df}\OperatorTok{=}\NormalTok{pd.read\_csv(}\StringTok{\textquotesingle{}https://storage.googleapis.com/qm2/wk7/cps.csv\textquotesingle{}}\NormalTok{)}
\NormalTok{df[}\StringTok{\textquotesingle{}race\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{df[}\StringTok{\textquotesingle{}race\textquotesingle{}}\NormalTok{].astype(}\StringTok{\textquotesingle{}category\textquotesingle{}}\NormalTok{)}
\NormalTok{df[}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{df[}\StringTok{\textquotesingle{}incwage\textquotesingle{}}\NormalTok{]}\OperatorTok{/}\DecValTok{1000}
\NormalTok{df}\OperatorTok{=}\NormalTok{df[df[}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{]}\OperatorTok{\textless{}}\DecValTok{200}\NormalTok{]}
\NormalTok{df}\OperatorTok{=}\NormalTok{df[df[}\StringTok{\textquotesingle{}year\textquotesingle{}}\NormalTok{]}\OperatorTok{==}\DecValTok{2013}\NormalTok{] }\CommentTok{\# filter the dataframe to only contain 2013 data}
\NormalTok{df.head()}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}lllllllllllll@{}}
\toprule\noalign{}
& year & state & age & sex & race & sch & ind & union & incwage &
realhrwage & occupation & income \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
20 & 2013 & 50 & 62 & 1 & 1 & 14.0 & 8090 & 1.0 & 57000.0 & 23.889143 &
. & 57.0 \\
32 & 2013 & 39 & 59 & 1 & 1 & 13.0 & 9590 & 0.0 & 62000.0 & 29.726475 &
Consruction, Extraction, Installation & 62.0 \\
34 & 2013 & 44 & 44 & 1 & 3 & 12.0 & 7290 & 0.0 & 45000.0 & 20.745834 &
. & 45.0 \\
36 & 2013 & 12 & 41 & 1 & 1 & 12.0 & 7070 & 1.0 & 28000.0 & 12.293828 &
Managers & 28.0 \\
37 & 2013 & 33 & 35 & 1 & 1 & 12.0 & 770 & 0.0 & 42500.0 & 20.377020 &
Transportation and materials moving & 42.5 \\
\end{longtable}

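This is once again the U.S. census data from Week 5. As a reminder, here
are 10 of the columns in our dataframe (it also contains \emph{state}
and the \emph{income} column we just derived from \emph{incwage}):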

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \emph{year}: Survey year
\item
  \emph{age}: the person's age
\item
  \emph{sex}: the person's sex

  \begin{itemize}
  \tightlist
  \item
    1=male
  \item
    2=female
  \end{itemize}
\item
  \emph{race}: the person's race

  \begin{itemize}
  \tightlist
  \item
    White non hispanic=1
  \item
    Black non hispanic=2
  \item
    Hispanic=3
  \item
    Other non hispanic=4
  \end{itemize}
\item
  \emph{sch}: Educational attainment

  \begin{itemize}
  \tightlist
  \item
    None = 0,
  \item
    Grades 1-12 = 1-12
  \item
    Some University = 13,
  \item
    Associate's degree = 14,
  \item
    BA = 16
  \item
    Advanced Degree = 18
  \end{itemize}
\item
  \emph{union}: Union membership

  \begin{itemize}
  \tightlist
  \item
    N/A = 0,
  \item
    No union coverage = 1,
  \item
    Member of labor union=2,
  \item
    Covered by union but not a member=3
  \end{itemize}
\item
  \emph{incwage}: Wage and salary income
\item
  \emph{realhrwage}: Real Hourly Wage
\item
  \emph{occupation}: Occupation
\item
  \emph{ind}:
  \href{https://www.census.gov/naics/?58967?yearbck=2002}{industry code}
\end{enumerate}

\hypertarget{confidence-intervals}{%
\section{Confidence Intervals}\label{confidence-intervals}}

So far in this workshop, we've had the luxury of being able to draw many
random samples and plot the distributions of their sample means to infer
the population mean. The Central Limit Theorem lets us assume that these
sample means are normally distributed, and consequently that there is a
95.45\% chance that the \textbf{population mean} lies within two standard
errors of the \textbf{sample mean}. This allows us to make inferences on
the basis of \emph{a single sample}. The standard error is computed from
the sample standard deviation and the sample size, as defined below.

\hypertarget{sample-standard-deviation}{%
\subsection{Sample Standard Deviation}\label{sample-standard-deviation}}

\[s = \sqrt{\frac{1}{n-1} \sum_{i=1}^n (x_i - \overline{x})^2}\]

\hypertarget{standard-error}{%
\subsection{Standard Error}\label{standard-error}}

\[\mathit{se} = \frac{s}{\sqrt{n}}\]

Given a large enough sample \(x\), we can build a 95\% confidence
interval as follows:

\[95\%\ \text{CI} = \overline{x} \pm (1.96 \times \mathit{se})\]

Let's draw a sample of 1000 random individuals from our data, and
compute a 95\% confidence interval to estimate the population mean for
income. We'll begin by creating a swarmplot to get a sense of how the
data are distributed.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sample}\OperatorTok{=}\NormalTok{df.sample(}\DecValTok{1000}\NormalTok{) }\CommentTok{\# draw a random sample of 1000 observations from the dataframe}
\NormalTok{sns.swarmplot(data }\OperatorTok{=}\NormalTok{ sample, y}\OperatorTok{=}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{) }\CommentTok{\# plot a swarmplot of income}
\NormalTok{plt.title(}\StringTok{\textquotesingle{}Income Distribution\textquotesingle{}}\NormalTok{) }\CommentTok{\# add a title}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Text(0.5, 1.0, 'Income Distribution')
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W06. Hypothesis Testing_files/figure-pdf/cell-4-output-2.png}

}

\end{figure}

Now let's set about calculating the 95\% confidence interval and
plotting it on our swarmplot. Luckily, the components we need to do this
are easy to calculate. We just need the mean, standard deviation, and
number of observations. All of these are provided by the
\texttt{.describe()} function, which calculates summary statistics for a
sample.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{desc}\OperatorTok{=}\NormalTok{sample[}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{].describe() }\CommentTok{\# calculate descriptive statistics for the sample}
\BuiltInTok{print}\NormalTok{(desc) }\CommentTok{\# print the descriptive statistics}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
count    1000.000000
mean       47.677568
std        33.041244
min         0.700000
25%        25.000000
50%        40.000000
75%        62.000000
max       190.000000
Name: income, dtype: float64
\end{verbatim}

From the set of descriptive statistics, we can pull out the relevant
components, calculate the standard error, and create a 95\% confidence
interval as follows:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{mean}\OperatorTok{=}\NormalTok{desc[}\StringTok{\textquotesingle{}mean\textquotesingle{}}\NormalTok{] }\CommentTok{\# set the mean equal to a variable called "mean"}
\NormalTok{std}\OperatorTok{=}\NormalTok{desc[}\StringTok{\textquotesingle{}std\textquotesingle{}}\NormalTok{] }\CommentTok{\# set the standard deviation equal to a variable called "std"}
\NormalTok{n}\OperatorTok{=}\NormalTok{desc[}\StringTok{\textquotesingle{}count\textquotesingle{}}\NormalTok{] }\CommentTok{\# set the sample size equal to a variable called "n"}
\NormalTok{se}\OperatorTok{=}\NormalTok{std}\OperatorTok{/}\NormalTok{np.sqrt(n) }\CommentTok{\# calculate the standard error of the mean}

\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}The mean is\textquotesingle{}}\NormalTok{, }\BuiltInTok{round}\NormalTok{(mean, }\DecValTok{2}\NormalTok{), }\StringTok{\textquotesingle{}with a standard error of\textquotesingle{}}\NormalTok{, }\BuiltInTok{round}\NormalTok{(se, }\DecValTok{2}\NormalTok{)) }\CommentTok{\# print the mean and standard error}

\NormalTok{upper\_ci }\OperatorTok{=}\NormalTok{ mean}\OperatorTok{+}\NormalTok{se}\OperatorTok{*}\FloatTok{1.96} \CommentTok{\# calculate the upper confidence interval}
\NormalTok{lower\_ci }\OperatorTok{=}\NormalTok{ mean}\OperatorTok{{-}}\NormalTok{se}\OperatorTok{*}\FloatTok{1.96} \CommentTok{\# calculate the lower confidence interval}

\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}The 95}\SpecialCharTok{\% c}\StringTok{onfidence interval is\textquotesingle{}}\NormalTok{, }\BuiltInTok{round}\NormalTok{(lower\_ci, }\DecValTok{2}\NormalTok{), }\StringTok{\textquotesingle{}to\textquotesingle{}}\NormalTok{, }\BuiltInTok{round}\NormalTok{(upper\_ci, }\DecValTok{2}\NormalTok{)) }\CommentTok{\# print the confidence interval}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
The mean is 47.68 with a standard error of 1.04
The 95% confidence interval is 45.63 to 49.73
\end{verbatim}

Finally, let's plot these bounds on our swarmplot to graphically show
this range. We can now claim that based on our sample, there is a 95\%
chance that the true population mean of income (shown in red) lies
within this range.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.swarmplot(data }\OperatorTok{=}\NormalTok{ sample, y}\OperatorTok{=}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{,alpha}\OperatorTok{=}\FloatTok{0.5}\NormalTok{) }\CommentTok{\# plot a swarmplot of income}
\NormalTok{plt.axhline(df[}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{].mean(), color}\OperatorTok{=}\StringTok{\textquotesingle{}red\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}solid\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\DecValTok{3}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}Population Mean\textquotesingle{}}\NormalTok{) }\CommentTok{\# add a horizontal line at the mean}
\NormalTok{plt.axhline(upper\_ci, color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}dashed\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\DecValTok{3}\NormalTok{) }\CommentTok{\# add a dashed black line at the upper confidence interval}
\NormalTok{plt.axhline(lower\_ci, color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}dashed\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\DecValTok{3}\NormalTok{) }\CommentTok{\# add a dashed black line at the lower confidence interval}

\NormalTok{plt.title(}\StringTok{\textquotesingle{}Income Distribution, 95\% Confidence Interval\textquotesingle{}}\NormalTok{) }\CommentTok{\# add a title}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Text(0.5, 1.0, 'Income Distribution, 95% Confidence Interval')
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W06. Hypothesis Testing_files/figure-pdf/cell-7-output-2.png}

}

\end{figure}

\hypertarget{hypothesis-testing-1}{%
\section{Hypothesis Testing}\label{hypothesis-testing-1}}

If we create a boxplot of income disaggregated by sex using our sample,
we can observe that men seem to earn more than women:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.boxplot(data}\OperatorTok{=}\NormalTok{sample , x}\OperatorTok{=}\StringTok{\textquotesingle{}sex\textquotesingle{}}\NormalTok{, y}\OperatorTok{=}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{).set\_xticklabels([}\StringTok{\textquotesingle{}Men\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}Women\textquotesingle{}}\NormalTok{]) }\CommentTok{\# make a box plot of income by sex}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
[Text(0, 0, 'Men'), Text(1, 0, 'Women')]
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W06. Hypothesis Testing_files/figure-pdf/cell-8-output-2.png}

}

\end{figure}

But is this difference statistically significant? It could just be due
to sampling error, random chance. \textbf{Hypothesis testing} provides a
framework through which we can formally evaluate the likelihood of
encountering an effect as extreme (in this case, the difference
between the mean incomes of the two groups) as the one we observe in
our data. There are four main steps in hypothesis testing:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  State the hypotheses. H0 states that there is no difference between
  the two population means.
\item
  Select an \(\alpha\) level (e.g.~\(\alpha = 0.05\), i.e.~95\%
  confidence), and select a corresponding \textbf{critical value} (1.96
  for large samples).
\item
  Compute the test statistic.
\item
  Make a decision; if the test statistic exceeds the critical value, we
  \textbf{reject the null hypothesis}.
\end{enumerate}

Steps 1, 2, and 4 remain fairly constant regardless of what kind of
hypothesis testing you're conducting. Step 3 can vary quite a bit, as
there are many different statistical tests that fall under the umbrella
of hypothesis testing. In today's workshop we'll be using the Student's
T-Test (more on that in a second). For now, let's begin the process of
hypothesis testing.

\hypertarget{state-the-hypotheses}{%
\subsection{1. State the hypotheses}\label{state-the-hypotheses}}

\hypertarget{the-null-hypothesis}{%
\subsubsection{The Null Hypothesis}\label{the-null-hypothesis}}

\begin{itemize}
\tightlist
\item
  \(H_0\) : There is no difference in the mean income between men and
  women
\item
  \(H_0\) : \(\mu_{\text{men}} = \mu_{\text{women}}\)
\end{itemize}

\hypertarget{the-alternative-hypothesis}{%
\subsubsection{The Alternative
Hypothesis}\label{the-alternative-hypothesis}}

\begin{itemize}
\tightlist
\item
  \(H_a\) : There is a difference in the mean income between men and
  women
\item
  \(H_a\) : \(\mu_{\text{men}} \neq \mu_{\text{women}}\)
\end{itemize}

\hypertarget{select-an-alpha-level}{%
\subsection{\texorpdfstring{2. Select an \(\alpha\)
level}{2. Select an \textbackslash alpha level}}\label{select-an-alpha-level}}

Locate the critical region; the critical values for the t statistic are
obtained using degrees of freedom (\(df=n-2\)). Given that we have 1000
observations, \(df=998\). When \(df\) is this large (roughly 1000 or
more), you can simply memorize the following critical values:

\begin{itemize}
\tightlist
\item
  At the 95\% confidence level, the critical value is 1.96
\item
  At the 99\% confidence level, the critical value is 2.58
\end{itemize}

If the absolute value of our test statistic exceeds either of these
values, we can reject the null hypothesis at the corresponding level of
confidence. The function
below creates a plot which provides a visual reference for these values,
but isn't really necessary for the process of hypothesis testing. The
function accepts one argument \texttt{test\_statistic}, which it will
use to plot a vertical red line. If the red line falls within the dotted
lines, we fail to reject the null hypothesis at the corresponding
confidence level. If it's outside of these bounds, we reject the null
hypothesis.

In the last line of code below, I've called the function to plot a test
statistic of -2.3. Would we reject the null hypothesis at the 95\%
confidence level? What about the 99\% level?

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{def}\NormalTok{ plot\_z(test\_statistic):}
\NormalTok{    mu, se}\OperatorTok{=} \DecValTok{0}\NormalTok{, }\DecValTok{1} \CommentTok{\# create two variables, a mean "mu" equal to zero, and standard deviation "se" equal to 1}
\NormalTok{    x }\OperatorTok{=}\NormalTok{ np.linspace(mu }\OperatorTok{{-}} \DecValTok{3}\OperatorTok{*}\NormalTok{se, mu }\OperatorTok{+} \DecValTok{3}\OperatorTok{*}\NormalTok{se, }\DecValTok{100}\NormalTok{) }\CommentTok{\# create a range of values from {-}3 to 3 standard deviations}

\NormalTok{    plt.plot(x, norm.pdf(x, mu, se)) }\CommentTok{\# plot the normal distribution}
\NormalTok{    plt.axvline(mu}\OperatorTok{{-}}\NormalTok{se}\OperatorTok{*}\FloatTok{1.96}\NormalTok{, color}\OperatorTok{=}\StringTok{\textquotesingle{}blue\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}dashed\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\FloatTok{1.5}\NormalTok{,label}\OperatorTok{=}\StringTok{\textquotesingle{}µ ± 1.96σ (95}\SpecialCharTok{\% c}\StringTok{onfidence)\textquotesingle{}}\NormalTok{) }\CommentTok{\# plot a vertical line at the mean plus 2 standard deviations}
\NormalTok{    plt.axvline(mu}\OperatorTok{+}\NormalTok{se}\OperatorTok{*}\FloatTok{1.96}\NormalTok{, color}\OperatorTok{=}\StringTok{\textquotesingle{}blue\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}dashed\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\FloatTok{1.5}\NormalTok{)  }\CommentTok{\# plot a vertical line at the mean minus 2 standard deviations}
\NormalTok{    plt.axvline(mu}\OperatorTok{{-}}\NormalTok{se}\OperatorTok{*}\FloatTok{2.58}\NormalTok{, color}\OperatorTok{=}\StringTok{\textquotesingle{}green\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}dashed\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\FloatTok{1.5}\NormalTok{,label}\OperatorTok{=}\StringTok{\textquotesingle{}µ ± 2.58σ (99}\SpecialCharTok{\% c}\StringTok{onfidence)\textquotesingle{}}\NormalTok{) }\CommentTok{\# plot a vertical line at the mean plus 2 standard deviations}
\NormalTok{    plt.axvline(mu}\OperatorTok{+}\NormalTok{se}\OperatorTok{*}\FloatTok{2.58}\NormalTok{, color}\OperatorTok{=}\StringTok{\textquotesingle{}green\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}dashed\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\FloatTok{1.5}\NormalTok{)  }\CommentTok{\# plot a vertical line at the mean minus 2 standard deviations}
    
\NormalTok{    plt.axvline(test\_statistic, color}\OperatorTok{=}\StringTok{\textquotesingle{}red\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}solid\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\FloatTok{1.5}\NormalTok{,label}\OperatorTok{=}\StringTok{\textquotesingle{}Test Statistic\textquotesingle{}}\NormalTok{) }\CommentTok{\# plot a vertical line at the test statistic}


\NormalTok{    plt.ylim(}\DecValTok{0}\NormalTok{,}\FloatTok{0.4}\NormalTok{)}
\NormalTok{    plt.legend()}
\NormalTok{    plt.title(}\StringTok{\textquotesingle{}Z Distribution\textquotesingle{}}\NormalTok{) }\CommentTok{\# add a title}
\NormalTok{    plt.show()}

\NormalTok{plot\_z(}\OperatorTok{{-}}\FloatTok{2.3}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W06. Hypothesis Testing_files/figure-pdf/cell-9-output-1.png}

}

\end{figure}

\hypertarget{calculate-the-test-statistic-the-students-t-test}{%
\subsection{3. Calculate the Test Statistic (The Student's
T-Test)}\label{calculate-the-test-statistic-the-students-t-test}}

The Student's T-Test is an \emph{independent-measures design} which is
used in situations where a researcher has no prior knowledge about
either of the two populations (or treatments) being compared. In
particular, the population means and standard deviations are all
unknown. Because the population variances are not known, these values
must be estimated from the sample data.

The purpose of a T-test is to determine whether the sample mean
difference indicates a real mean difference between the two populations
or whether the obtained difference is simply the result of sampling
error. Given two groups, \(x_1\) and \(x_2\), the \(t\) statistic is
calculated as:

\[ t = \frac{\overline{x}_1-\overline{x}_2}{\sqrt{\frac{s^2_1}{n_1} + \frac{s^2_2}{n_2}}} \]

Where:

\begin{itemize}
\tightlist
\item
  \(\overline{x}\): Sample Mean
\item
  \(s^2\): Sample Variance (the squared sample standard deviation)
\item
  \(n\): Number of observations
\end{itemize}

We've already seen how to calculate each of these components when we
made the 95\% confidence interval above using the \texttt{.describe()}
function. To calculate the t-statistic, we just have to plug these
values into the formula above and do some basic arithmetic. I've put
together a function that does this below, which accepts two main
arguments, \texttt{group1} and \texttt{group2}. For each group it
calculates descriptive statistics, and uses these values to calculate
the t-statistic. It also has an optional argument \texttt{plot}, which
when set to \texttt{True} will plot a 95\% confidence interval for each
group. It defaults to \texttt{False}, meaning that it won't generate the
plot.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{def}\NormalTok{ manual\_ttest(group1, group2, plot}\OperatorTok{=}\VariableTok{False}\NormalTok{): }\CommentTok{\# define a function called "manual\_ttest" that takes two groups and a boolean value for whether or not to plot the results as arguments}
    
\NormalTok{    desc1, desc2}\OperatorTok{=}\NormalTok{group1.describe(), group2.describe() }\CommentTok{\# get descriptive statistics for both samples}
    
\NormalTok{    n1,std1,mean1 }\OperatorTok{=}\NormalTok{ desc1[}\StringTok{\textquotesingle{}count\textquotesingle{}}\NormalTok{], desc1[}\StringTok{\textquotesingle{}std\textquotesingle{}}\NormalTok{] ,desc1[}\StringTok{\textquotesingle{}mean\textquotesingle{}}\NormalTok{] }\CommentTok{\# get the sample size, standard deviation, and mean of the first sample}
\NormalTok{    n2,std2,mean2 }\OperatorTok{=}\NormalTok{ desc2[}\StringTok{\textquotesingle{}count\textquotesingle{}}\NormalTok{], desc2[}\StringTok{\textquotesingle{}std\textquotesingle{}}\NormalTok{] ,desc2[}\StringTok{\textquotesingle{}mean\textquotesingle{}}\NormalTok{] }\CommentTok{\# get the sample size, standard deviation, and mean of the second sample}
    
    \CommentTok{\# calculate standard errors}
\NormalTok{    se1, se2 }\OperatorTok{=}\NormalTok{ std1}\OperatorTok{**}\DecValTok{2}\OperatorTok{/}\NormalTok{n1, std2}\OperatorTok{**}\DecValTok{2}\OperatorTok{/}\NormalTok{n2 }\CommentTok{\# \textquotesingle{}**2\textquotesingle{} is the same as squaring the number}

    \CommentTok{\# standard error on the difference between the samples}
\NormalTok{    sed }\OperatorTok{=}\NormalTok{ np.sqrt(se1 }\OperatorTok{+}\NormalTok{ se2)}

    \CommentTok{\# calculate the t statistic}
\NormalTok{    t\_stat }\OperatorTok{=}\NormalTok{ (mean1 }\OperatorTok{{-}}\NormalTok{ mean2) }\OperatorTok{/}\NormalTok{ sed}

    \CommentTok{\# print the results}
    \BuiltInTok{print}\NormalTok{(}\StringTok{"Group 1: n=\%.0f, mean=}\SpecialCharTok{\%.3f}\StringTok{, std=}\SpecialCharTok{\%.3f}\StringTok{"} \OperatorTok{\%}\NormalTok{ (n1,mean1,std1)) }
    \BuiltInTok{print}\NormalTok{(}\StringTok{"Group 2: n=\%.0f, mean=}\SpecialCharTok{\%.3f}\StringTok{, std=}\SpecialCharTok{\%.3f}\StringTok{"} \OperatorTok{\%}\NormalTok{ (n2,mean2,std2))}
    \BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}The t{-}statistic is }\SpecialCharTok{\%.3f}\StringTok{\textquotesingle{}} \OperatorTok{\%}\NormalTok{ t\_stat) }\CommentTok{\# print the t{-}statistic}

    \ControlFlowTok{if}\NormalTok{ plot}\OperatorTok{==}\VariableTok{True}\NormalTok{: }\CommentTok{\# if the plot argument is set to True, plot the results}
\NormalTok{        groups}\OperatorTok{=}\NormalTok{pd.DataFrame() }\CommentTok{\# create an empty dataframe}
\NormalTok{        i}\OperatorTok{=}\DecValTok{1} \CommentTok{\# create a counter variable called "i" and set it equal to 1}
        
        \ControlFlowTok{for}\NormalTok{ group }\KeywordTok{in}\NormalTok{ [group1, group2]: }\CommentTok{\# loop through each group in the list of groups}
\NormalTok{            plot\_df}\OperatorTok{=}\NormalTok{pd.DataFrame(\{}\StringTok{\textquotesingle{}Values\textquotesingle{}}\NormalTok{: group,}\StringTok{\textquotesingle{}Group\textquotesingle{}}\NormalTok{:i\}) }\CommentTok{\# create a dataframe with the values of the group and a column called "Group" that contains the group number}
\NormalTok{            groups}\OperatorTok{=}\NormalTok{groups.append(plot\_df) }\CommentTok{\# append the dataframe to the list of dataframes}
\NormalTok{            i}\OperatorTok{+=}\DecValTok{1} \CommentTok{\# increase the counter by 1}
        
\NormalTok{        sns.pointplot(data}\OperatorTok{=}\NormalTok{groups , x}\OperatorTok{=}\StringTok{\textquotesingle{}Group\textquotesingle{}}\NormalTok{, y}\OperatorTok{=}\StringTok{\textquotesingle{}Values\textquotesingle{}}\NormalTok{,errorbar}\OperatorTok{=}\NormalTok{(}\StringTok{\textquotesingle{}ci\textquotesingle{}}\NormalTok{, }\DecValTok{95}\NormalTok{), color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, join}\OperatorTok{=}\VariableTok{False}\NormalTok{, capsize}\OperatorTok{=}\FloatTok{.8}\NormalTok{) }\CommentTok{\# plot the means of the groups with a 95\% confidence interval}
\NormalTok{        plt.title(}\StringTok{\textquotesingle{}Comparison of Group Means with 95\% Confidence Intervals\textquotesingle{}}\NormalTok{) }\CommentTok{\# add a title}
    
    \ControlFlowTok{return}\NormalTok{ t\_stat }\CommentTok{\# return the t{-}statistic}
\end{Highlighting}
\end{Shaded}

Having defined the function, we can now call it to calculate a t-test
for the difference in income between men and women:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{men}\OperatorTok{=}\NormalTok{sample[sample[}\StringTok{\textquotesingle{}sex\textquotesingle{}}\NormalTok{]}\OperatorTok{==}\DecValTok{1}\NormalTok{] }\CommentTok{\# filter the sample to only include men}
\NormalTok{women}\OperatorTok{=}\NormalTok{sample[sample[}\StringTok{\textquotesingle{}sex\textquotesingle{}}\NormalTok{]}\OperatorTok{==}\DecValTok{2}\NormalTok{] }\CommentTok{\# filter the sample to only include women}

\NormalTok{t }\OperatorTok{=}\NormalTok{ manual\_ttest(men[}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{],women[}\StringTok{\textquotesingle{}income\textquotesingle{}}\NormalTok{]) }\CommentTok{\# run the t{-}test function and store the t{-}statistic in a variable called "t"}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Group 1: n=485, mean=57.435, std=36.610
Group 2: n=515, mean=38.489, std=26.179
The t-statistic is 9.364
\end{verbatim}

\hypertarget{make-a-decision}{%
\subsection{4. Make a Decision}\label{make-a-decision}}

If the t statistic indicates that the obtained difference between sample
means (numerator) is substantially greater than the difference expected
by chance (denominator), we reject H0 and conclude that there is a real
mean difference between the two populations or treatments. Let's plot the
T-statistic from our test against the critical values:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{plot\_z(t) }\CommentTok{\# plot the test statistic on the z distribution}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W06. Hypothesis Testing_files/figure-pdf/cell-12-output-1.png}

}

\end{figure}

Based on the plot above, can we reject the null hypothesis that there is
no difference in mean income between men and women?

\hypertarget{exercise-21}{%
\subsection{Exercise}\label{exercise-21}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  From the main dataframe \texttt{df}, draw a sample of 500 white men.
  Using t-tests, investigate whether there are statistically significant
  discrepancies in pay between white men and other groups (note: it
  would be best to sample 500 people in each of those groups as well).
  Between what groups does there exist the most significant pay gap?
\item
  Some of this variation may be due to occupation. Compare income
  disparities between men and women within different occupations. Which
  occupation has the largest pay gap? which has the smallest?
\item
  \href{https://journals.sagepub.com/doi/abs/10.1177/0730888401028004005}{Research
  suggests} that within occupational groups, collective bargaining
  through union membership reduces pay gaps. Read the abstract of this
  article, and try to replicate the analysis using our dataset.
\end{enumerate}

\hypertarget{assessed-question-5}{%
\section{Assessed Question}\label{assessed-question-5}}

When Elon Musk bought Twitter, he promised to restore ``free speech''
to the platform. He heralded this new era with a tweet on 28/10/2022,
which read ``the bird is freed''. A tidal wave of hate speech ensued
instead.

Using Twitter's API, I downloaded tweets containing a racial slur. Using
the groupby function and regex, I counted the number of mentions of this
word per hour on the platform for about a month before the takeover, and
a few days thereafter. I've saved these counts (but not the tweets
themselves) as a csv file called ``elon\_twitter.csv''.

The code below downloads this csv file, and plots the number of
slur-containing tweets over time.

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ datetime}
\ImportTok{from}\NormalTok{ matplotlib.pyplot }\ImportTok{import}\NormalTok{ figure}
\ImportTok{import}\NormalTok{ matplotlib.dates }\ImportTok{as}\NormalTok{ mdates}

\NormalTok{tweets}\OperatorTok{=}\NormalTok{pd.read\_csv(}\StringTok{\textquotesingle{}https://storage.googleapis.com/qm2/wk7/elon\_twitter.csv\textquotesingle{}}\NormalTok{) }\CommentTok{\# read in the data}
\NormalTok{figure(figsize}\OperatorTok{=}\NormalTok{(}\DecValTok{10}\NormalTok{, }\DecValTok{4}\NormalTok{), dpi}\OperatorTok{=}\DecValTok{200}\NormalTok{)}
\NormalTok{tweets[}\StringTok{\textquotesingle{}hour\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{pd.to\_datetime(tweets[}\StringTok{\textquotesingle{}hour\textquotesingle{}}\NormalTok{])}

\NormalTok{tweet}\OperatorTok{=}\NormalTok{datetime.datetime(}\DecValTok{2022}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DecValTok{28}\NormalTok{)}

\NormalTok{pre\_mean}\OperatorTok{=}\NormalTok{tweets[tweets[}\StringTok{\textquotesingle{}hour\textquotesingle{}}\NormalTok{]}\OperatorTok{\textless{}}\NormalTok{tweet][}\StringTok{\textquotesingle{}count\textquotesingle{}}\NormalTok{].mean()}
\NormalTok{post\_mean}\OperatorTok{=}\NormalTok{tweets[tweets[}\StringTok{\textquotesingle{}hour\textquotesingle{}}\NormalTok{]}\OperatorTok{\textgreater{}}\NormalTok{tweet][}\StringTok{\textquotesingle{}count\textquotesingle{}}\NormalTok{].mean()}
\NormalTok{pct\_change}\OperatorTok{=} \BuiltInTok{int}\NormalTok{(((post\_mean}\OperatorTok{{-}}\NormalTok{pre\_mean)}\OperatorTok{/}\NormalTok{pre\_mean)}\OperatorTok{*}\DecValTok{100}\NormalTok{)}

\NormalTok{plt.ylabel(}\StringTok{\textquotesingle{}Hourly Slur Tweets\textquotesingle{}}\NormalTok{)}

\NormalTok{plt.plot\_date(tweets[}\StringTok{\textquotesingle{}hour\textquotesingle{}}\NormalTok{], tweets[}\StringTok{\textquotesingle{}count\textquotesingle{}}\NormalTok{], }\StringTok{\textquotesingle{}b\textquotesingle{}}\NormalTok{, color}\OperatorTok{=}\StringTok{\textquotesingle{}red\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.axvline(tweet, color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}dashed\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}"the bird is freed"\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.legend()}
\NormalTok{plt.title(}\StringTok{\textquotesingle{}Elon Musk tweets "The bird is freed"; Tweets containing racial slurs increase }\SpecialCharTok{\{\}}\StringTok{\%\textquotesingle{}}\NormalTok{.}\BuiltInTok{format}\NormalTok{(pct\_change))}
\NormalTok{plt.gca().xaxis.set\_major\_locator(mdates.DayLocator())}
\NormalTok{plt.gca().xaxis.set\_major\_formatter(mdates.DateFormatter(}\StringTok{\textquotesingle{}}\SpecialCharTok{\%d}\StringTok{/\%m\textquotesingle{}}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
/var/folders/6q/jt4x0r8n1rs0kbrrqrbj61fr0000gn/T/ipykernel_97088/4258694155.py:17: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "b" (-> color=(0.2980392156862745, 0.4470588235294118, 0.6901960784313725, 1)). The keyword argument will take precedence.
  plt.plot_date(tweets['hour'], tweets['count'], 'b', color='red')
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W06. Hypothesis Testing_files/figure-pdf/cell-14-output-2.png}

}

\end{figure}

This plot definitely shows an uptick in the number of tweets containing
a racial slur following Musk's tweet. But is this increase statistically
significant?

\textbf{Question: Using a t-test and the full hypothesis testing
procedure, investigate whether there was a statistically significant
increase in hate speech following Elon Musk's tweet. Make note of the T
statistic and the P value.}

\bookmarksetup{startatroot}

\hypertarget{regression}{%
\chapter{Regression}\label{regression}}

\hypertarget{workshop-7-open-in-colab}{%
\section[\emph{Workshop 7} ]{\texorpdfstring{\emph{Workshop 7}
\href{https://colab.research.google.com/github/oballinger/QM2/blob/main/notebooks/W07.\%20Linear\%20Regression.ipynb}{\protect\includegraphics{index_files/mediabag/colab-badge.png}}}{Workshop 7 Open In Colab}}\label{workshop-7-open-in-colab}}

\hypertarget{aims-5}{%
\subsection{Aims:}\label{aims-5}}

In this workshop, we're going to be modeling the relationship between
education and income. More precisely, we're going to be looking at the
effect of increasing education on hourly wages using Ordinary Least
Squares regression. We're going to accomplish this in four steps:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Summary Statistics

  \begin{itemize}
  \tightlist
  \item
    Table of Summary Statistics
  \end{itemize}
\item
  Visualisation

  \begin{itemize}
  \tightlist
  \item
    Exploratory Plots
  \end{itemize}
\item
  Assumptions

  \begin{itemize}
  \tightlist
  \item
    A. Independence
  \item
    B. Heteroscedasticity: Regression plots + Q-Q plot
  \item
    C. Multicollinearity: VIF + Correlation Matrix
  \end{itemize}
\item
  Regression

  \begin{itemize}
  \tightlist
  \item
    Regression Table
  \end{itemize}
\end{enumerate}

If you're conducting a regression, you must complete the steps above,
and produce each item indicated by a bullet point.

\hypertarget{getting-started-3}{%
\section{Getting Started}\label{getting-started-3}}

As always, we'll start by importing the libraries we need:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#This tells python to draw the graphs "inline" {-} in the notebook}
\OperatorTok{\%}\NormalTok{matplotlib inline  }
\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
\ImportTok{import}\NormalTok{ statsmodels.api }\ImportTok{as}\NormalTok{ sm}
\ImportTok{from}\NormalTok{ math }\ImportTok{import}\NormalTok{ sqrt}
\ImportTok{from}\NormalTok{ numpy.random }\ImportTok{import}\NormalTok{ seed}
\ImportTok{from}\NormalTok{ numpy.random }\ImportTok{import}\NormalTok{ randn}
\ImportTok{from}\NormalTok{ numpy }\ImportTok{import}\NormalTok{ mean}
\ImportTok{from}\NormalTok{ scipy.stats }\ImportTok{import}\NormalTok{ sem}
\ImportTok{import}\NormalTok{ statistics }
\ImportTok{import}\NormalTok{ seaborn }\ImportTok{as}\NormalTok{ sns}
\ImportTok{from}\NormalTok{ IPython.display }\ImportTok{import}\NormalTok{ display, Math, Latex, display\_latex}
\ImportTok{import}\NormalTok{ plotly.express }\ImportTok{as}\NormalTok{ px}
\ImportTok{import}\NormalTok{ pylab}
\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd}
\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
\CommentTok{\# make the plots (graphs) a little wider by default}
\NormalTok{pylab.rcParams[}\StringTok{\textquotesingle{}figure.figsize\textquotesingle{}}\NormalTok{] }\OperatorTok{=}\NormalTok{ (}\FloatTok{10.}\NormalTok{, }\FloatTok{8.}\NormalTok{)}
\NormalTok{sns.}\BuiltInTok{set}\NormalTok{(font\_scale}\OperatorTok{=}\FloatTok{1.5}\NormalTok{)}
\NormalTok{sns.set\_style(}\StringTok{"white"}\NormalTok{)}

\end{Highlighting}
\end{Shaded}

Now that I've imported the libraries I'm going to be using, I'm ready to
import the data:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df}\OperatorTok{=}\NormalTok{pd.read\_csv(}\StringTok{\textquotesingle{}https://storage.googleapis.com/qm2/wk7/cps.csv\textquotesingle{}}\NormalTok{)}
\NormalTok{df.head()}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llllllllllll@{}}
\toprule\noalign{}
& year & state & age & sex & race & sch & ind & union & incwage &
realhrwage & occupation \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 1990 & 36 & 58 & 1 & 3 & 12.0 & 871 & 0.0 & 14200.0 & 12.269874 &
Office and Admin Support \\
1 & 2009 & 5 & 28 & 1 & 1 & 12.0 & 8660 & 1.0 & 17680.0 & 8.635149 &
Office and Admin Support \\
2 & 1990 & 36 & 37 & 1 & 1 & 14.0 & 380 & 1.0 & 28000.0 & 21.169851 &
. \\
3 & 1990 & 6 & 34 & 1 & 1 & 18.0 & 740 & 1.0 & 27500.0 & 20.447746 &
Computer and Math Technicians \\
4 & 1981 & 51 & 38 & 1 & 4 & 13.0 & 798 & NaN & 17000.0 & 18.892282 &
Managers \\
\end{longtable}

Our dataframe has 11 columns:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \emph{year}: Survey year
\item
  \emph{age}: the person's age
\item
  \emph{sex}: the person's sex

  \begin{itemize}
  \tightlist
  \item
    1=male
  \item
    2=female
  \end{itemize}
\item
  \emph{race}: the person's race

  \begin{itemize}
  \tightlist
  \item
    White non hispanic=1
  \item
    Black non hispanic=2
  \item
    Hispanic=3
  \item
    Other non hispanic=4
  \end{itemize}
\item
  \emph{sch}: Educational attainment

  \begin{itemize}
  \tightlist
  \item
    None = 0,
  \item
    Grades 1-12 = 1-12
  \item
    Some University = 13,
  \item
    Associate's degree = 14,
  \item
    BA = 16
  \item
    Advanced Degree = 18
  \end{itemize}
\item
  \emph{union}: Union membership

  \begin{itemize}
  \tightlist
  \item
    N/A = 0,
  \item
    No union coverage = 1,
  \item
    Member of labor union=2,
  \item
    Covered by union but not a member=3
  \end{itemize}
\item
  \emph{incwage}: Wage and salary income
\item
  \emph{realhrwage}: Real Hourly Wage
\item
  \emph{occupation}: Occupation
\item
  \emph{ind}:
  \href{https://www.census.gov/naics/?58967?yearbck=2002}{industry code}
\item
  \emph{state}:
  \href{https://www.bls.gov/respondents/mwr/electronic-data-interchange/appendix-d-usps-state-abbreviations-and-fips-codes.htm}{FIPS
  code} denoting the state of residence.
\end{enumerate}

We'll begin, as we did with last week's workshop, by selecting the year
2013 in our data and making sure that all the variables that represent
categories are stored as categorical in python:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{reg\_df}\OperatorTok{=}\NormalTok{df[df[}\StringTok{\textquotesingle{}year\textquotesingle{}}\NormalTok{]}\OperatorTok{==}\DecValTok{2013}\NormalTok{].drop([}\StringTok{\textquotesingle{}year\textquotesingle{}}\NormalTok{],axis}\OperatorTok{=}\DecValTok{1}\NormalTok{) }\CommentTok{\# filter the whole dataset to 2013 and drop year column}
\NormalTok{reg\_df[[}\StringTok{\textquotesingle{}race\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}union\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}sex\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}occupation\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}ind\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}state\textquotesingle{}}\NormalTok{]]}\OperatorTok{=}\NormalTok{reg\_df[[}\StringTok{\textquotesingle{}race\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}union\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}sex\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}occupation\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}ind\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}state\textquotesingle{}}\NormalTok{]].astype(}\StringTok{\textquotesingle{}category\textquotesingle{}}\NormalTok{) }\CommentTok{\# convert these columns to categorical}
\end{Highlighting}
\end{Shaded}

\hypertarget{summary-statistics-1}{%
\section{1. Summary Statistics}\label{summary-statistics-1}}

Once our data has been cleaned and all our variables are stored as the
appropriate type, we can start with the first step of any regression
project: creating a table of summary statistics. This is an important
part of the process, since it gives the reader a qualitative
understanding of your data before you analyze it. It also serves to
demonstrate that you've cleaned the data appropriately, and that the
measures of the variables make sense.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{summary}\OperatorTok{=}\NormalTok{reg\_df.describe().}\BuiltInTok{round}\NormalTok{(}\DecValTok{2}\NormalTok{)  }\CommentTok{\# generate summary statistics, and round everything to 2 decimal degrees}
\NormalTok{summary}\OperatorTok{=}\NormalTok{summary.T }\CommentTok{\#.T transposes the table (rows become columns and vice versa)}
\NormalTok{summary}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}lllllllll@{}}
\toprule\noalign{}
& count & mean & std & min & 25\% & 50\% & 75\% & max \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
age & 53790.0 & 42.91 & 10.56 & 25.00 & 34.00 & 43.00 & 51.00 & 64.0 \\
sch & 53790.0 & 13.93 & 2.74 & 0.00 & 12.00 & 13.00 & 16.00 & 18.0 \\
incwage & 53790.0 & 51821.86 & 60163.45 & 38.00 & 24000.00 & 40000.00 &
63000.00 & 1102999.0 \\
realhrwage & 53790.0 & 24.38 & 151.90 & 2.01 & 12.17 & 18.44 & 28.12 &
34760.8 \\
\end{longtable}

This table is already informative. I now know that the average person in
this dataset is 42 years old, has around 14 years of schooling, and
makes \$24/hour (or \$51,821/year). However, it's also useful to spot
potential errors in data entry that may warrant greater attention.

Notice the max value for real hourly wage. Despite the fact that those
at the 75th percentile of earners make \$28.12/hour, someone is making
\$34,760 per hour. Must be nice (or, may be a data entry error). Either
way, because regressions are sensitive to this sort of outlier, we
should remove it. I've defined a function below that calculates the
quartiles and filters out observations that lie more than three times
the interquartile range (the distance between the top and bottom
quartiles) below the bottom quartile or above the top quartile. This
was a somewhat arbitrary choice, but it allows me to be consistent if I
want to apply it to other variables. You could also just pick a cutoff
qualitatively and justify it (e.g.~``I will focus on those making up to
\$250k per year, since they represent the population I'm trying to
understand'').

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{def}\NormalTok{ filter\_outliers(var):}
\NormalTok{    q1 }\OperatorTok{=}\NormalTok{ var.quantile(}\FloatTok{0.25}\NormalTok{) }\CommentTok{\# calculate the first quartile}
\NormalTok{    q3 }\OperatorTok{=}\NormalTok{ var.quantile(}\FloatTok{0.75}\NormalTok{) }\CommentTok{\# calculate the third quartile}
\NormalTok{    iqr }\OperatorTok{=}\NormalTok{ q3 }\OperatorTok{{-}}\NormalTok{ q1 }\CommentTok{\# calculate the interquartile range}
\NormalTok{    low }\OperatorTok{=}\NormalTok{ q1 }\OperatorTok{{-}} \DecValTok{3}\OperatorTok{*}\NormalTok{iqr }\CommentTok{\# calculate the lower bound}
\NormalTok{    high }\OperatorTok{=}\NormalTok{ q3 }\OperatorTok{+} \DecValTok{3}\OperatorTok{*}\NormalTok{iqr }\CommentTok{\# calculate the upper bound}
\NormalTok{    filtered }\OperatorTok{=}\NormalTok{ reg\_df[(var }\OperatorTok{\textgreater{}}\NormalTok{ low) }\OperatorTok{\&}\NormalTok{ (var }\OperatorTok{\textless{}}\NormalTok{ high)] }\CommentTok{\# filter  the values that are within the bounds}
\NormalTok{    dropped\_observations}\OperatorTok{=} \BuiltInTok{len}\NormalTok{(var)}\OperatorTok{{-}}\BuiltInTok{len}\NormalTok{(filtered) }\CommentTok{\# calculate the number of observations that were dropped}

    \BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}Dropped }\SpecialCharTok{\{\}}\StringTok{ observations\textquotesingle{}}\NormalTok{.}\BuiltInTok{format}\NormalTok{(dropped\_observations))}
    \ControlFlowTok{return}\NormalTok{  filtered}

\NormalTok{reg\_df}\OperatorTok{=}\NormalTok{filter\_outliers(reg\_df[}\StringTok{\textquotesingle{}realhrwage\textquotesingle{}}\NormalTok{]) }\CommentTok{\# filter outliers from realhrwage}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Dropped 1040 observations
\end{verbatim}

We can see that this operation dropped 1040 observations that had
extreme values in the ``realhrwage'' variable. Let's re-generate the
table of summary statistics and only keep five columns: count, mean,
standard deviation, minimum, and maximum.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{summary}\OperatorTok{=}\NormalTok{reg\_df.describe().}\BuiltInTok{round}\NormalTok{(}\DecValTok{2}\NormalTok{).T}
\NormalTok{summary[[}\StringTok{\textquotesingle{}count\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}mean\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}std\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}min\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}max\textquotesingle{}}\NormalTok{]]}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}llllll@{}}
\toprule\noalign{}
& count & mean & std & min & max \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
age & 52750.0 & 42.84 & 10.57 & 25.00 & 64.00 \\
sch & 52750.0 & 13.88 & 2.73 & 0.00 & 18.00 \\
incwage & 52750.0 & 46849.39 & 33376.96 & 38.00 & 353000.00 \\
realhrwage & 52750.0 & 21.59 & 13.03 & 2.01 & 75.81 \\
\end{longtable}

\hypertarget{visualization}{%
\section{2. Visualization}\label{visualization}}

The summary statistics table provides us with a good overview of some of
the variables we're interested in. However, you'll notice that it omits
many of the other variables in our dataset: the categorical ones. This
is because calculating the mean, standard deviation, etc. of something
like the ``occupation'' column doesn't really make sense. For that, we
turn to visualization.

\hypertarget{visualizing-the-distribution-of-categorical-variables}{%
\subsection{Visualizing the distribution of categorical
variables}\label{visualizing-the-distribution-of-categorical-variables}}

So far in this course we've been using a python library called
Matplotlib to make our visualizations, which we've been calling using
the ``plt'' alias. But this isn't the only one that is available to us.
\href{https://seaborn.pydata.org/}{Seaborn} is another library that has
some cool plotting functions that are more geared towards statistical
analysis. We've already imported seaborn above, and we'll be calling it
using the alias ``sns''. We can use it in conjunction with matplotlib.

To get a sense of the distribution of our categorical variables, we'll
make some plots that count the number of observations in each category.
Let's start with the race category:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.countplot(data}\OperatorTok{=}\NormalTok{reg\_df, x}\OperatorTok{=}\StringTok{\textquotesingle{}race\textquotesingle{}}\NormalTok{) }\CommentTok{\# plot the race variable}

\NormalTok{plt.title(}\StringTok{\textquotesingle{}Race\textquotesingle{}}\NormalTok{) }\CommentTok{\# add a title}
\NormalTok{plt.xlabel(}\StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{) }\CommentTok{\# remove the x axis label}
\NormalTok{plt.xticks(ticks}\OperatorTok{=}\NormalTok{[}\DecValTok{0}\NormalTok{,}\DecValTok{1}\NormalTok{,}\DecValTok{2}\NormalTok{,}\DecValTok{3}\NormalTok{],labels}\OperatorTok{=}\NormalTok{[}\StringTok{\textquotesingle{}White\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Black\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}Hispanic\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}Other\textquotesingle{}}\NormalTok{]) }\CommentTok{\# replace the x axis labels with more descriptive labels}
\NormalTok{plt.show() }\CommentTok{\# show the plot}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W07. Linear Regression_files/figure-pdf/cell-8-output-1.png}

}

\end{figure}

\hypertarget{exercise-22}{%
\subsection{Exercise}\label{exercise-22}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Generate an equivalent plot for the other categorical columns
\item
  What is the most common industry code, and what does it correspond to?
\end{enumerate}

\hypertarget{assumptions}{%
\section{3. Assumptions}\label{assumptions}}

Once you've generated summary statistics for your continuous variables
and exploratory plots for the categorical ones, it's time to start
thinking about the relationships \emph{between} the variables. Today,
we're going to be modeling a linear relationship between income and
years of schooling, by means of a \textbf{linear regression}. But before
we do that, we need to check a couple things-- all statistical tests
have a number of assumptions that must be satisfied in order to yield
robust results. Before we run a regression, we must check that the
assumptions in this case are satisfied. There are three main ones:

\begin{verbatim}
A. Independence
B. Homoscedasticity
C. Multicollinearity 
\end{verbatim}

Let's go through them one by one.

\hypertarget{a.-independence}{%
\subsection{A. Independence}\label{a.-independence}}

\textbf{\texttt{Linear\ regression\ assumes\ that\ measurements\ for\ each\ sample\ subject\ are\ in\ no\ way\ influenced\ by\ or\ related\ to\ the\ measurements\ of\ other\ subjects.}}

Though in the full CPS dataset we have repeat observations of the same
individual over time, we've only been analyzing one year's worth of
data, so we satisfy the independence assumption. If we ran a regression
on the full sample over multiple years, \emph{this would violate the
independence assumption}. It's very possible to run a regression with
repeat observations of the same units (people, places, etc.) over time,
but you need to use a special type of regression called a \textbf{panel
regression}. More on that next week.

\hypertarget{b.-homoscedasticity}{%
\subsection{B. Homoscedasticity}\label{b.-homoscedasticity}}

\textbf{\texttt{Linear\ regression\ assumes\ that\ the\ variance\ of\ residuals\ is\ the\ same\ for\ any\ value\ of\ x,\ and\ that\ residuals\ are\ normally\ distributed\ with\ a\ mean\ of\ 0.}}

This is a complicated way of saying your regression line should fit
consistently across the full range of \(x\) values. If there are really
small residuals (i.e., all the data points are close to the line) for
low values of \(x\), but larger residuals for high values of \(x\), the
regression is not performing well-- we wouldn't have the same confidence
in our predictions at different values of \(x\). Similarly, if all the
residuals are on one side of the regression line in different parts of
the \(x\) range, the model will consistently over/underestimate in those
regions. When the variance of residuals from a regression model is
inconsistent, we have \textbf{\texttt{Heteroscedasticity}}.

We can explore potential heteroscedasticity by visually inspecting a
regression plot. In our case, we're primarily interested in the
relationship between years of schooling and hourly wages, so we'll be
plotting these variables against each other. \texttt{sns.jointplot()}
lets us create a plot with four components which can help us diagnose
potential heteroscedasticity:

\begin{itemize}
\tightlist
\item
  The main plot is a scatterplot between hourly wages on the y axis, and
  years of schooling on the x axis.
\item
  A regression line overlaid on this plot lets us see the relationship
  between our model and the underlying data
\item
  A histogram to the right of the plot shows the distribution of the
  hourly wages variable, which is heavily skewed.
\item
  A histogram above the plot shows the distribution of the years of
  schooling variable, which has an almost bimodal form.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.jointplot(data}\OperatorTok{=}\NormalTok{reg\_df, }\CommentTok{\# plot a scatterplot with a regression line and two histograms}
\NormalTok{                x}\OperatorTok{=}\StringTok{\textquotesingle{}sch\textquotesingle{}}\NormalTok{, }\CommentTok{\# set the x axis to be the years of schooling}
\NormalTok{                y}\OperatorTok{=}\StringTok{\textquotesingle{}realhrwage\textquotesingle{}}\NormalTok{, }\CommentTok{\# set the y axis to be the hourly wage}
\NormalTok{                kind}\OperatorTok{=}\StringTok{"reg"}\NormalTok{,  }\CommentTok{\# set the kind of plot to be a regression plot}
\NormalTok{                scatter\_kws}\OperatorTok{=}\BuiltInTok{dict}\NormalTok{(alpha}\OperatorTok{=}\FloatTok{0.1}\NormalTok{), }\CommentTok{\# set the transparency of the points to be 0.1 (10\%)}
\NormalTok{                line\_kws}\OperatorTok{=}\BuiltInTok{dict}\NormalTok{(color}\OperatorTok{=}\StringTok{\textquotesingle{}red\textquotesingle{}}\NormalTok{), }\CommentTok{\# set the color of the regression line to red}
\NormalTok{                height}\OperatorTok{=}\DecValTok{10}\NormalTok{) }\CommentTok{\# set the height of the plot to be 10 inches }

\NormalTok{plt.xlabel(}\StringTok{\textquotesingle{}Years of Schooling\textquotesingle{}}\NormalTok{) }\CommentTok{\# add a label to the x axis}
\NormalTok{plt.ylabel(}\StringTok{\textquotesingle{}Hourly Wage\textquotesingle{}}\NormalTok{) }\CommentTok{\# add a label to the y axis}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Text(53.625, 0.5, 'Hourly Wage')
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W07. Linear Regression_files/figure-pdf/cell-9-output-2.png}

}

\end{figure}

The plot above is cause for concern. From 0 to 5 years of schooling the
model has underestimated hourly wages for every single observation.
Conversely, at the far right tip of the regression line, we can see that
the model \emph{overestimates} income for many individuals with 18 years
of schooling. This gives us reason to suspect that there may be
asymmetry in the residuals of our model (heteroscedasticity). We're
going to fix this in the Extension section below. But for now, let's
proceed.

\hypertarget{c.-multicollinearity}{%
\subsection{C. Multicollinearity}\label{c.-multicollinearity}}

\textbf{\texttt{Multicollinearity\ emerges\ when\ two\ or\ more\ independent\ variables\ which\ are\ highly\ correlated\ are\ included\ in\ a\ model.}}
A key goal of regression analysis is to isolate the relationship between
each independent variable and the dependent variable. The interpretation
of a regression coefficient is that it represents the mean change in the
dependent variable for each 1 unit change in an independent variable
when you hold all of the other independent variables constant.

The idea is that you can change the value of one independent variable
and not the others. However, when independent variables are correlated,
it indicates that changes in one variable are associated with shifts in
another variable. The stronger the correlation, the more difficult it is
to change one variable without changing another. See this
\href{https://statisticsbyjim.com/regression/multicollinearity-in-regression-analysis/}{blog
post} for a thorough explanation.

One way of visually exploring multicollinearity is through a correlation
matrix:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.heatmap(reg\_df[[}\StringTok{\textquotesingle{}incwage\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}realhrwage\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}age\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}sch\textquotesingle{}}\NormalTok{]].corr(), }\CommentTok{\# plot a correlation matrix }
\NormalTok{            annot}\OperatorTok{=}\VariableTok{True}\NormalTok{, }\CommentTok{\# show the correlation values on the plot}
\NormalTok{            fmt}\OperatorTok{=}\StringTok{".2f"}\NormalTok{, }\CommentTok{\# set the format of the correlation values to be two decimal places}
\NormalTok{            cmap}\OperatorTok{=}\StringTok{\textquotesingle{}coolwarm\textquotesingle{}}\NormalTok{) }\CommentTok{\# set the color palette to be coolwarm (blue for negative correlations, red for positive correlations)}

\NormalTok{plt.title(}\StringTok{\textquotesingle{}Correlation Matrix\textquotesingle{}}\NormalTok{) }\CommentTok{\# add a title}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Text(0.5, 1.0, 'Correlation Matrix')
\end{verbatim}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W07. Linear Regression_files/figure-pdf/cell-10-output-2.png}

}

\end{figure}

This matrix has each of the continuous variables in \texttt{reg\_df} on
both axes. Each cell denotes the correlation between the corresponding
variables. Naturally, on the diagonal we have a series of perfect
correlations (1.00), as each variable is perfectly correlated with
itself. \texttt{incwage} (annual salary) and \texttt{realhrwage} (hourly
wage) are highly correlated with each other, which makes a lot of sense.
This isn't a concern for multicollinearity, though, since
\texttt{realhrwage} will be our dependent variable. This type of
correlation matrix is also a good way of conducting exploratory data
analysis-- we can already see that the next-highest set of correlations
is between years of schooling and both hourly wages and annual salary.

Though a very high correlation coefficient between independent
variables is a cause for concern, the formal way of dealing with
multicollinearity is through the use of the
\textbf{\texttt{Variance\ Inflation\ Factor\ (VIF)}}. VIF is the ratio
of the variance in a model with multiple predictors to the variance of a
model with a single predictor:

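\[\mathit{VIF}_j=\frac{1}{1-R_{j}^{2}}\]

Here, \(R_j^2\) is the \(R^2\) obtained by regressing the \(j\)-th
independent variable on all of the other independent variables.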

VIFs start at 1 and have no upper limit. A value of 1 indicates that
there is no correlation between this independent variable and any
others. VIFs between 1 and 5 suggest that there is a moderate
correlation, but it is not severe enough to warrant corrective measures.
VIFs greater than 5 represent critical levels of multicollinearity where
the coefficients are poorly estimated, and the p-values are
questionable. More explanation of the theory can be found
\href{https://en.wikipedia.org/wiki/Variance_inflation_factor}{here}.

Below is a function that calculates VIF for each independent variable in
a dataframe, and drops them if they exceed a threshold (set to 5).

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# calculating VIF}
\CommentTok{\# This function is amended from: https://stackoverflow.com/a/51329496/4667568}

\ImportTok{from}\NormalTok{ statsmodels.stats.outliers\_influence }\ImportTok{import}\NormalTok{ variance\_inflation\_factor }
\ImportTok{from}\NormalTok{ statsmodels.tools.tools }\ImportTok{import}\NormalTok{ add\_constant}

\KeywordTok{def}\NormalTok{ drop\_column\_using\_vif\_(df, list\_var\_not\_to\_remove}\OperatorTok{=}\VariableTok{None}\NormalTok{, thresh}\OperatorTok{=}\DecValTok{5}\NormalTok{):}
    \CommentTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}}
\CommentTok{    Calculates VIF each feature in a pandas dataframe, and repeatedly drop the columns with the highest VIF}
\CommentTok{    A constant must be added to variance\_inflation\_factor or the results will be incorrect}

\CommentTok{    :param df: the pandas dataframe containing only the predictor features, not the response variable}
\CommentTok{    :param list\_var\_not\_to\_remove: the list of variables that should not be removed even though it has a high VIF. For example, dummy (or indicator) variables represent a categorical variable with three or more categories.}
\CommentTok{    :param thresh: the max VIF value before the feature is removed from the dataframe}
\CommentTok{    :return: dataframe with multicollinear features removed}
\CommentTok{    \textquotesingle{}\textquotesingle{}\textquotesingle{}}
    \ControlFlowTok{while} \VariableTok{True}\NormalTok{:}
        \CommentTok{\# adding a constant item to the data}
\NormalTok{        df\_with\_const }\OperatorTok{=}\NormalTok{ add\_constant(df)}

\NormalTok{        vif\_df }\OperatorTok{=}\NormalTok{ pd.Series([variance\_inflation\_factor(df\_with\_const.values, i) }
               \ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(df\_with\_const.shape[}\DecValTok{1}\NormalTok{])], name}\OperatorTok{=} \StringTok{"VIF"}\NormalTok{,}
\NormalTok{              index}\OperatorTok{=}\NormalTok{df\_with\_const.columns).to\_frame()}

        \CommentTok{\# drop the const as const should not be removed}
\NormalTok{        vif\_df }\OperatorTok{=}\NormalTok{ vif\_df.drop(}\StringTok{\textquotesingle{}const\textquotesingle{}}\NormalTok{)}
        
        \CommentTok{\# drop the variables that should not be removed}
        \ControlFlowTok{if}\NormalTok{ list\_var\_not\_to\_remove }\KeywordTok{is} \KeywordTok{not} \VariableTok{None}\NormalTok{:}
\NormalTok{            vif\_df }\OperatorTok{=}\NormalTok{ vif\_df.drop(list\_var\_not\_to\_remove)}
            
        \BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}Max VIF:\textquotesingle{}}\NormalTok{, vif\_df.VIF.}\BuiltInTok{max}\NormalTok{())}
        
        \CommentTok{\# if the largest VIF is above the thresh, remove a variable with the largest VIF}
        \ControlFlowTok{if}\NormalTok{ vif\_df.VIF.}\BuiltInTok{max}\NormalTok{() }\OperatorTok{\textgreater{}}\NormalTok{ thresh:}
            \CommentTok{\# If there are multiple variables with the maximum VIF, choose the first one}
\NormalTok{            index\_to\_drop }\OperatorTok{=}\NormalTok{ vif\_df.index[vif\_df.VIF }\OperatorTok{==}\NormalTok{ vif\_df.VIF.}\BuiltInTok{max}\NormalTok{()].tolist()[}\DecValTok{0}\NormalTok{]}
            \BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}Dropping: }\SpecialCharTok{\{\}}\StringTok{\textquotesingle{}}\NormalTok{.}\BuiltInTok{format}\NormalTok{(index\_to\_drop))}
\NormalTok{            df }\OperatorTok{=}\NormalTok{ df.drop(columns }\OperatorTok{=}\NormalTok{ index\_to\_drop)}
        \ControlFlowTok{else}\NormalTok{:}
            \CommentTok{\# No VIF is above threshold. Exit the loop}
            \ControlFlowTok{break}

    \ControlFlowTok{return}\NormalTok{ df}
\end{Highlighting}
\end{Shaded}

Now we can implement this on our dataset:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{ind\_vars}\OperatorTok{=}\NormalTok{[}\StringTok{\textquotesingle{}sex\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}age\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}sch\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}union\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}race\textquotesingle{}}\NormalTok{]}

\NormalTok{vif }\OperatorTok{=}\NormalTok{ drop\_column\_using\_vif\_(reg\_df[ind\_vars], thresh}\OperatorTok{=}\DecValTok{5}\NormalTok{)}
\BuiltInTok{print}\NormalTok{(}\StringTok{"The columns remaining after VIF selection are:"}\NormalTok{)}
\BuiltInTok{print}\NormalTok{(vif.columns)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Max VIF: 1.0471509029112354
The columns remaining after VIF selection are:
Index(['sex', 'age', 'sch', 'union', 'race'], dtype='object')
\end{verbatim}

The maximum VIF value encountered was about 1.05-- well within the acceptable
range. Accordingly, the function hasn't dropped any of the independent
variables in our dataset.

Having explored our data through visualizations and summary statistics,
and checked the assumptions of linear regression, we're now ready to
begin building a model.

\hypertarget{regression-1}{%
\section{4. Regression}\label{regression-1}}

Remember, the Ordinary Least Squares (OLS) regression seeks to find a
straight line that best describes the relationship between two
variables:

\[y= \beta_0 + \beta_1x+\epsilon \]

In our case, we're trying to predict hourly income-- this is our
\textbf{dependent variable}, and there can be only one per regression.
The variable we're using to predict hourly income is years of schooling,
which is our \textbf{independent variable}. We can have multiple of
these per regression. As such, the regression equation in our scenario
looks like this:

\[Hourly\ Income= \beta_0 + \beta_1 \times Years\ of\ Schooling +\epsilon \]

Because the regression model will estimate the parameters
\(\beta_0, \beta_1\) and \(\epsilon\), we just need to supply python
with \(x\) and \(y\); we can do so by passing
\texttt{realhrwage\ \textasciitilde{}\ \ sch} to the \texttt{ols()}
function from statsmodels. This will run a regression of the form
specified above, which we will store in a variable called
\texttt{model}. We can get the output from this model using
\texttt{model.summary()}:

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{from}\NormalTok{ statsmodels.formula.api }\ImportTok{import}\NormalTok{ ols}
\ImportTok{from}\NormalTok{ statsmodels.iolib.summary2 }\ImportTok{import}\NormalTok{ summary\_col}

\NormalTok{model}\OperatorTok{=}\NormalTok{ ols(}\StringTok{\textquotesingle{}realhrwage \textasciitilde{}  sch\textquotesingle{}}\NormalTok{, data}\OperatorTok{=}\NormalTok{reg\_df).fit() }\CommentTok{\# fit the model}
\BuiltInTok{print}\NormalTok{(model.summary()) }\CommentTok{\# print the summary}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             realhrwage   R-squared:                       0.181
Model:                            OLS   Adj. R-squared:                  0.181
Method:                 Least Squares   F-statistic:                 1.164e+04
Date:                Fri, 01 Dec 2023   Prob (F-statistic):               0.00
Time:                        09:05:19   Log-Likelihood:            -2.0503e+05
No. Observations:               52750   AIC:                         4.101e+05
Df Residuals:                   52748   BIC:                         4.101e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -6.6246      0.266    -24.858      0.000      -7.147      -6.102
sch            2.0327      0.019    107.887      0.000       1.996       2.070
==============================================================================
Omnibus:                    10230.138   Durbin-Watson:                   1.900
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            19817.137
Skew:                           1.187   Prob(JB):                         0.00
Kurtosis:                       4.838   Cond. No.                         73.7
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
\end{verbatim}

There's a lot going on in the regression output above. If you want a
more detailed explanation of what each part means, check out this
\href{https://medium.com/swlh/interpreting-linear-regression-through-statsmodels-summary-4796d359035a}{blog
post}. In practice, we only need to focus on a couple parts of this
output:

\begin{itemize}
\tightlist
\item
  \texttt{R-squared}: This value tells the proportion of the variation
  in our dependent variable (realhrwage) that is explained by the model
  we fit. In this case we can interpret it as follows:

  \begin{itemize}
  \tightlist
  \item
    \textbf{18.1\% of the variation in hourly wages can be explained by
    this regression model}
  \end{itemize}
\item
  \texttt{coef}: These are our \(\beta\) estimates; it is the slope of
  the regression line that describes the relationship between a given
  independent variable (sch) and the dependent variable (realhrwage).
  There are two coefficients listed under this

  \begin{itemize}
  \tightlist
  \item
    \texttt{sch}: This is \(\beta_1\), the slope coefficient on the
    years of schooling variable. It tells us the change in \(y\) that
    results from a 1-unit increase in \(x\). In robotic terms, we can
    interpret it as follows:

    \begin{itemize}
    \tightlist
    \item
      \textbf{A 1 unit increase in \texttt{sch} leads to a 2.0327
      increase in \texttt{realhrwage}}. But we are not robots, and both
      of these variables are in units that we can interpret in plain
      English. Here's a more natural interpretation:
    \item
      \textbf{On average, every additional year of schooling is
      associated with a \$2.03 increase in hourly wages.}
    \end{itemize}
  \item
    \texttt{Intercept}: This is \(\beta_0\). It tells us the value of
    \(y\) when all of the independent variables in the model are held at
    0. In this case, it can be interpreted as

    \begin{itemize}
    \tightlist
    \item
      \textbf{According to our model, a person with 0 years of schooling
      is predicted to earn -\$6.62 per hour}
    \item
      Naturally, this is a nonsensical prediction. There are no jobs
      that pay negative wages. We'll examine why this is happening in
      the next section, when we look into the assumptions of linear
      regression.
    \end{itemize}
  \end{itemize}
\item
  \texttt{P\textgreater{}\textbar{}t\textbar{}}: this is known as the
  ``p-value'', and is the main measure of statistical significance.
  \textbf{A p-value denotes the probability of obtaining a result at
  least as extreme as the one observed, assuming that the null
  hypothesis is true}. In the case of a regression, the null hypothesis
  is that there is no relationship between our variables-- increasing
  \(x\) has no effect on \(y\). In other words, that the regression line
  is flat: \(\beta_1=0\). A p-value below 0.05 means that the coefficient
  is statistically significant at the 5\% level. In our case, the
  p-value is 0.000 (note: this doesn't mean it's equal to zero, just
  very very small), and we can therefore reject the null hypothesis that
  \(\beta_1=0\) at the 1\% significance level. However, this isn't the end
  of the story-- remember our weird negative intercept, and the fact
  that our model explains less than 20\% of the variation in hourly
  wages (\(R^2=0.181\)). For a good overview of what exactly a p-value
  is, and why we should be cautious when interpreting them, see this
  \href{https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6532382/}{journal
  article}.
\end{itemize}

\hypertarget{categorical-variables}{%
\subsection{Categorical Variables}\label{categorical-variables}}

The results of our first regression seem to show that the more education
a person has, the higher their hourly wage. This makes intuitive sense,
but it's probably not the whole picture. We may also suspect that older
people earn more, since they have more experience and are more senior.
We've also seen in previous classes that there are significant
disparities in income. Considering we have data on all these variables,
we can set up the following model:

\[Hourly\ Income= \beta_0 + \beta_1 \times Years\ of\ Schooling + \beta_2 \times Age + \beta_3 \times Sex +\epsilon \]

When we convert this equation into the python equivalent, it will look
like this:

\texttt{realhrwage\ \textasciitilde{}\ \ sch\ +\ age\ +\ C(sex)}

Notice that the sex variable is put within \texttt{C()}. This is how
we indicate that the variable in question is categorical, and that it
should be treated differently. Unlike a continuous variable, we're not
interested in the change in \(y\) that results from a 1 unit increase in
\(x\), since our units have no meaningful order. Instead, we'll have to
pick one of the categories (called a \textbf{base
category}/\textbf{reference category}), and compare each of the other
categories in that variable against this one. You can specify the base
category explicitly (for example
\texttt{realhrwage\ \textasciitilde{}\ \ sch\ +\ age\ +\ C(sex,\ Treatment(reference=2))}
makes women the base category), or python will pick one for you. As
such, for a categorical variable with \(n\) categories, we get \(n-1\)
coefficients which denote the change in \(y\) associated with membership
of a given category compared to the base category. For example, if we
have a categorical variable with three levels \(a, b, c\) where \(a\) is
the base category, we would get \emph{two} coefficients: \(\beta_1 b\)
and \(\beta_2 c\). Then we would interpret the resulting coefficient as

\begin{itemize}
\tightlist
\item
  ``Compared to category \(a\), membership of category \(b\) is
  associated with a \(\beta_1\) change in \(y\).''
\item
  ``Compared to category \(a\), membership of category \(c\) is
  associated with a \(\beta_2\) change in \(y\).''
\end{itemize}

Let's see what this looks like in our regression output:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{model }\OperatorTok{=}\NormalTok{ ols(}\StringTok{\textquotesingle{}realhrwage \textasciitilde{}  sch + age + C(sex)\textquotesingle{}}\NormalTok{, data}\OperatorTok{=}\NormalTok{reg\_df).fit() }
\BuiltInTok{print}\NormalTok{(model.summary())}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             realhrwage   R-squared:                       0.239
Model:                            OLS   Adj. R-squared:                  0.239
Method:                 Least Squares   F-statistic:                     5514.
Date:                Fri, 01 Dec 2023   Prob (F-statistic):               0.00
Time:                        09:05:19   Log-Likelihood:            -2.0310e+05
No. Observations:               52750   AIC:                         4.062e+05
Df Residuals:                   52746   BIC:                         4.062e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
===============================================================================
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept     -12.6722      0.330    -38.392      0.000     -13.319     -12.025
C(sex)[T.2]    -5.2692      0.099    -52.967      0.000      -5.464      -5.074
sch             2.1343      0.018    116.995      0.000       2.098       2.170
age             0.1695      0.005     36.164      0.000       0.160       0.179
==============================================================================
Omnibus:                     9831.824   Durbin-Watson:                   1.995
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            19481.335
Skew:                           1.131   Prob(JB):                         0.00
Kurtosis:                       4.935   Cond. No.                         308.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
\end{verbatim}

We now have 4 coefficients. In general, we don't always have to
interpret the Intercept coefficient. It's not really that meaningful in
this case, since now it denotes the predicted hourly income of someone
who is male, has 0 years of schooling, and is 0 years old. It's good to
keep it in mind as a sense check, though. The rest of the coefficients
can be interpreted as follows:

\begin{itemize}
\tightlist
\item
  \texttt{C(sex){[}T.2{]}}: On average, women earn \$5.27 less per hour
  than men.

  \begin{itemize}
  \tightlist
  \item
    {[}T.2{]} in this line denotes the category in this variable
    associated with the given coefficient. So this is telling us that
    what is being shown is the coefficient associated with membership of
    category 2 in the sex variable; based on the description of the
    variables above, we know that sex=1 indicates men, and sex=2
    indicates women. Naturally, we don't see a coefficient for
    \texttt{C(sex){[}T.1{]}}, because this is the \emph{base category}.
  \end{itemize}
\item
  \texttt{sch}: Every additional year of schooling is associated with a
  \$2.13 increase in hourly income
\item
  \texttt{age}: Every additional year of age is associated with a \$0.17
  increase in hourly income
\end{itemize}

\hypertarget{exercise-23}{%
\subsection{Exercise}\label{exercise-23}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Estimate a regression of the following form and store the results in a
  variable called \textbf{model1}:
\end{enumerate}

\[Hourly\ Income= \beta_0 + \beta_1 \times Years\ of\ Schooling + \beta_2 \times Age + \beta_3 \times Sex + \beta_4 \times Union\ Membership + \beta_5 \times Race +\epsilon \]

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Interpret each of the coefficients appropriately. Make note of the
  statistical significance of each result, and comment on the overall
  fit of the model.
\end{enumerate}

\hypertarget{creating-a-regression-table}{%
\subsection{Creating a Regression
Table}\label{creating-a-regression-table}}

Now that we've got a good sense of how regressions work and how to
interpret them, we need to communicate these results properly. Many of
you have probably read journal articles in which regression results are
reported, but I doubt you've ever seen the output of
\texttt{model.summary()} copied and pasted in the text of an article.
Instead, these results are reported following a fairly standardized
convention: a regression table. It picks out the components of the model
summary that we're interested in, and formats them in a consistent and
easy-to-interpret way. Luckily, the statsmodels package has a function
called \texttt{summary\_col} that takes a fitted model and formats it
for us automatically; we just need to tweak a few options.

In the example below, I'm going to run two regressions; one in which I
filter the data to only include people from California, and another for
people in Mississippi (the richest and poorest states, respectively), to
see if the relationship between wages, sex, age, and schooling differs
geographically. I'm then going to create a regression table in which
each column is a different regression model, and each row will contain the
coefficient for a given independent variable with the standard error in
parentheses underneath and the level of statistical significance (i.e.,
size of the p-value) denotes by stars such that: * p\textless0.05, **
p\textless0.01, *** p\textless0.001.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{california }\OperatorTok{=}\NormalTok{ ols(}\StringTok{\textquotesingle{}realhrwage \textasciitilde{}  sch + age + C(sex)\textquotesingle{}}\NormalTok{, data}\OperatorTok{=}\NormalTok{reg\_df[reg\_df[}\StringTok{\textquotesingle{}state\textquotesingle{}}\NormalTok{]}\OperatorTok{==}\DecValTok{6}\NormalTok{]).fit()  }\CommentTok{\# fit a model to california{-}{-} i\textquotesingle{}m filtering the data using the FIPS code for california, which is 6}
\NormalTok{mississippi }\OperatorTok{=}\NormalTok{ ols(}\StringTok{\textquotesingle{}realhrwage \textasciitilde{}  sch + age + C(sex)\textquotesingle{}}\NormalTok{, data}\OperatorTok{=}\NormalTok{reg\_df[reg\_df[}\StringTok{\textquotesingle{}state\textquotesingle{}}\NormalTok{]}\OperatorTok{==}\DecValTok{28}\NormalTok{]).fit()  }\CommentTok{\# same thing for mississippi (FIPS code 28)}

\NormalTok{table}\OperatorTok{=}\NormalTok{summary\_col( }\CommentTok{\# create a regression table }
\NormalTok{    [california,mississippi], }\CommentTok{\# pass the models to the summary\_col function}
\NormalTok{    stars}\OperatorTok{=}\VariableTok{True}\NormalTok{, }\CommentTok{\# add stars denoting the p{-}values of the coefficient to the table; * p\textless{}0.1, ** p\textless{}0.05, *** p\textless{}0.01}
\NormalTok{    float\_format}\OperatorTok{=}\StringTok{\textquotesingle{}}\SpecialCharTok{\%0.3f}\StringTok{\textquotesingle{}}\NormalTok{, }\CommentTok{\# set the decimal places to 3}
\NormalTok{    model\_names}\OperatorTok{=}\NormalTok{[}\StringTok{\textquotesingle{}California\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}Mississippi\textquotesingle{}}\NormalTok{], }\CommentTok{\# set the name of the model}
\NormalTok{    info\_dict }\OperatorTok{=}\NormalTok{ \{}\StringTok{"N"}\NormalTok{:}\KeywordTok{lambda}\NormalTok{ x: }\StringTok{"}\SpecialCharTok{\{0:d\}}\StringTok{"}\NormalTok{.}\BuiltInTok{format}\NormalTok{(}\BuiltInTok{int}\NormalTok{(x.nobs))\}) }\CommentTok{\# add the number of observations to the table}

\BuiltInTok{print}\NormalTok{(table)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}

=====================================
               California Mississippi
-------------------------------------
Intercept      -12.136*** -10.111*** 
               (1.030)    (3.497)    
C(sex)[T.2]    -5.154***  -4.980***  
               (0.350)    (0.924)    
sch            2.096***   1.850***   
               (0.053)    (0.205)    
age            0.217***   0.115**    
               (0.017)    (0.045)    
R-squared      0.262      0.199      
R-squared Adj. 0.262      0.193      
N              5079       430        
=====================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
\end{verbatim}

This layout lets us clearly explore our regression results. This lets us
clearly compare the coefficients of the same variable in different
models. For example, we can see that men tend to earn \$5.15 more per
hour than women in California, but just \$4.98 more per hour in
Mississippi, and both of these results are statistically significant at
the 1\% level. This suggests that the wage gap is actually somewhat
higher in California! Why might this be?

\hypertarget{exercise-24}{%
\subsection{Exercise}\label{exercise-24}}

\[ Hourly\ Income= \beta_0 + \beta_1 \times Years\ of\ Schooling + \beta_2 \times Age + \beta_3 \times Sex + \beta_4 \times Union\ Membership + \beta_5 \times Race +\epsilon \]

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Run five regressions, each of the form above (same as earlier):

  \begin{itemize}
  \tightlist
  \item
    In the first model, run the regression on the full sample contained
    in \texttt{reg\_df}. In subsequent models, restrict the sample to
    the following professions:

    \begin{itemize}
    \tightlist
    \item
      Production
    \item
      Farmers
    \item
      Bankers
    \item
      Doctors \& Lawyers
    \end{itemize}
  \end{itemize}
\item
  Create a regression table containing the results of each model in a
  separate column
\item
  Interpret the coefficients on the union related variables

  \begin{itemize}
  \tightlist
  \item
    How does union membership affect hourly wages across different
    sectors?
  \item
    How does the gender wage gap vary across sectors?
  \end{itemize}
\end{enumerate}

\hypertarget{extension-2}{%
\section{Extension}\label{extension-2}}

Though we've gotten some significant results and interesting insights
from our modeling effort so far, we can further improve our model. In
particular, we may want to revisit the way we've defined some of our
variables, since we suspect that we may have some heteroscedasticity in
our models, and have consequently been getting some weird results
(e.g.~negative hourly income).

\hypertarget{hourly-wages}{%
\subsection{Hourly Wages}\label{hourly-wages}}

When checking the regression assumptions, we suspected that there may be
some heteroscedasticity-- i.e., that our model performs better in some
regions of the \(x\) distribution compared to others; remember, it
consistently underestimated hourly income for those with little/no
schooling, as evidenced by the negative intercept and the regression
scatterplot:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sns.jointplot(data}\OperatorTok{=}\NormalTok{reg\_df, x}\OperatorTok{=}\StringTok{\textquotesingle{}sch\textquotesingle{}}\NormalTok{, y}\OperatorTok{=}\StringTok{\textquotesingle{}realhrwage\textquotesingle{}}\NormalTok{, kind}\OperatorTok{=}\StringTok{"reg"}\NormalTok{,  scatter\_kws}\OperatorTok{=}\BuiltInTok{dict}\NormalTok{(alpha}\OperatorTok{=}\FloatTok{0.1}\NormalTok{), line\_kws}\OperatorTok{=}\BuiltInTok{dict}\NormalTok{(color}\OperatorTok{=}\StringTok{\textquotesingle{}red\textquotesingle{}}\NormalTok{), height}\OperatorTok{=}\DecValTok{10}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W07. Linear Regression_files/figure-pdf/cell-17-output-1.png}

}

\end{figure}

We can more thoroughly diagnose heteroscedasticity \emph{after} having
run our regression models, since we have access to the model's
\textbf{residuals} (the difference between the observed values and the
predicted values). Remember, one of the assumptions of linear regression
is that the residuals are normally distributed. A Quantile-Quantile Plot
(Q-Q Plot) is a plot of the quantiles of a sample against the quantiles
of a theoretical distribution. The quantiles are the values that divide
the range of a probability distribution into continuous intervals with
equal probabilities. Thus, we can use a Q-Q plot to compare the
residuals of our model to a normal distribution as follows:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{model }\OperatorTok{=}\NormalTok{ ols(}\StringTok{\textquotesingle{}realhrwage \textasciitilde{}  sch\textquotesingle{}}\NormalTok{, data}\OperatorTok{=}\NormalTok{reg\_df).fit()  }\CommentTok{\# fit a model}
\NormalTok{residuals }\OperatorTok{=}\NormalTok{ model.resid }\CommentTok{\# get the residuals}

\CommentTok{\# make the figure wider}
\NormalTok{plt.rcParams[}\StringTok{"figure.figsize"}\NormalTok{] }\OperatorTok{=}\NormalTok{ [}\DecValTok{20}\NormalTok{, }\DecValTok{10}\NormalTok{]}

\NormalTok{f, axes }\OperatorTok{=}\NormalTok{ plt.subplots(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{)}
\NormalTok{sns.histplot(residuals, kde}\OperatorTok{=}\VariableTok{True}\NormalTok{, ax}\OperatorTok{=}\NormalTok{axes[}\DecValTok{0}\NormalTok{]) }\CommentTok{\# plot the residuals}
\NormalTok{axes[}\DecValTok{0}\NormalTok{].set\_title(}\StringTok{\textquotesingle{}Histogram of Residuals\textquotesingle{}}\NormalTok{) }\CommentTok{\# add a title}

\NormalTok{sm.qqplot(residuals, line}\OperatorTok{=}\StringTok{\textquotesingle{}45\textquotesingle{}}\NormalTok{, fit}\OperatorTok{=}\VariableTok{True}\NormalTok{,  ax}\OperatorTok{=}\NormalTok{axes[}\DecValTok{1}\NormalTok{]) }\CommentTok{\# plot the residuals}
\NormalTok{axes[}\DecValTok{1}\NormalTok{].set\_title(}\StringTok{\textquotesingle{}Q{-}Q Plot\textquotesingle{}}\NormalTok{) }\CommentTok{\# add a title}

\NormalTok{plt.show() }\CommentTok{\# show the plot}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W07. Linear Regression_files/figure-pdf/cell-18-output-1.png}

}

\end{figure}

This Q-Q plot suggests that our residuals are not normally distributed,
as very few of them are on the red line. This is probably due to the
fact that the \texttt{realhrwage} variable is itself highly skewed.

Log transformations are often recommended for skewed data, such as
monetary measures or certain biological and demographic measures. Log
transforming data usually has the effect of spreading out clumps of data
and bringing together spread-out data. So instead of:

\[Hourly\ Income= \beta_0 + \beta_1 \times Years\ of\ Schooling +\epsilon \]

we get:

\[\log{(Hourly\ Income)}= \beta_0 + \beta_1 \times Years\ of\ Schooling +\epsilon \]

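In effect, this means changing our belief that there is a linear
relationship between schooling and income (a constant increase in \(x\)
leads to a constant increase in \(y\) across the whole range of \(x\)).
Qualitatively, this means that each additional year of schooling is now
assumed to change hourly income by a constant \emph{percentage} rather
than by a constant dollar amount: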

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{reg\_df[}\StringTok{\textquotesingle{}logwage\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{np.log(reg\_df[}\StringTok{\textquotesingle{}realhrwage\textquotesingle{}}\NormalTok{])}
\NormalTok{sns.jointplot(data}\OperatorTok{=}\NormalTok{reg\_df, x}\OperatorTok{=}\StringTok{\textquotesingle{}sch\textquotesingle{}}\NormalTok{, y}\OperatorTok{=}\StringTok{\textquotesingle{}logwage\textquotesingle{}}\NormalTok{, kind}\OperatorTok{=}\StringTok{"reg"}\NormalTok{,  scatter\_kws}\OperatorTok{=}\BuiltInTok{dict}\NormalTok{(alpha}\OperatorTok{=}\FloatTok{0.1}\NormalTok{), line\_kws}\OperatorTok{=}\BuiltInTok{dict}\NormalTok{(color}\OperatorTok{=}\StringTok{\textquotesingle{}red\textquotesingle{}}\NormalTok{), height}\OperatorTok{=}\DecValTok{10}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W07. Linear Regression_files/figure-pdf/cell-19-output-1.png}

}

\end{figure}

A few things are noticeably different in this plot. First, the histogram
of \texttt{logwage} on the far right is a lot less skewed than the
histogram of \texttt{realhrwage}. Consequently, the regression line
seems to fit the data slightly better across the whole range of the
data.

We can generate the same residual histogram and Q-Q plot as before, but
using a model in which \texttt{logwage} is the dependent variable:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{log\_model }\OperatorTok{=}\NormalTok{ ols(}\StringTok{\textquotesingle{}logwage \textasciitilde{}  sch\textquotesingle{}}\NormalTok{, data}\OperatorTok{=}\NormalTok{reg\_df).fit()  }\CommentTok{\# fit a model}
\NormalTok{log\_model\_residuals }\OperatorTok{=}\NormalTok{ log\_model.resid }\CommentTok{\# get the residuals}

\CommentTok{\# make the figure wider}
\NormalTok{plt.rcParams[}\StringTok{"figure.figsize"}\NormalTok{] }\OperatorTok{=}\NormalTok{ [}\DecValTok{20}\NormalTok{, }\DecValTok{10}\NormalTok{]}

\NormalTok{f, axes }\OperatorTok{=}\NormalTok{ plt.subplots(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{)}
\NormalTok{sns.histplot(log\_model\_residuals, kde}\OperatorTok{=}\VariableTok{True}\NormalTok{, ax}\OperatorTok{=}\NormalTok{axes[}\DecValTok{0}\NormalTok{]) }\CommentTok{\# plot the residuals}
\NormalTok{axes[}\DecValTok{0}\NormalTok{].set\_title(}\StringTok{\textquotesingle{}Histogram of Residuals\textquotesingle{}}\NormalTok{) }\CommentTok{\# add a title}

\NormalTok{sm.qqplot(log\_model\_residuals, line}\OperatorTok{=}\StringTok{\textquotesingle{}45\textquotesingle{}}\NormalTok{, fit}\OperatorTok{=}\VariableTok{True}\NormalTok{,  ax}\OperatorTok{=}\NormalTok{axes[}\DecValTok{1}\NormalTok{]) }\CommentTok{\# plot the residuals}
\NormalTok{axes[}\DecValTok{1}\NormalTok{].set\_title(}\StringTok{\textquotesingle{}Q{-}Q Plot\textquotesingle{}}\NormalTok{) }\CommentTok{\# add a title}

\NormalTok{plt.show() }\CommentTok{\# show the plot}
\end{Highlighting}
\end{Shaded}

\begin{figure}[H]

{\centering \includegraphics{notebooks/W07. Linear Regression_files/figure-pdf/cell-20-output-1.png}

}

\end{figure}

It's not perfect, but it's a lot better than the unlogged version; a
large proportion of the residuals fall on the red line in the Q-Q plot,
though they diverge at the tips. The histogram of residuals also seems
to be less skewed, and more evenly distributed around 0.

\hypertarget{coefficient-interpretation.}{%
\section{Coefficient
interpretation.}\label{coefficient-interpretation.}}

Only the dependent/response variable is log-transformed. Exponentiate
the coefficient, subtract one from this number, and multiply by 100.
This gives the percent increase (or decrease) in the response for every
one-unit increase in the independent variable. Here's a
\href{https://data.library.virginia.edu/interpreting-log-transformations-in-a-linear-model/\#:~:text=Interpret\%20the\%20coefficient\%20as\%20the,variable\%20increases\%20by\%20about\%200.20\%25.}{full
guide} to interpreting the coefficients on log-transformed variables.

First, let's compare the unlogged and logged models:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{table}\OperatorTok{=}\NormalTok{summary\_col( }\CommentTok{\# create a regression table }
\NormalTok{    [model,log\_model], }\CommentTok{\# pass the models to the summary\_col function}
\NormalTok{    stars}\OperatorTok{=}\VariableTok{True}\NormalTok{, }\CommentTok{\# add stars denoting the p{-}values of the coefficient to the table; * p\textless{}0.1, ** p\textless{}0.05, *** p\textless{}0.01}
\NormalTok{    float\_format}\OperatorTok{=}\StringTok{\textquotesingle{}}\SpecialCharTok{\%0.3f}\StringTok{\textquotesingle{}}\NormalTok{, }\CommentTok{\# set the decimal places to 3}
\NormalTok{    model\_names}\OperatorTok{=}\NormalTok{[}\StringTok{\textquotesingle{}Unlogged\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}Logged\textquotesingle{}}\NormalTok{], }\CommentTok{\# set the name of the model}
\NormalTok{    info\_dict }\OperatorTok{=}\NormalTok{ \{}\StringTok{"N"}\NormalTok{:}\KeywordTok{lambda}\NormalTok{ x: }\StringTok{"}\SpecialCharTok{\{0:d\}}\StringTok{"}\NormalTok{.}\BuiltInTok{format}\NormalTok{(}\BuiltInTok{int}\NormalTok{(x.nobs))\}) }\CommentTok{\# add the number of observations to the table}

\BuiltInTok{print}\NormalTok{(table)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}

=================================
                Unlogged  Logged 
---------------------------------
Intercept      -6.625*** 1.573***
               (0.266)   (0.012) 
sch            2.033***  0.096***
               (0.019)   (0.001) 
R-squared      0.181     0.191   
R-squared Adj. 0.181     0.191   
N              52750     52750   
=================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
\end{verbatim}

Interestingly, we can see that \(R^2\) also rose by one percentage point
just from logging the dependent variable. While the coefficient for
schooling can be interpreted normally for the unlogged model (every
additional year of schooling leads to a \$2.03 increase in hourly
wages), this is not the case for the logged model. We can interpret the
coefficient in the logged model as follows:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{b1}\OperatorTok{=}\NormalTok{log\_model.params.sch }\CommentTok{\# get the coefficient for sch}
\NormalTok{exp\_b1}\OperatorTok{=}\NormalTok{np.exp(b1) }\CommentTok{\# exponentiate the coefficient}

\NormalTok{pct\_change}\OperatorTok{=}\NormalTok{(exp\_b1}\OperatorTok{{-}}\DecValTok{1}\NormalTok{)}\OperatorTok{*}\DecValTok{100} \CommentTok{\# multiply by 100 to get the percentage change}
\BuiltInTok{print}\NormalTok{(}\StringTok{\textquotesingle{}For every additional year of schooling, log wages increase by }\SpecialCharTok{\{\}}\StringTok{\%\textquotesingle{}}\NormalTok{.}\BuiltInTok{format}\NormalTok{(}\BuiltInTok{round}\NormalTok{(pct\_change,}\DecValTok{2}\NormalTok{)))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
For every additional year of schooling, log wages increase by 10.04%
\end{verbatim}

\bookmarksetup{startatroot}

\hypertarget{assessed-question-6}{%
\chapter{Assessed Question}\label{assessed-question-6}}

Filter the dataframe to only contain people who work in construction,
extraction and installation. Compared to those who are covered by a
union but not themselves members, what is the difference in log hourly
earnings for union members? Is this difference statistically
significant?

\bookmarksetup{startatroot}

\hypertarget{difference-in-differences}{%
\chapter{Difference in Differences}\label{difference-in-differences}}

\hypertarget{workshop-08-open-in-colab}{%
\section[\emph{Workshop 08} ]{\texorpdfstring{\emph{Workshop 08}
\href{https://colab.research.google.com/github/oballinger/QM2/blob/main/notebooks/W08.\%20Diff-in-Diff.ipynb}{\protect\includegraphics{index_files/mediabag/colab-badge.png}}}{Workshop 08 Open In Colab}}\label{workshop-08-open-in-colab}}

\hypertarget{aims-6}{%
\subsection{Aims:}\label{aims-6}}

This workshop builds on last week's material, replicating analysis in
published academic research on the relationship between minimum wages
and unemployment.

As always, we'll start by importing the libraries we need:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#!pip install linearmodels}
\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd}
\ImportTok{import}\NormalTok{ seaborn }\ImportTok{as}\NormalTok{ sns}
\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
\ImportTok{import}\NormalTok{ plotly}
\ImportTok{import}\NormalTok{ plotly.express }\ImportTok{as}\NormalTok{ px}
\ImportTok{import}\NormalTok{ warnings}
\ImportTok{from}\NormalTok{ statsmodels.formula.api }\ImportTok{import}\NormalTok{ ols}
\ImportTok{from}\NormalTok{ statsmodels.iolib.summary2 }\ImportTok{import}\NormalTok{ summary\_col}
\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}

\NormalTok{warnings.filterwarnings(}\StringTok{\textquotesingle{}ignore\textquotesingle{}}\NormalTok{)}
\NormalTok{sns.}\BuiltInTok{set}\NormalTok{(font\_scale}\OperatorTok{=}\FloatTok{1.5}\NormalTok{)}
\NormalTok{sns.set\_style(}\StringTok{"white"}\NormalTok{)}
\NormalTok{plt.rcParams[}\StringTok{\textquotesingle{}figure.figsize\textquotesingle{}}\NormalTok{] }\OperatorTok{=}\NormalTok{ (}\DecValTok{12}\NormalTok{, }\DecValTok{8}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{longtable}[]{@{}
  >{\raggedright\arraybackslash}p{(\columnwidth - 0\tabcolsep) * \real{0.3194}}@{}}
\toprule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
\#\# Panel Regression \\
\href{https://www.pewresearch.org/politics/2012/04/17/with-voters-focused-on-economy-obama-lead-narrows/}{Surveys}
indicate that ``jobs'' are consistently one of the most important issues
among voters in U.S. presidential elections, and that Republicans are
\href{https://thehill.com/homenews/campaign/3700047-republicans-hold-14-point-advantage-on-which-party-would-do-better-job-on-economy-poll/}{typically
perceived} as better in handling the economy than Democrats. An
\href{https://www.nbcnews.com/news/us-news/where-did-trump-make-election-gains-unemployment-data-tells-surprising-n1247935}{article}
in NBC claims that ``analysis of unemployment and voting data found that
the president's share of the vote held steady or increased in each of
the 20 counties with the highest rise in unemployment from September
2019 to September 2020. And his vote share improved by 1 percentage
point or more in 70 of the 100 hardest-hit counties.'' Let's look into
this. \\
\#\#\# Data Collection \\
There are only 50 states in the U.S. but there are over 3000 counties--
this allows us to increase our sample size and perform a more
fine-grained analysis. This is particularly important if we're
interested in investigating the relationship between unemployment and
voting behaviour, because of the urban-rural divide. For example,
within the state of New York there are probably vast differences in social
and economic factors relevant to voting behaviour between Manhattan and
very rural areas; this variation is lost when we look at aggregate
state-level results, but visible when we look at the county level. As
such, in addition to the datasets we've just imported, we're going to be
downloading county-level unemployment data straight from the BLS using
the loop below. \\
::: \{.cell\} \\
Part of the cleaning process in the cell above involves the creation of
a column called ``county\_fips''-- this stands for
\href{https://transition.fcc.gov/oet/info/maps/census/fips/fips.txt\#:~:text=FIPS\%20codes\%20are\%20numbers\%20which,to\%20which\%20the\%20county\%20belongs.}{Federal
Information Processing Standards}. This is a code that uniquely identifies
states and counties in the U.S. A two-digit FIPS code identifies states
(e.g.~01: Alabama, 02: Alaska, etc.) and a five-digit FIPS code
identifies counties (e.g.~01001: Autauga County, Alabama; 02068: Denali
Borough, Alaska). Notice that the first two digits of the five-digit county
FIPS code indicate the state. Boring, yes, but these codes are
essential in allowing us to join county- and state-level datasets from
different sources quickly and easily. Imagine what a nightmare it would
be to try to join them using the names of the counties, having to deal
with capitalizations, punctuation, etc. Yikes. \\
\#\#\# Maps \\
Great-- we've now got clean, county-level unemployment and population
data spanning from 1990--2022 on an annual basis. Let's make a map to
explore the spatial distribution of unemployment across time in the U.S.
In order to do that, we're going to need a spatial file that tells us
the shapes of the counties; I've imported it as a variable called
\texttt{county\_polygons}. We're then going to create a map using the
\href{https://plotly.com/python/}{Plotly} library, which is great for
making pretty, interactive maps and plots. It will have a slider on the
bottom that lets us view unemployment in different years. It's doing
quite a bit under the hood so it will take some time to plot. Be
patient. \\
::: \{.cell\} ``` \{.python .cell-code\} import json !mkdir data !mkdir
data/wk10/ !curl
https://storage.googleapis.com/qm2/wk10/geojson-counties-fips.json -o
data/wk10/geojson-counties-fips.json \\
county\_polygons =
json.load(open(``data/wk10/geojson-counties-fips.json'')) ``` ::: \\
::: \{.cell\} ``` \{.python .cell-code\}
plot\_sample=counties{[}counties{[}`year'{]}\textgreater2007{]} \#
subset the data to only include years after 2007 -- it would take too
long to plot all of the data \\
px.choropleth( \# plot a choropleth map using the plotly express (px)
library plot\_sample, \# load the dataframe locations=`county\_fips', \#
set the location column to the state code geojson=county\_polygons, \#
set the location mode to USA states (you could add your own custom
geojson/spatial file here) scope=`usa', \# set the scope to the USA, so
that it only plots the states color=``unemployment'', \# set the color
of the states to correspond to the unemployment rate
animation\_frame=plot\_sample{[}``year''{]}.astype(str), \# set the
animation frame to the date, creating a slider at the bottom of the map
color\_continuous\_scale=px.colors.sequential.Viridis, \# set the color
scale to Viridis, a commonly used color scale range\_color={[}0, 10{]},
\# set the range of the color scale to 0-10 height=1000) \# set the
height of the map to 1000 pixels ``` ::: \\
This map is interactive-- meaning you can zoom in, pan around, and hover
over it to get further information on the unemployment level in each
county. You can also use the slider at the bottom to toggle between
different years; if you move the slider from 2008 to 2009, you'll see
lots of yellow suddenly appearing. A similar thing happens between 2019
and 2020. What's going on? Play around with this map for a second, and
make note of spatial and temporal trends in unemployment. \\
Now we're going to do the same thing for the elections data, which I've
taken the liberty of cleaning. Let's load it up as a dataframe called
\texttt{elections}, and make another map in which we plot vote shares in
various elections such that red shows republican support, and blue shows
democratic support. \\
::: \{.cell\} \\
Explore the map above. What do you notice about republican vote share,
particularly as it relates to the previous map of unemployment? \\
Now we've got two datasets-- one on unemployment and another on election
results. We want to merge them but CAREFUL: each row corresponds to the
value of a variable \(x\) in county \(i\) and time \(t\) (so,
\(x_{it}\)); for example, the value in the first row of our dataset
under the unemployment column would be \(unemployment_{01001, 2000}\);
i.e., the unemployment rate in Autauga County, Alabama (FIPS code 01001),
in the year 2000. When our data has this structure (\(x_{it}\)), we call
it \textbf{panel data}. It must be handled differently from
\textbf{cross sectional data} (\(x_i\)), from merging to estimation. \\
We can't just merge on \(i\) or \(t\), we need to merge on both. We can
do so as follows: \\
::: \{.cell\} \\
\#\#\# Exercise \\
OK. Our data is clean and ready for analysis. Because we're going to be
investigating the relationship between unemployment rates and republican
voteshare via a regression model, we're going to need to follow the four
steps of regression modeling from
\href{https://oballinger.github.io/QM2/notebooks/W09.\%20Linear\%20Regression.html}{last
week}. \\
First, formulate a research question (complete with null and alternative
hypothesis), and then follow these steps for our dataset, \texttt{df\_c}
(bonus points if you account for the influence of population). \\
1. Summary Statistics * Table of Summary Statistics 2. Visualisation *
Exploratory Plots 3. Assumptions * A. Independence * B.
Heteroscedasticity: Regression plots + Q-Q plot * C. Multicollinearity:
VIF + Correlation Matrix 4. Regression * Regression Table \\
For the moment, when you run the regression, ignore the fact that we
have panel data and just run a regular regression of the form
\(Y = \beta_0 + \beta_1 X + \epsilon\) \\
\#\#\# Accounting for Space and Time \\
If you've done things correctly, you'll notice two things. First, there
appears to be a generally negative relationship between unemployment and
republican voteshare; in other words, places with higher unemployment
tend to vote \emph{against} republicans. Second, we've egregiously
violated the independence assumption. We have repeat observations of the
same individuals (counties) over time. As such, this result may be
biased unless we account for space and time. \\
As we saw in the lecture, panel data actually contains \emph{two}
sources of variation: differences \emph{between} individuals (in this
case, counties), and \emph{within} individuals. So, a simple research
question such as ``Does unemployment increase republican voteshare'' is
actually two different questions: \\
1. Does a higher level of unemployment lead to higher republican vote
shares \textbf{between counties}? 2. Does an \emph{increase} in the
unemployment rate over time lead to an \emph{increase} in republican
vote shares \textbf{within counties}? \\
Neither is more important than the other, but we must be careful not to
conflate them as they are very different questions. A straightforward way
of answering the first question would be to get rid of the time
dimension in our data by running a separate regression for each year: \\
::: \{.cell\} ``` \{.python .cell-code\} models={[}{]} \# create empty
list to store the models names={[}{]} \# create empty list to store the
names of the models years=df\_c{[}`year'{]}.unique() \\
for year in years: \# loop through years from 2000 to 2020 in increments
of 4 election=df\_c{[}df\_c{[}`year'{]}==year{]} \# subset the data to
only include the year of interest model= ols(`r\_votes \textasciitilde{}
unemployment + population', data=election).fit() \# run a regression of
the republican vote share on the unemployment rate models.append(model)
\# append the model to the list of models names.append(str(year)) \#
append the name of the model to the list of names \\
table=summary\_col( \# create a regression table models, \# pass the
models to the summary\_col function stars=True, \# add stars denoting
the p-values of the coefficient to the table; * p\textless0.05, **
p\textless0.01, *** p\textless0.001 float\_format=`\%0.3f', \# set the
decimal places to 3 model\_names=names, \# set the names of the model
info\_dict = \{``N'':lambda x: ``\{0:d\}''.format(int(x.nobs))\}) \# add
the number of observations to the table \\
print(table) \# print the table ``` ::: \\
This table is pretty informative. Using what we learned from last week,
we can say that for the 2020 election, \\
* A 1\% increase in the unemployment rate was associated with a 2.3\%
\emph{decrease} in republican voteshare. * A 1000-person increase in
population was associated with 0.029\% decrease in republican voteshare.
* both of these results are statistically significant at the 0.01 level.
* 23\% of the variation in republican voteshare can be explained by
unemployment and population. \\
Crucially, ``increase'' in this context pertains to \emph{differences
between counties}! \\
We can also compare these results across different elections. The
coefficient for the unemployment variable in 2020 is over twice the size
of the same coefficient in 2016! So it looks like actually unemployment
and republican voteshare are \emph{negatively} related, contrary to
popular belief. \\
But is this the whole story? \\
Below, I've isolated West Virginia, one of the states with the highest
unemployment rates in America. Instead of drawing a new regression line
every year, I've drawn a new regression line for each county over the
six elections. \\
::: \{.cell\} \\
Within a given county, an increase in the unemployment rate is
associated with an \textbf{increase} in republican voteshare! This is
where the second question comes in (variation within counties). \\
We got away with doing a series of cross-sectional analyses (a new
regression for each election) because we have over 3000 counties, so
\(n>3000\) for each of those regressions (though even so, we're still
splitting our data up and it would be better to leverage the full
dataset of \textgreater18000 observations in one regression). It also
provides relatively useful information about the importance of
unemployment across the country for each election. We can't really apply
the same thinking to this situation, since we only have six time
periods. If we ran a separate regression for each county, we would only
have six observations per regression-- nowhere near enough to satisfy
the central limit theorem (at least n\textgreater30). The insights would
also be of limited utility; we would get over 3000 unique estimates for
the relationship between county-level unemployment and election results.
Imagine trying to fit \emph{that} into one table. \\
Luckily, there's a way of modeling this relationship that allows us to
account for differences in between counties, while also capturing the
variation within counties. This is called a \textbf{Fixed Effect
regression} \\
\textgreater{} \textbf{Fixed Effects Models}: In experimental research,
unmeasured differences between subjects are often controlled for via
random assignment to treatment and control groups. Hence, even if a
variable like Socio-Economic Status is not explicitly measured, because
of random assignment, we can be reasonably confident that the effects of
SES are approximately equal for all groups. Of course, random assignment
is usually not possible with most survey research. If we want to control
for the effect of a variable, we must explicitly measure it. If we don't
measure it, we can't control for it. In practice, there will almost
certainly be some variables we have failed to measure (or have measured
poorly), so our models will likely suffer from some degree of omitted
variable bias. \textgreater When we have panel data (the same
people/states/counties, etc.\ measured at two or more points in time)
another alternative presents itself: we can use the subjects as their
own controls. With panel data we can control for stable characteristics
(i.e.~characteristics that do not change across time) whether they are
measured or not. These include such things as sex, race, and ethnicity
for individuals, or urban/rural, topography, economic structure for
geographic areas. The idea is that, whatever effect these variables have
at one point in time, they will have the same effect at a different
point in time because the values of such variables do not change. \\
A fixed effect regression takes the following form: \\
\(Y_{it}=\alpha_i+\beta X_{it}+\epsilon_{it}\) \\
Where: * \(X_{it}\) are the independent variables (e.g.~population and
unemployment) whose values vary over time. * \(\beta\) is the slope
coefficient for variable \(x\) (e.g.~unemployment). The model assumes
that these effects are time-invariant, e.g.~the effect of \(x\) is the
same at time 1 as it is at time 4 (although the value of \(x\) can be
different at different time periods). * \(\alpha_i\) and
\(\epsilon_{it}\) are both error terms. \(\epsilon_{it}\) is different
for each individual at each point in time. \(\alpha_i\) only varies
across individuals but not across time. We can think of \(\alpha_i\) as
representing the effects of all the time invariant/stable variables that
have NOT been included in the model. So, given that we have 6 time
periods for each county then the six records for county 1 would all have
the same value for \(\alpha_1\), the six records for county 2 would all
have the same value for \(\alpha_2\), etc. But, \(\epsilon_{it}\) is
free to be different for every case at every time period. \\
A fixed effect regression allows us to account for \(\alpha_i\) through
a technique called \textbf{demeaning} \\
\textgreater{}\textbf{Demeaning}: After demeaning, all variables for all
cases have a mean of 0. That means that all the between-subject
variability has been eliminated. All that is left is the within-subject
variability. So, with a fixed effects model, we are analyzing what
causes individual's values to change across time. Variables whose values
do not change (like race or gender) cannot cause changes across time
(unless their effects change across time as well). However, whatever
effect they have at one time is the same effect that they have at other
times, so the effects of such stable characteristics are controlled. \\
In essence, you can picture this as allowing you to draw a separate
regression line through each set of observations from the same group in
your data (in this case, one county over time); however, while the
\emph{intercept} of these lines can vary (their absolute position), they
will all have the same \emph{slope} and will therefore be parallel. This
is important, as we want to find one slope-- one common effect of x--
that fits \emph{all} groups. \\
Run the command below to install the library. \\
::: \{.cell\} \\
::: \{.cell\} ``` \{.python .cell-code\} from linearmodels import
PanelOLS from linearmodels import RandomEffects import
statsmodels.formula.api as smf from linearmodels.panel import compare \\
df\_c=df\_c.set\_index({[}`county\_fips',`year'{]}) \# set the index to
the county fips code and the year panel =
PanelOLS.from\_formula(`r\_votes \textasciitilde{} 1 + population +
unemployment + EntityEffects',df\_c).fit() \# run a fixed effects model
print(compare(\{`Fixed Effects': panel,\}, stars=True)) \# print the
model formatted as a regression table ``` ::: \\
When accounting for time-invariant differences between counties, the
effect of population remains negative. This suggests that counties in
which the population is \emph{decreasing} tend to experience an increase
in republican voteshare. More specifically, for every 1000 people that
leave a county, republican voteshare increases by 0.06\%. \\
The really interesting part of this regression table, however, is the
coefficient on the unemployment variable, which is now positive. This
suggests that-- once we account for the differences between counties--
an increase in the unemployment rate \emph{within} a county is
\emph{positively} associated with republican voteshare. Indeed, a 1\%
increase in the unemployment rate leads to a 0.28\% increase in
republican voteshare. \\
This regression output even gives us three separate \(R^2\) values-- one
for between-variation, another for within, and one overall. \\
\end{longtable}

\hypertarget{difference-in-differences-1}{%
\section{2. Difference in
Differences}\label{difference-in-differences-1}}

One of the reasons that we observed a significant relationship between
unemployment and voting behaviour in last week's workshop is that the
Republican and Democratic parties have opposing views on what to do
about unemployment. Democratic lawmakers have historically been in
favour of increasing the minimum wage to benefit low-income workers,
while Republicans have generally opposed this on the basis that it would
hurt these very workers by increasing unemployment. Indeed, classical
economic theory holds that an increase in wages would lead to a
reduction in employment: a business that makes \$100k in revenue per
year and spends all of it on employing 20 people can't suddenly start
paying their workers double their salaries-- unless it fires half of its
workers. This is obviously a simplified model though-- minimum wage laws
typically don't double wages, and businesses don't operate at-cost, they
turn a profit which they could use to pay their workers more. In the
rest of this workshop, we're going to be investigating this question
empirically:

\hypertarget{do-minimum-wage-laws-increase-unemployment}{%
\subsection{Do minimum wage laws increase
unemployment?}\label{do-minimum-wage-laws-increase-unemployment}}

Note that this is a \emph{causal} question; I'm not asking if they're
correlated-- I'm asking if one causes the other. The burden of proof
here is much higher than observing correlations, and we have to think
seriously about \textbf{endogeneity}. In particular, we need to account
for the influence of omitted variables (e.g.~a recession, or the
economic composition of a state), the potential for reverse causality
(states implementing minimum wage laws in response to unemployment
crises), and selection bias.

In a lab, you can conduct causal inference by running an experiment. You
can randomly select individuals, split them into a control group and a
treatment group, measure their values in an outcome variable prior to a
treatment, administer a treatment, and measure their respective values
after the treatment. If you observe a change in the outcome variable in
the treatment group after having administered the treatment, you can
interpret that as the causal effect of treatment. This is because we're
able to make a plausible argument that the \textbf{control group can act
as a counterfactual (a stand-in) for the treatment group in the absence
of treatment}. Both groups had the same values before the treatment,
then the only thing that changed between them was the treatment, so if
we observe a change in the outcome variable, it must be due to
treatment.

In the real world, we rarely get to run experiments of this kind.
Instead, we have to hunt for \textbf{natural experiments}: situations in
which there is a \textbf{treatment} which we're interested in measuring
the effect of, and two groups that can plausibly act as a treatment and
control group.

\begin{quote}
\textbf{\href{https://www.publichealth.columbia.edu/research/population-health-methods/difference-difference-estimation\#:~:text=DID\%20relies\%20on\%20a\%20less,individual\%20level\%20is\%20not\%20possible.}{Difference
in Difference}} is a quasi-experimental design that makes use of
longitudinal data from treatment and control groups to obtain an
appropriate counterfactual to estimate a causal effect. DID is typically
used to estimate the effect of a specific intervention or treatment
(such as a passage of law, enactment of policy, or large-scale program
implementation) by comparing the changes in outcomes over time between a
population that is enrolled in a program (the intervention group) and a
population that is not (the control group).
\end{quote}

The Difference in Difference model can be estimated as a simple
regression model of the following form:

\[Y_{it} = \beta_0 + \beta_1 Treatment_i + \beta_2 Post_t + \beta_3 (Treatment_i \times Post_t) + \varepsilon_{it}\]

\begin{itemize}
\tightlist
\item
  \(Treatment_i\) is 0 for the control group and 1 for the treatment
  group
\item
  \(Post_t\) is 0 for before and 1 for after
\end{itemize}

We can insert the values of \(Treatment\) and \(Post\) using the table
below and see that the coefficient (\(\beta_3\)) of the interaction of
\(Treatment\) and \(Post\) is the Difference in Differences (DID)
estimator:

\href{https://davidcard.berkeley.edu/papers/njmin-aer.pdf}{Card and
Krueger (1994)} found one such natural experiment, allowing them to
estimate the causal effect of an increase in the state minimum wage on
unemployment using a DiD model; In 1992, New Jersey raised the state
minimum wage from \$4.25 to \$5.05 while the minimum wage in
neighbouring Pennsylvania stayed the same at \$4.25.

\begin{itemize}
\tightlist
\item
  Treatment Group: New Jersey
\item
  Control Group: Pennsylvania
\item
  Pre-Treatment Period: before 1992
\item
  Post-Treatment Period: after 1992
\end{itemize}

They conducted a survey of 384 fast-food restaurants across both states,
right before and right after the law came into effect in New Jersey,
asking them how many people they employed. They ran a
Difference-in-Differences model, and found that the coefficient
\(\beta_3\) was positive but not statistically significant. In other
words, the average total employees per restaurant \emph{increased} after
the minimum wage increased, but this could have been due to random
chance.

That was a long time ago. Things have changed since then, including the
fact that we have access to a lot more data and computational power.
Let's see if we can replicate Card and Krueger's results with more
recent data. I've downloaded data on unemployment, minimum wage levels,
and Gross Domestic Product at the state level going back to 1976. Let's
have a look at minimum wages in New Jersey and Pennsylvania over time:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df\_s}\OperatorTok{=}\NormalTok{pd.read\_csv(}\StringTok{\textquotesingle{}https://storage.googleapis.com/qm2/wk10/state\_data.csv\textquotesingle{}}\NormalTok{, parse\_dates}\OperatorTok{=}\NormalTok{[}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{]) }\CommentTok{\# read in the state{-}level data}
\NormalTok{did}\OperatorTok{=}\NormalTok{df\_s[df\_s[}\StringTok{\textquotesingle{}state\textquotesingle{}}\NormalTok{].isin([}\StringTok{\textquotesingle{}pennsylvania\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}new jersey\textquotesingle{}}\NormalTok{])] }\CommentTok{\# subset the data to only include pennsylvania and new jersey}

\NormalTok{px.line(did, x}\OperatorTok{=}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{, y}\OperatorTok{=}\StringTok{\textquotesingle{}minwage\textquotesingle{}}\NormalTok{, color}\OperatorTok{=}\StringTok{\textquotesingle{}state\textquotesingle{}}\NormalTok{, title}\OperatorTok{=}\StringTok{"Minimum Wages in New Jersey and Pennsylvania"}\NormalTok{) }\CommentTok{\# plot the minimum wage over time}
\end{Highlighting}
\end{Shaded}

The plot above sort of looks like a set of descending staircases; this
is for two reasons. The plateaus exist because each row in the dataframe
\texttt{df\_s} is the value of a state in a given \emph{month}, but we
only have minimum wage data for every \emph{year}. So we get 12
consecutive values of minimum wage every year. The reason that the
staircases are descending is because these minimum wages are adjusted
for inflation. No matter where you're from, you've probably heard a
grandparent say something along the lines of ``My parents would send me
to the shops with 25 cents to buy groceries for the week'', but now it
costs £9 for a bag of chips. That's inflation-- every year things tend
to get slightly more expensive, so the same \emph{absolute} minimum
wage actually diminishes in ``real'' terms, which is what the variable
\texttt{minwage} measures. Incidentally, this is one of the main reasons
University staff have been on
\href{https://www.ucu.org.uk/article/11830/University-staff-pay-cut-by-20-new-figures-show}{strike}.
Anyway. Back to minimum wages.

This plot shows that for the past fifty years, New Jersey and
Pennsylvania have had largely similar minimum wage policies. There have
been a couple moments of divergence, including in the 1990s when the
Card and Krueger study was conducted. However, the biggest divergence
actually started taking place in 2014 when New Jersey seems to have
begun taking a wildly different approach. While Pennsylvania has had the
same minimum wage since 2008 (and therefore seen a decline in
inflation-adjusted wages), New Jersey has raised the minimum wage
significantly twice. In 2020, New Jersey's minimum wage was around 50\%
higher than Pennsylvania's. We can exploit the fact that these two
states have historically had similar minimum wage laws but have recently
experienced a big divergence to see if that change in minimum wages has
resulted in a change in employment levels.

Our Difference-in-Differences setup is as follows:

\[Unemployment_{state, year} = \beta_0 + \beta_1 Treatment_{state} + \beta_2 Post_{year} + \beta_3 (Treatment_{state} \times Post_{year}) + \beta_4 GDP_{state,year} + \varepsilon_{state, year}\]

\begin{itemize}
\tightlist
\item
  New Jersey is the \textbf{treatment group}
\item
  Pennsylvania is the \textbf{control group}
\item
  Years before 2014 are the \textbf{pre-treatment period}
\item
  Years from 2014 onwards are the \textbf{post-treatment period}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{did[}\StringTok{\textquotesingle{}post\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{np.where(did[}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{]}\OperatorTok{\textgreater{}=}\StringTok{\textquotesingle{}2014{-}01{-}01\textquotesingle{}}\NormalTok{,}\DecValTok{1}\NormalTok{,}\DecValTok{0}\NormalTok{) }\CommentTok{\# create a variable that is 1 if the date is after the minimum wage increase and 0 otherwise}
\NormalTok{did[}\StringTok{\textquotesingle{}treatment\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{np.where(did[}\StringTok{\textquotesingle{}state\textquotesingle{}}\NormalTok{]}\OperatorTok{==}\StringTok{\textquotesingle{}new jersey\textquotesingle{}}\NormalTok{,}\DecValTok{1}\NormalTok{,}\DecValTok{0}\NormalTok{) }\CommentTok{\# create a variable that is 1 if the state is new jersey (i.e., the treatment group) and 0 for pennsylvania (the control group)}
\NormalTok{did[}\StringTok{\textquotesingle{}post\_treatment\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{did[}\StringTok{\textquotesingle{}post\textquotesingle{}}\NormalTok{]}\OperatorTok{*}\NormalTok{did[}\StringTok{\textquotesingle{}treatment\textquotesingle{}}\NormalTok{] }\CommentTok{\# create a variable that is 1 if the date is after the minimum wage increase and the state is new jersey and 0 otherwise}
\end{Highlighting}
\end{Shaded}

Before we proceed with the analysis, though, we need to satisfy two
assumptions that will allow us to argue that Pennsylvania can act as a
valid control group for New Jersey:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  No simultaneous treatments:

  \begin{itemize}
  \tightlist
  \item
    If, for example, New Jersey suddenly entered a massive recession in
    2014 as well, we couldn't really argue that resulting effects on
    employment are due solely to the minimum wage law. To account for
    this, we'll be including state-level GDP as an additional
    independent variable in our DiD model.
  \end{itemize}
\item
  Parallel Trends:

  \begin{itemize}
  \tightlist
  \item
    Both states have to have been experiencing similar trends in the
    \textbf{dependent variable} (unemployment) prior to the treatment
    (minimum wage law). If they were trending in opposite directions for
    unobserved reasons, ensuing differences in unemployment may be due
    to those unobserved reasons rather than the treatment.
  \item
    We can check this by plotting the dependent variable for both groups
    over time, and indicating the timing of the treatment.
  \end{itemize}
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{did}\OperatorTok{=}\NormalTok{did[(did[}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{]}\OperatorTok{\textgreater{}=}\StringTok{\textquotesingle{}2008{-}01{-}01\textquotesingle{}}\NormalTok{) }\OperatorTok{\&}\NormalTok{ (did[}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{]}\OperatorTok{\textless{}=}\StringTok{\textquotesingle{}2020{-}01{-}01\textquotesingle{}}\NormalTok{)]}
\NormalTok{sns.lineplot(data}\OperatorTok{=}\NormalTok{did,x}\OperatorTok{=}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{,y}\OperatorTok{=}\StringTok{\textquotesingle{}unemployment\textquotesingle{}}\NormalTok{,hue}\OperatorTok{=}\StringTok{\textquotesingle{}state\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.axvline(pd.to\_datetime(}\StringTok{\textquotesingle{}2014{-}01{-}01\textquotesingle{}}\NormalTok{),color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{,linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}dashed\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}NJ Minimum Wage Increase\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.title(}\StringTok{\textquotesingle{}Unemployment in Pennsylvania and New Jersey\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.legend()}
\end{Highlighting}
\end{Shaded}

This plot shows a big spike in unemployment occurring for both
Pennsylvania and New Jersey as a result of the 2008 financial crisis.
New Jersey had a higher unemployment rate than Pennsylvania, but their
trends are largely parallel and decreasing after 2012. In the years
following the minimum wage law, New Jersey's unemployment rate actually
dips below Pennsylvania's for the first time in years. Let's look at
this in the form of boxplots:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{did[}\StringTok{\textquotesingle{}category\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{did[}\StringTok{\textquotesingle{}treatment\textquotesingle{}}\NormalTok{].astype(}\BuiltInTok{str}\NormalTok{)}\OperatorTok{+}\NormalTok{did[}\StringTok{\textquotesingle{}post\textquotesingle{}}\NormalTok{].astype(}\BuiltInTok{str}\NormalTok{) }\CommentTok{\# this variable is just for the plot below}
\NormalTok{sns.boxplot(x}\OperatorTok{=}\StringTok{\textquotesingle{}category\textquotesingle{}}\NormalTok{, y}\OperatorTok{=}\StringTok{\textquotesingle{}unemployment\textquotesingle{}}\NormalTok{, hue}\OperatorTok{=}\StringTok{\textquotesingle{}treatment\textquotesingle{}}\NormalTok{, data}\OperatorTok{=}\NormalTok{did).set\_xticklabels([}\StringTok{"Pre x Treatment"}\NormalTok{, }\StringTok{"Pre x Control"}\NormalTok{,}\StringTok{\textquotesingle{}Post x Treatment\textquotesingle{}}\NormalTok{,}\StringTok{\textquotesingle{}Post x Control\textquotesingle{}}\NormalTok{]) }
\NormalTok{plt.xlabel(}\StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.title(}\StringTok{\textquotesingle{}Unemployment Rates by Treatment and Post Treatment\textquotesingle{}}\NormalTok{)}
\NormalTok{plt.show()}
\end{Highlighting}
\end{Shaded}

This plot is fascinating in and of itself. The two box plots on the left
show the unemployment values of the two states prior to the minimum wage
law in 2014, while the two on the right show their values after the
minimum wage increases. Pennsylvania (the ``control'' group) is colored
in blue, and New Jersey (the ``treatment'' group) is colored orange.
Prior to the minimum wage increase in 2014, Pennsylvania (blue) has a
lower unemployment rate than New Jersey (orange). In the years following
New Jersey's passage of the minimum wage law, New Jersey actually has a
\emph{lower} unemployment rate than Pennsylvania! This is the only
boxplot where the ``treatment'' (a minimum wage law) is being applied,
and it has the lowest unemployment rate.

Let's see if this difference is statistically significant, and calculate
a treatment effect:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{did\_model }\OperatorTok{=}\NormalTok{ ols(}\StringTok{\textquotesingle{}unemployment \textasciitilde{} gdp + post + treatment + post\_treatment\textquotesingle{}}\NormalTok{, did).fit()}
\BuiltInTok{print}\NormalTok{(did\_model.summary())}
\end{Highlighting}
\end{Shaded}

There are some really interesting results from this model-- let's
interpret the coefficients one by one.

\begin{itemize}
\tightlist
\item
  \texttt{gdp}: GDP is inversely related to unemployment. This makes
  sense: GDP basically measures the total amount of economic activity,
  so more economic activity = more employment.
\item
  \texttt{post}: this coefficient is negative, but statistically
  insignificant at the 0.05 level; it indicates that unemployment
  \emph{generally} decreased for both groups, but that this could be due
  to random chance.
\item
  \texttt{treatment}: again negative but insignificant, meaning that
  there is no significant difference in unemployment levels between NJ
  and PA over the entire period.
\item
  \texttt{post\_treatment}: this is our difference-in-differences
  estimator, and reflects the causal effect of treatment. It is negative
  and statistically significant. If we believe that the assumptions of
  our model are satisfied, we can claim that:

  \begin{itemize}
  \tightlist
  \item
    \textbf{The increase in the minimum wage in New Jersey led to a
    1.95 percentage-point decrease in unemployment relative to Pennsylvania}
  \end{itemize}
\end{itemize}

This is a bold claim. We should do our best to back it up. Notice that
I've sort of arbitrarily chosen a window of dates around the minimum
wage law-- maybe this result is a fluke, due to the timespan I've chosen.

To address this concern, I'll run the same model nine times, starting with
a really small time window-- just one year on either side of the law--
and progressively expand it.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{models}\OperatorTok{=}\NormalTok{[] }\CommentTok{\# create empty list to store the models}
\NormalTok{names}\OperatorTok{=}\NormalTok{[] }\CommentTok{\# create empty list to store the names of the models}

\ControlFlowTok{for}\NormalTok{ window }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(}\DecValTok{1}\NormalTok{,}\DecValTok{10}\NormalTok{): }\CommentTok{\# loop through window sizes from 1 to 9 years on either side of 2014}
\NormalTok{    did}\OperatorTok{=}\NormalTok{df\_s[(df\_s[}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{]}\OperatorTok{\textgreater{}=}\BuiltInTok{str}\NormalTok{(}\DecValTok{2014}\OperatorTok{{-}}\NormalTok{window)}\OperatorTok{+}\StringTok{\textquotesingle{}{-}01{-}01\textquotesingle{}}\NormalTok{) }\OperatorTok{\&}\NormalTok{ (df\_s[}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{]}\OperatorTok{\textless{}=}\BuiltInTok{str}\NormalTok{(}\DecValTok{2014}\OperatorTok{+}\NormalTok{window)}\OperatorTok{+}\StringTok{\textquotesingle{}{-}01{-}01\textquotesingle{}}\NormalTok{) }\OperatorTok{\&}\NormalTok{ df\_s[}\StringTok{\textquotesingle{}state\textquotesingle{}}\NormalTok{].isin([}\StringTok{\textquotesingle{}pennsylvania\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}new jersey\textquotesingle{}}\NormalTok{])] }\CommentTok{\# subset the data within the window of interest around 2014}
\NormalTok{    did[}\StringTok{\textquotesingle{}post\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{np.where(did[}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{]}\OperatorTok{\textgreater{}=}\StringTok{\textquotesingle{}2014{-}01{-}01\textquotesingle{}}\NormalTok{,}\DecValTok{1}\NormalTok{,}\DecValTok{0}\NormalTok{) }\CommentTok{\# create a dummy variable indicating the period after the minimum wage increase}
\NormalTok{    did[}\StringTok{\textquotesingle{}treatment\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{np.where(did[}\StringTok{\textquotesingle{}state\textquotesingle{}}\NormalTok{]}\OperatorTok{==}\StringTok{\textquotesingle{}new jersey\textquotesingle{}}\NormalTok{,}\DecValTok{1}\NormalTok{,}\DecValTok{0}\NormalTok{) }\CommentTok{\# create a dummy variable for treatment}
\NormalTok{    did[}\StringTok{\textquotesingle{}post\_treatment\textquotesingle{}}\NormalTok{]}\OperatorTok{=}\NormalTok{did[}\StringTok{\textquotesingle{}post\textquotesingle{}}\NormalTok{]}\OperatorTok{*}\NormalTok{did[}\StringTok{\textquotesingle{}treatment\textquotesingle{}}\NormalTok{] }\CommentTok{\# create an interaction term between the post and treatment variables}
\NormalTok{    did\_model }\OperatorTok{=}\NormalTok{ ols(}\StringTok{\textquotesingle{}unemployment \textasciitilde{} gdp+ post + treatment + post\_treatment\textquotesingle{}}\NormalTok{, did).fit() }\CommentTok{\# run the difference in difference model}

\NormalTok{    models.append(did\_model) }\CommentTok{\# append the model to the list of models}
\NormalTok{    names.append(}\StringTok{\textquotesingle{}± \textquotesingle{}}\OperatorTok{+}\BuiltInTok{str}\NormalTok{(window)}\OperatorTok{+}\StringTok{\textquotesingle{} Year\textquotesingle{}}\NormalTok{) }\CommentTok{\# append the name of the model to the list of names}

\NormalTok{table}\OperatorTok{=}\NormalTok{summary\_col( }\CommentTok{\# create a regression table }
\NormalTok{    models, }\CommentTok{\# pass the models to the summary\_col function}
\NormalTok{    stars}\OperatorTok{=}\VariableTok{True}\NormalTok{, }\CommentTok{\# add stars denoting the p{-}values of the coefficient to the table; * p\textless{}0.1, ** p\textless{}0.05, *** p\textless{}0.01}
\NormalTok{    float\_format}\OperatorTok{=}\StringTok{\textquotesingle{}}\SpecialCharTok{\%0.3f}\StringTok{\textquotesingle{}}\NormalTok{, }\CommentTok{\# set the decimal places to 3}
\NormalTok{    model\_names}\OperatorTok{=}\NormalTok{names, }\CommentTok{\# set the names of the model}
\NormalTok{    info\_dict }\OperatorTok{=}\NormalTok{ \{}\StringTok{"N"}\NormalTok{:}\KeywordTok{lambda}\NormalTok{ x: }\StringTok{"}\SpecialCharTok{\{0:d\}}\StringTok{"}\NormalTok{.}\BuiltInTok{format}\NormalTok{(}\BuiltInTok{int}\NormalTok{(x.nobs))\}) }\CommentTok{\# add the number of observations to the table}

\BuiltInTok{print}\NormalTok{(table) }\CommentTok{\# print the table}
\end{Highlighting}
\end{Shaded}

The row we're mainly interested in is the \texttt{post\_treatment}
coefficient, the treatment effect. It remains significant and negative
in all time periods smaller than 8 years, after which point it becomes
insignificant.

How do you think this affects our conclusion?

\bookmarksetup{startatroot}

\hypertarget{assessed-question-7}{%
\chapter{Assessed Question}\label{assessed-question-7}}

Now we've got evidence that minimum wage laws may actually
\emph{decrease} unemployment in the case of New Jersey and Pennsylvania.
But we've got quite a bit of data, and minimum wages change frequently.
Let's find another example where we may be able to run a difference in
differences regression to see if this trend holds in a different
context.

Below, I've picked out Arizona and Louisiana; they had nearly the exact
same minimum wage for seven years, but in 2007 Arizona nearly tripled
its minimum wage while Louisiana kept it the same (\ldots by not having
one).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{did2}\OperatorTok{=}\NormalTok{df\_s[(df\_s[}\StringTok{\textquotesingle{}state\textquotesingle{}}\NormalTok{].isin([}\StringTok{\textquotesingle{}arizona\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}louisiana\textquotesingle{}}\NormalTok{]))}\OperatorTok{\&}\NormalTok{(df\_s[}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{]}\OperatorTok{\textgreater{}=}\StringTok{\textquotesingle{}2000\textquotesingle{}}\NormalTok{)}\OperatorTok{\&}\NormalTok{ (df\_s[}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{]}\OperatorTok{\textless{}}\StringTok{\textquotesingle{}2010\textquotesingle{}}\NormalTok{)] }
\NormalTok{px.line(did2, x}\OperatorTok{=}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{, y}\OperatorTok{=}\StringTok{\textquotesingle{}minwage\textquotesingle{}}\NormalTok{, color}\OperatorTok{=}\StringTok{\textquotesingle{}state\textquotesingle{}}\NormalTok{, title}\OperatorTok{=}\StringTok{"Minimum Wages in Arizona and Louisiana"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

Run a difference in differences regression to measure the effect of this
minimum wage increase on unemployment. Define three variables (post,
treatment, post\_treatment), and include just these three variables in
the model.

\begin{itemize}
\tightlist
\item
  Part A: What is the effect of the minimum wage increase on
  unemployment in the case of Arizona and Louisiana?
\item
  Part B: Difference in Differences designs have two assumptions:
  parallel trends, and no simultaneous treatment. Can you think of any
  events that occurred in 2008 that might violate the ``no simultaneous
  treatment'' assumption?
\end{itemize}
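% TODO(review): leftover merge-conflict markers ("=======" and ">>>>>>> 58a0f66f1426d670b4b670029a7c08abf59b4a8c") were removed here; check for a matching "<<<<<<<" marker earlier in the file and resolve it.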


\end{document}